## Importar bibliotecas y archivos

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# float format
pd.options.display.float_format = '{:.4f}'.format

In [3]:
# datasets
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

## Explorar información

In [4]:
test.shape, train.shape

((76161, 16), (306007, 17))

In [5]:
train.columns

Index(['year', 'month', 'day', 'hour', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO',
       'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station'],
      dtype='object')

In [6]:
test.columns

Index(['year', 'month', 'day', 'hour', 'PM10', 'SO2', 'NO2', 'CO', 'O3',
       'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station'],
      dtype='object')

La columna que le falta a test es la de 'PM2.5'

In [7]:
train.head()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin
3,2013,3,1,5,5.0,5.0,18.0,18.0,400.0,66.0,-2.2,1025.6,-19.6,0.0,N,3.7,Aotizhongxin
4,2013,3,1,6,3.0,3.0,18.0,32.0,500.0,50.0,-2.6,1026.5,-19.1,0.0,NNE,2.5,Aotizhongxin


In [8]:
train.describe()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
count,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0,306007.0
mean,2014.7135,6.482,15.7108,11.5765,79.3684,104.5362,15.6463,50.5752,1229.0649,57.3382,13.5307,1010.8046,2.4285,0.0654,1.738
std,1.1597,3.4478,8.797,6.9333,80.0549,91.2367,21.3227,35.0492,1155.3297,56.6731,11.418,10.4483,13.7953,0.8202,1.2416
min,2013.0,1.0,1.0,0.0,2.0,2.0,0.2856,2.0,100.0,0.2142,-19.9,982.4,-36.0,0.0,0.0
25%,2014.0,3.0,8.0,6.0,20.0,36.0,2.0,23.0,500.0,10.4958,3.1,1002.4,-9.0,0.0,0.9
50%,2015.0,6.0,16.0,12.0,55.0,82.0,7.0,43.0,900.0,45.0,14.5,1010.4,3.0,0.0,1.4
75%,2016.0,9.0,23.0,18.0,110.0,145.0,19.0,71.0,1500.0,82.0,23.2,1019.0,15.1,0.0,2.2
max,2017.0,12.0,31.0,23.0,835.0,999.0,500.0,290.0,10000.0,1071.0,41.6,1042.8,29.1,72.5,13.2


## Limpiar información

In [9]:
train.drop(columns=['wd','station'], inplace=True)
test.drop(columns=['wd','station'], inplace=True)

In [10]:
# Check missing values
null_rows_train = train.isna().sum()
print(null_rows_train[null_rows_train>0])
'No hay valores vacíos'

Series([], dtype: int64)


'No hay valores vacíos'

## Elegir modelo

### Tiene que ser un modelo de regresión.
- Lineal regresion
- Logistic Regresion


Voy a elegir regresión lineal

## Definir Variables

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [16]:
climate_model=LinearRegression()

## Preparing the Data

### Separate dataset into two parts

In [26]:
climate_x=train.drop(columns=['PM2.5'])
climate_y=train['PM2.5']

### Create the model

In [27]:
X_train, X_test, y_train, y_test = train_test_split(climate_x,climate_y,test_size=0.2,train_size=0.8,shuffle=True)

In [28]:
climate_model.fit(climate_x,climate_y)
y_pred=climate_model.predict(X_test)

In [33]:
y_pred

array([153.30702895,  51.40843475, 172.41452371, ...,  65.54374403,
       132.21948633,  43.44478929])

## Verificación R2

In [30]:
climate_model.intercept_

-1533.524837667911

In [31]:
climate_model.coef_

array([ 0.57157526, -0.44003013, -0.10581221, -0.03678728,  0.54482261,
        0.130383  ,  0.06515373,  0.02006974,  0.10086945, -1.11509064,
        0.38190388,  1.44531738, -0.56708231, -0.50699219])

In [32]:
climate_model.score(climate_x, climate_y)

0.8556838072337234

## Exportar y_pred

In [38]:
from numpy import savetxt
savetxt('y_pred.csv', y_pred, delimiter=',')