In [154]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [159]:
# Load the raw measurements; the file writes decimals with a comma ("21,5").
data = pd.read_csv('../data/measurements.csv', decimal=",")

In [160]:
# Preview the first five rows of the raw frame.
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


Hemos decidido borrar la columna specials porque sus datos ya están recogidos en las columnas AC, rain y sun. En la misma celda eliminamos también las columnas refill liters y refill gas.

In [161]:
# Remove the columns that are not used in the analysis. `columns=` already
# selects the axis, so no `axis` argument is needed, and re-assigning instead
# of using `inplace=True` keeps the step explicit about what `data` now holds.
data = data.drop(columns=['specials', 'refill gas', 'refill liters'])

In [162]:
# Check the frame after dropping the unused columns.
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0


Mostramos el número de nulos por columna en orden descendente

In [163]:
# Missing values per column, largest count first.
data.isna().sum().sort_values(ascending=False)

temp_inside     12
sun              0
rain             0
AC               0
gas_type         0
temp_outside     0
speed            0
consume          0
distance         0
dtype: int64

Como tenemos 12 nulos en temp_inside, los vamos a rellenar con la temperatura media.

In [164]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

distance        float64
consume         float64
speed             int64
temp_inside     float64
temp_outside      int64
gas_type         object
AC                int64
rain              int64
sun               int64
dtype: object

In [165]:
# Mean inside temperature (pandas skips NaN when averaging); used to impute the gaps.
mean_temp_inside = data.loc[:, 'temp_inside'].mean()

In [166]:
# Impute missing inside temperatures with the column mean.
# The fill is restricted to `temp_inside`: a bare `data.fillna(value)` would
# write the temperature mean into a NaN of ANY column, which is only harmless
# while no other column has missing values.
data = data.fillna({'temp_inside': mean_temp_inside})

In [167]:
# Display the frame after imputation (the notebook shows a truncated view).
data

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0
...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,SP98,0,0,0
384,16.1,4.3,38,25.0,31,SP98,1,0,0
385,16.0,3.8,45,25.0,19,SP98,0,0,0
386,15.4,4.6,42,25.0,31,SP98,1,0,0


In [168]:
# Re-count missing values per column to confirm the imputation left none.
data.isna().sum().sort_values(ascending=False)

sun             0
rain            0
AC              0
gas_type        0
temp_outside    0
temp_inside     0
speed           0
consume         0
distance        0
dtype: int64

In [169]:
# One-hot encode the fuel type into one indicator column per category.
# `dtype=int` pins the indicators to 0/1: newer pandas versions default to
# boolean dummies, which would be written to the cleaned CSV as True/False.
gas_type_dum = pd.get_dummies(data['gas_type'], dtype=int)

In [170]:
# Inspect the indicator columns produced for gas_type.
gas_type_dum

Unnamed: 0,E10,SP98
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
383,0,1
384,0,1
385,0,1
386,0,1


In [171]:
# Append the fuel-type indicator columns to the right of the existing features.
data = pd.concat(
    [data, gas_type_dum],
    axis='columns',
)

In [172]:
# Preview the frame with the new indicator columns attached.
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun,E10,SP98
0,28.0,5.0,26,21.5,12,E10,0,0,0,1,0
1,12.0,4.2,30,21.5,13,E10,0,0,0,1,0
2,11.2,5.5,38,21.5,15,E10,0,0,0,1,0
3,12.9,3.9,36,21.5,14,E10,0,0,0,1,0
4,18.5,4.5,46,21.5,15,E10,0,0,0,1,0


In [173]:
# The original text column is now redundant with its indicator columns.
# Re-assign rather than mutate with `inplace=True`, and name the axis via
# `columns=` so the intent is explicit.
data = data.drop(columns='gas_type')

In [174]:
# Final look at the fully numeric frame before it is saved.
data.head(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,E10,SP98
0,28.0,5.0,26,21.5,12,0,0,0,1,0
1,12.0,4.2,30,21.5,13,0,0,0,1,0
2,11.2,5.5,38,21.5,15,0,0,0,1,0
3,12.9,3.9,36,21.5,14,0,0,0,1,0
4,18.5,4.5,46,21.5,15,0,0,0,1,0


In [175]:
# Persist the cleaned data set (header row included, index column omitted).
data.to_csv('../data/data_clean.csv', index=False)

In [146]:
# Linear-regression estimator, created with its default settings.
l = LinearRegression()

In [147]:
# Feature matrix: every column except the target.
x = data.drop(columns='consume')

In [148]:
# Target vector: the consumption column.
y = data.loc[:, 'consume']

In [149]:
# Fit the estimator on the complete data set.
# NOTE(review): the same estimator is fitted again on the training split a few
# cells below, which presumably replaces this fit — this cell looks redundant;
# confirm nothing depends on it before removing.
l.fit(x,y)

LinearRegression()

In [150]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [151]:
# Fit the estimator on the training split only.
l.fit(X=x_train, y=y_train)

LinearRegression()

In [152]:
# Predict consumption for the held-out rows.
y_pred = l.predict(X=x_test)

In [156]:
# Show the fitted coefficients (presumably in the column order of x) and the intercept.
l.coef_,l.intercept_

(array([ 0.00515061, -0.02411683, -0.15047909, -0.03631087,  0.42096231,
         0.62794039, -0.06115135,  0.04191564, -0.04191564]),
 9.476307887416771)

In [155]:
# Evaluate on the held-out split; printed in order: MSE, MAE, RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
print(mse)
print(mae)
print(np.sqrt(mse))

0.747694688029747
0.6689496242764844
0.864693406954018
