### Importamos las librerias que vamos a utilizar

In [116]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,RidgeCV,SGDRegressor,ElasticNet,Lasso
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [98]:
data = pd.read_csv("data_cleaned.csv")

In [99]:
data.head()

Unnamed: 0.1,Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas,km_absolute,Consume_ac,consume_ac
0,0,28.0,5.0,26,21.5,12.0,0,E10,0,0,0,45.0,E10,28.0,5.0,5.0
1,1,12.0,4.2,30,21.5,13.0,0,E10,0,0,0,0.0,0,40.0,9.2,9.2
2,2,11.2,5.5,38,21.5,15.0,0,E10,0,0,0,0.0,0,51.2,14.7,14.7
3,3,12.9,3.9,36,21.5,14.0,0,E10,0,0,0,0.0,0,64.1,18.6,18.6
4,4,18.5,4.5,46,21.5,15.0,0,E10,0,0,0,0.0,0,82.6,23.1,23.1


In [100]:
data.isnull().sum()

Unnamed: 0       0
distance         0
consume          0
speed            0
temp_inside      0
temp_outside     0
specials         0
gas_type         0
AC               0
rain             0
sun              0
refill liters    0
refill gas       0
km_absolute      0
Consume_ac       0
consume_ac       0
dtype: int64

In [102]:
# Eliminamos las columnas que no vamos a utilizar
data_n = data.drop(columns=["Consume_ac","km_absolute","consume_ac","rain","specials","refill gas"])
data_n

Unnamed: 0.1,Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,sun,refill liters
0,0,28.0,5.0,26,21.5,12.0,E10,0,0,45.0
1,1,12.0,4.2,30,21.5,13.0,E10,0,0,0.0
2,2,11.2,5.5,38,21.5,15.0,E10,0,0,0.0
3,3,12.9,3.9,36,21.5,14.0,E10,0,0,0.0
4,4,18.5,4.5,46,21.5,15.0,E10,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
383,383,16.0,3.7,39,24.5,18.0,SP98,0,0,0.0
384,384,16.1,4.3,38,25.0,31.0,SP98,1,0,0.0
385,385,16.0,3.8,45,25.0,19.0,SP98,0,0,0.0
386,386,15.4,4.6,42,25.0,31.0,SP98,1,0,0.0


In [103]:
# hacemos un replace para que aparezca el tipo de combustible por categoria
data_n["gas_type"].replace({"E10": 0, "SP98": 1}, inplace=True)

In [104]:
# comprobamos que todos los datos son int o floats
data_n.dtypes

Unnamed: 0         int64
distance         float64
consume          float64
speed              int64
temp_inside      float64
temp_outside     float64
gas_type           int64
AC                 int64
sun                int64
refill liters    float64
dtype: object

In [105]:
# la variable a predecir será el consumo
X = data_n.drop(columns = 'consume')
y = data_n['consume']

In [106]:
# hacemos Train , test split
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8)

In [107]:
# normalizamos los datos
norm = MinMaxScaler()

X_train = norm.fit_transform(X_train)
X_test = norm.transform(X_test)

In [108]:
# estandarizamos los datos
stan = StandardScaler()
stan.fit(X_train)

X_train = stan.transform(X_train)
X_test = stan.transform(X_test)

### Regresion Lineal

In [109]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [110]:
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

### Train

In [111]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lr ))
print('Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lr ))
print('R2 Score:', r2_score(y_train, y_pred_train_lr ))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lr )))

Mean Absolute Error: 0.577562128202498
Mean Squared Error: 0.701840554646989
R2 Score: 0.23427008026315943
Root Mean Squared Error: 0.8377592462318688


### Test

In [112]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lr ))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lr ))
print('R2 Score:', r2_score(y_test, y_pred_test_lr))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lr )))

Mean Absolute Error: 0.6983270240455038
Mean Squared Error: 1.4015487695903066
R2 Score: 0.14692322787829581
Root Mean Squared Error: 1.1838702503189724


In [118]:

#Cross validation
scores_lr = cross_val_score(lr, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lr

array([-0.99109741, -0.69630487, -0.86500448])

## Ridge

In [114]:
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)

Ridge()

In [117]:
y_pred_train_ridge = clf.predict(X_train)
y_pred_test_ridge = clf.predict(X_test)

### Train

In [119]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_ridge ))
print('Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_ridge ))
print('R2 Score:', r2_score(y_train, y_pred_train_ridge ))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_ridge )))

Mean Absolute Error: 0.5774641762132209
Mean Squared Error: 0.701842514811268
R2 Score: 0.2342679416628376
Root Mean Squared Error: 0.8377604161162474


### Test

In [120]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_ridge ))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_ridge ))
print('R2 Score:', r2_score(y_test, y_pred_test_ridge))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_ridge)))

Mean Absolute Error: 0.698041197546579
Mean Squared Error: 1.4018381829635538
R2 Score: 0.14674707144934007
Root Mean Squared Error: 1.1839924758897558


## Lasso

In [123]:
lasso = Lasso()
lasso.fit(X_train, y_train)

Lasso()

In [124]:
y_pred_train_lasso = lasso.predict(X_train)
y_pred_test_lasso = lasso.predict(X_test)

### Train

In [125]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lasso ))
print('Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lasso ))
print('R2 Score:', r2_score(y_train, y_pred_train_lasso ))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lasso )))

Mean Absolute Error: 0.6683517169614988
Mean Squared Error: 0.9165640998959418
R2 Score: 0.0
Root Mean Squared Error: 0.9573735425088484


### Test

In [128]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lasso ))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lasso ))
print('R2 Score:', r2_score(y_test, y_pred_test_lasso))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lasso )))

Mean Absolute Error: 0.6890074441687344
Mean Squared Error: 1.6560421862909895
R2 Score: -0.007978568731111135
Root Mean Squared Error: 1.2868730264835726


### GradientBoostingRegressor

In [130]:
gbr_reg = GradientBoostingRegressor()
gbr_reg.fit(X_train, y_train)

GradientBoostingRegressor()

In [131]:
y_pred_train_gbr = gbr_reg.predict(X_train)
y_pred_test_gbr = gbr_reg.predict(X_test)

### Train

In [132]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_gbr ))
print('Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_gbr ))
print('R2 Score:', r2_score(y_train, y_pred_train_gbr ))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_gbr )))

Mean Absolute Error: 0.21575726994424207
Mean Squared Error: 0.0778527757955859
R2 Score: 0.9150601951304611
Root Mean Squared Error: 0.2790211027782413


### Test

In [133]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_gbr ))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_gbr ))
print('R2 Score:', r2_score(y_test, y_pred_test_gbr))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_gbr )))

Mean Absolute Error: 0.5055067007284821
Mean Squared Error: 0.7072003292220553
R2 Score: 0.5695503523059637
Root Mean Squared Error: 0.840952037408826


### Random Forest

In [134]:
rfreg = RandomForestRegressor()
rfreg.fit(X_train, y_train)

RandomForestRegressor()

In [137]:
y_pred_train_rfreg = rfreg.predict(X_train)
y_pred_test_rfreg = rfreg.predict(X_test)

### Train

In [138]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_rfreg ))
print('Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_rfreg ))
print('R2 Score:', r2_score(y_train, y_pred_train_rfreg ))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_rfreg )))

Mean Absolute Error: 0.1633290322580641
Mean Squared Error: 0.05550524516129023
R2 Score: 0.939442047569186
Root Mean Squared Error: 0.23559551175964757


### Test

In [141]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_rfreg ))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_rfreg ))
print('R2 Score:', r2_score(y_test, y_pred_test_rfreg))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_rfreg )))

Mean Absolute Error: 0.4690000000000002
Mean Squared Error: 0.7228672820512827
R2 Score: 0.5600143919180516
Root Mean Squared Error: 0.8502160208154647
