In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn import metrics
import numpy as np
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score as cvs

In [2]:
df=pd.read_csv("measurementsclean.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun
0,0,28.0,5.0,26,21.934911,12,Unknown,E10,0,0,0
1,1,12.0,4.2,30,21.934911,13,Unknown,E10,0,0,0
2,2,11.2,5.5,38,21.934911,15,Unknown,E10,0,0,0
3,3,12.9,3.9,36,21.934911,14,Unknown,E10,0,0,0
4,4,18.5,4.5,46,21.934911,15,Unknown,E10,0,0,0


In [5]:
X = df.drop(["Unnamed: 0","consume"], axis=1) 
X.head()

Unnamed: 0,distance,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun
0,28.0,26,21.934911,12,Unknown,E10,0,0,0
1,12.0,30,21.934911,13,Unknown,E10,0,0,0
2,11.2,38,21.934911,15,Unknown,E10,0,0,0
3,12.9,36,21.934911,14,Unknown,E10,0,0,0
4,18.5,46,21.934911,15,Unknown,E10,0,0,0


In [6]:
y=df.consume

### Convierto las columnas categóricas a numéricas:

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   distance      388 non-null    float64
 1   speed         388 non-null    int64  
 2   temp_inside   388 non-null    float64
 3   temp_outside  388 non-null    int64  
 4   specials      388 non-null    object 
 5   gas_type      388 non-null    object 
 6   AC            388 non-null    int64  
 7   rain          388 non-null    int64  
 8   sun           388 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 27.4+ KB


In [15]:
X.specials.unique()

array(['Unknown', 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

In [16]:
dic_specials ={"AC rain":1,
               "ac rain":1,
                "AC":2,
                "ac":2,
                "rain":1,
                "snow":3,
                "AC snow":3,
                "half rain half sun":4,
                "sun": 5,
                "AC sun": 5,
                "sun ac": 5,
                "AC Sun":5,
                "Unknown":7} 

In [17]:
X.specials = X.specials.map(dic_specials)

In [18]:
X["gas_type"].unique()

array(['E10', 'SP98'], dtype=object)

In [20]:
dicc_gas={"E10":0,"SP98":1}

In [21]:
X["gas_type"] = X["gas_type"].map(dicc_gas)

In [22]:
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.2)

In [23]:
models={
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
}

In [24]:
for name, model in models.items():
    print("ENTRENANDO: ", name)
    model.fit(X_train, y_train)

ENTRENANDO:  ridge
ENTRENANDO:  lasso
ENTRENANDO:  sgd
ENTRENANDO:  knn
ENTRENANDO:  grad


In [25]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('MAE - ', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE - ', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------ridge------
MAE -  0.5915556390285266
MSE -  0.6332090689017884
RMSE -  0.7957443489600089
R2 -  -0.012519409694171424
------lasso------
MAE -  0.5863373677641668
MSE -  0.5938663515550878
RMSE -  0.7706272455312542
R2 -  0.05039072046668458
------sgd------
MAE -  2781777071543.0493
MSE -  2.41440185751784e+25
RMSE -  4913656334663.465
R2 -  -3.860697617263027e+25
------knn------
MAE -  0.46461538461538454
MSE -  0.3931897435897436
RMSE -  0.6270484379932252
R2 -  0.37127835555520505
------grad------
MAE -  0.4349697461504338
MSE -  0.4202707359577462
RMSE -  0.6482829135167347
R2 -  0.32797507429623884


In [27]:
for name, model in models.items():
    scores=cvs(model, X, y, scoring='r2', cv=5)
    print('Modelo: ', name, 'Score: ', np.mean(scores))

Modelo:  ridge Score:  -0.24517033785848322
Modelo:  lasso Score:  -0.28426366190194124
Modelo:  sgd Score:  -1.3318385155302547e+25
Modelo:  knn Score:  0.3493454625360017
Modelo:  grad Score:  0.40623622869965026


Me quedo con el modelo KNeighborsRegressor puesto que tiene el RMSE menor.