# T05: MOTOR TREND CAR ROAD TESTS
Santiago Reyes Castillo  
745826  
20 de febrero, 2025

In [2296]:
# Librerías
from scipy import stats
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

import matplotlib.pyplot as plt

import statsmodels.api as sm

## REGRESIÓN SIN DUMMIES

### MPG

#### REGRESIÓN SIMPLE

In [2300]:
data = pd.read_excel("Motor Trend Car Road Tests.xlsx")

In [2301]:
x = data.copy()
x = x.drop(columns=["model", "mpg"])

n = len(x)

ones = np.ones([n, 1])

X = np.hstack((ones, x))

y = data["mpg"]

In [2302]:
x.head(1)

Unnamed: 0,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,6,160.0,110,3.9,2.62,16.46,0,1,4,4


In [2303]:
#generamos el modelo
ols = sm.OLS(y,X)
ols_results = ols.fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.869
Model:,OLS,Adj. R-squared:,0.807
Method:,Least Squares,F-statistic:,13.93
Date:,"Thu, 20 Feb 2025",Prob (F-statistic):,3.79e-07
Time:,22:56:48,Log-Likelihood:,-69.855
No. Observations:,32,AIC:,161.7
Df Residuals:,21,BIC:,177.8
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.3034,18.718,0.657,0.518,-26.623,51.229
x1,-0.1114,1.045,-0.107,0.916,-2.285,2.062
x2,0.0133,0.018,0.747,0.463,-0.024,0.050
x3,-0.0215,0.022,-0.987,0.335,-0.067,0.024
x4,0.7871,1.635,0.481,0.635,-2.614,4.188
x5,-3.7153,1.894,-1.961,0.063,-7.655,0.224
x6,0.8210,0.731,1.123,0.274,-0.699,2.341
x7,0.3178,2.105,0.151,0.881,-4.059,4.694
x8,2.5202,2.057,1.225,0.234,-1.757,6.797

0,1,2,3
Omnibus:,1.907,Durbin-Watson:,1.861
Prob(Omnibus):,0.385,Jarque-Bera (JB):,1.747
Skew:,0.521,Prob(JB):,0.418
Kurtosis:,2.526,Cond. No.,12200.0


In [2304]:
errores_mpg = {}
errores_mpg['REGRESION SIMPLE'] = 0.869

#### INTERPRETACION R2 Y BETAS
El r2 (0.869) indica un buen ajuste del modelo a los datos, ya que el 86% de la varianza es explicada por las variables de entrada del modelo.   

Las betas nos indican que, entre mayor cantidad de cilíndros, peso, caballos de fuerza y carburadores, se reduce el mpg, mientras que a mayor displacement, direction ratio, qsec, forma del motor, tipo de transmisión, y velocidades, aumenta el mpg. Sin embargo, varias de estas últimas son incieras ya que son variables categóricas que no se modelaron como tal.

#### TRAIN TEST

In [2307]:
x = data.copy()
x = x.drop(columns=["model", "mpg"])
X = x

In [2308]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.4, random_state=2)

In [2309]:
scaler = StandardScaler()
scaler.fit(X_train)

In [2310]:
X_std_train = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [2311]:
model = LinearRegression()

model.fit(X_std_train, y_train)

y_pred_train = model.predict(X_std_train)
y_pred_test = model.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_mpg["R2 REGRESIÓN TRAIN"] = r2_train
errores_mpg["R2 REGRESIÓN TEST"] = r2_test

print(r2_train, r2_test)

0.997079967049818 -5.411243537583586


#### REGRESIÓN CON RIDGE

In [2313]:
ridge = Ridge(alpha=5)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.8843591594843405 0.7723644436945166


In [2314]:
ridge = Ridge(alpha=15)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.8547581660007483 0.7903333477242467


In [2315]:
ridge = Ridge(alpha=25)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_mpg["RIDGE (lambda = 25)"] = r2_test 

print(r2_train, r2_test)

0.8221955359318726 0.7815667794815286


In [2316]:
ridge = Ridge(alpha=1)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.9035616676595186 0.7207296090623141


### QSEC 

#### REGRESIÓN SIMPLE

In [2319]:
x.head()

Unnamed: 0,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [2320]:
x = data.copy()
x = x.drop(columns=["model", "qsec"])

n = len(x)

ones = np.ones([n, 1])

X = np.hstack((ones, x))

y = data["qsec"]

#generamos el modelo
ols = sm.OLS(y,X)
ols_results = ols.fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,qsec,R-squared:,0.875
Model:,OLS,Adj. R-squared:,0.815
Method:,Least Squares,F-statistic:,14.66
Date:,"Thu, 20 Feb 2025",Prob (F-statistic):,2.44e-07
Time:,22:56:48,Log-Likelihood:,-30.242
No. Observations:,32,AIC:,82.48
Df Residuals:,21,BIC:,98.61
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,17.7762,3.876,4.586,0.000,9.716,25.837
x1,0.0690,0.061,1.123,0.274,-0.059,0.197
x2,-0.3627,0.293,-1.239,0.229,-0.971,0.246
x3,-0.0075,0.005,-1.505,0.147,-0.018,0.003
x4,-0.0016,0.006,-0.242,0.811,-0.015,0.012
x5,-0.1311,0.476,-0.275,0.786,-1.121,0.859
x6,1.4963,0.500,2.990,0.007,0.456,2.537
x7,0.9700,0.573,1.694,0.105,-0.221,2.161
x8,-0.9012,0.585,-1.540,0.139,-2.118,0.316

0,1,2,3
Omnibus:,21.069,Durbin-Watson:,2.573
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.291
Skew:,1.47,Prob(JB):,4.84e-09
Kurtosis:,7.481,Cond. No.,8770.0


In [2321]:
errores_qsec = {}
errores_qsec["REGRESION SIMPLE"] = 0.875

#### INTERPRETACIÓN R2 Y BETAS

Con un r2 de 0.875, vemos que este modelo hace un buen trabajo prediciendo la variable qsec.

Las betas nos indican que cyl,disp, hp, drat, am, gear, carb reducen el tiempo para llegar al cuarto de milla, mientras que mpg, la forma del motor, y el peso, la aumentan

#### TRAIN TEST

In [2325]:
x = data.copy()
x = x.drop(columns=["model", "mpg"])
X = x

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.4, random_state=2)

scaler = StandardScaler()
scaler.fit(X_train)

X_std_train = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [2326]:
model = LinearRegression()

model.fit(X_std_train, y_train)

y_pred_train = model.predict(X_std_train)
y_pred_test = model.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_qsec["REGRESION TRAIN"] = r2_train
errores_qsec["REGRESION TEST"] = r2_test

print(r2_train, r2_test)

1.0 1.0


#### RIDGE

In [2328]:
ridge = Ridge(alpha=5)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.9405045917311927 0.9257253132481335


In [2329]:
ridge = Ridge(alpha=15)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.8324365826141653 0.8596676250457136


In [2330]:
ridge = Ridge(alpha=25)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.742009937230728 0.7854839707074813


In [2331]:
ridge = Ridge(alpha=1)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_qsec["R2 RIDGE (lambda = 1)"] = r2_test

print(r2_train, r2_test)

0.9882208398246152 0.9633187182704794


## REGRESIÓN CON DUMMIES

### MPG

#### REGRESIÓN SIMPLE

In [2335]:
x = data.copy()
x = x.drop(columns=["model", "mpg"])

y = data["mpg"]

In [2336]:
numeric = ["disp", "hp", "drat", "wt", "qsec"]
categoric = ["cyl", "vs", "am", "gear", "carb"]

In [2337]:
encoded_features = pd.get_dummies(x[categoric],
                                  columns=categoric,
                                  drop_first=False)

numerical_features = x[numeric]

In [2338]:
data_to_model_standarized = StandardScaler().fit_transform(x[numeric])

data_to_model_df = pd.DataFrame(data_to_model_standarized, 
                                columns=numeric).reset_index()

encoded_features_df = encoded_features.reset_index()

data_to_model = data_to_model_df.merge(encoded_features_df, on='index')

data_to_model = data_to_model.drop('index', axis=1)

In [2339]:
x = data_to_model

In [2340]:
model = LinearRegression()
model.fit(x, y)

In [2341]:
model.coef_

array([ 4.33618166, -4.75801768,  0.62247393, -4.36239021,  0.64696571,
        0.99495275, -1.65374252,  0.65878977, -0.96542527,  0.96542527,
       -0.60605785,  0.60605785, -1.21425031, -0.09989537,  1.31414568,
       -2.4732813 , -3.45263562,  0.52635745, -1.38185841,  2.00428791,
        4.77712996])

In [2342]:
model.intercept_

22.24830933134702

In [2343]:
y_pred  = model.predict(x)

errores_mpg_d = {}
errores_mpg_d["REGRESION SIMPLE"] = r2_score(y,y_pred)

r2_score(y, y_pred)

0.8930749320864843

In [2344]:
x.head(1)

Unnamed: 0,disp,hp,drat,wt,qsec,cyl_4,cyl_6,cyl_8,vs_0,vs_1,...,am_1,gear_3,gear_4,gear_5,carb_1,carb_2,carb_3,carb_4,carb_6,carb_8
0,-0.57975,-0.543655,0.576594,-0.620167,-0.789601,False,True,False,True,False,...,True,False,True,False,False,False,False,True,False,False


#### INTERPRETACIÓN R2 Y BETAS

Vemos que, por ejemplo, que a 4 y 8 cilíndros son betas positivas que aumentan el mpg, meintras que 6 cilíndros lo disminuye, por lo que este análisis categórico es útil. De igual manera, cierta cantidad de carburadores aumentan o disminuyen la beta, por ende, es importante saber cómo impacta cada variable categórica. 

#### TRAIN TEST

In [2348]:
x = data.copy()
x = x.drop(columns=["model", "mpg"])

y = data["mpg"]


In [2349]:
numeric = ["disp", "hp", "drat", "wt", "qsec"]
categoric = ["cyl", "vs", "am", "gear", "carb"]

encoded_features = pd.get_dummies(x[categoric],
                                  columns=categoric,
                                  drop_first=False)

numerical_features = x[numeric]

data_to_model_standarized = StandardScaler().fit_transform(x[numeric])

data_to_model_df = pd.DataFrame(data_to_model_standarized, 
                                columns=numeric).reset_index()

encoded_features_df = encoded_features.reset_index()

data_to_model = data_to_model_df.merge(encoded_features_df, on='index')

data_to_model = data_to_model.drop('index', axis=1)

x = data_to_model

In [2350]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.4, random_state=2)


In [2351]:
regression = LinearRegression()

regression.fit(x_train, y_train)

In [2352]:
y_pred_train = regression.predict(x_train)
y_pred_test = regression.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_mpg_d["REGRESION TRAIN"] = r2_train
errores_mpg_d["REGRESION TEST"] = r2_test

In [2353]:
r2_train

1.0

In [2354]:
r2_test

-4.2299689784580705

In [2355]:
regression.coef_

array([-1.20174983e+01, -1.45593472e+01,  8.08462582e-01,  7.34494066e+00,
        2.96581422e+00, -1.86263765e+01, -8.88178420e-15,  1.86263765e+01,
       -3.07969561e-02,  3.07969561e-02, -4.72543162e+00,  4.72543162e+00,
       -8.36420739e+00,  3.07969561e-02,  8.33341043e+00, -1.77670853e+00,
       -2.33492094e+00, -2.32931734e+00,  6.44094681e+00,  0.00000000e+00,
        0.00000000e+00])

#### RIDGE

In [2357]:
ridge = Ridge(alpha=5)

ridge.fit(x_train, y_train)

y_pred_train = ridge.predict(x_train)
y_pred_test = ridge.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.8963290134140356 0.7480525984471514


In [2358]:
ridge = Ridge(alpha=15)

ridge.fit(x_train, y_train)

y_pred_train = ridge.predict(x_train)
y_pred_test = ridge.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_mpg_d["RIDGE (alpha = 15)"] = r2_test

print(r2_train, r2_test)

0.8511427378571068 0.7569815146527613


In [2359]:
ridge = Ridge(alpha=25)

ridge.fit(x_train, y_train)

y_pred_train = ridge.predict(x_train)
y_pred_test = ridge.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.8056743485708584 0.731157254764693


In [2360]:
ridge = Ridge(alpha=1)

ridge.fit(x_train, y_train)

y_pred_train = ridge.predict(x_train)
y_pred_test = ridge.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.9330075181122204 0.6246045184818407


### QSEC

#### REGRESIÓN SIMPLE

In [2363]:
x = data.copy()
x = x.drop(columns=["model", "qsec"])

y = data["qsec"]

In [2364]:
numeric = ["disp", "hp", "drat", "wt", "mpg"]
categoric = ["cyl", "vs", "am", "gear", "carb"]

In [2365]:
encoded_features = pd.get_dummies(x[categoric],
                                  columns=categoric,
                                  drop_first=False)

numerical_features = x[numeric]

In [2366]:
data_to_model_standarized = StandardScaler().fit_transform(x[numeric])

data_to_model_df = pd.DataFrame(data_to_model_standarized, 
                                columns=numeric).reset_index()

encoded_features_df = encoded_features.reset_index()

data_to_model = data_to_model_df.merge(encoded_features_df, on='index')

data_to_model = data_to_model.drop('index', axis=1)

In [2367]:
x = data_to_model

In [2368]:
model = LinearRegression()
model.fit(x, y)

In [2369]:
model.coef_

array([ 0.43075446, -0.13943274,  0.05676526,  0.78052333,  0.16456271,
        1.35708108,  0.25273853, -1.6098196 , -0.1328661 ,  0.1328661 ,
        0.84714676, -0.84714676, -0.50486661,  0.82741573, -0.32254912,
        0.98471823,  0.15030525,  0.75541386, -0.96231558, -0.58052281,
       -0.34759894])

In [2370]:
model.intercept_

17.862334647434704

In [2371]:
y_pred  = model.predict(x)

errores_qsec_d = {}
errores_qsec_d["REGRESION SIMPLE"] = r2_score(y, y_pred)

#### TRAIN TEST

In [2373]:
x = data.copy()
x = x.drop(columns=["model", "qsec"])

y = data["qsec"]


data_to_model_standarized = StandardScaler().fit_transform(x[numeric])

data_to_model_df = pd.DataFrame(data_to_model_standarized, 
                                columns=numeric).reset_index()

encoded_features_df = encoded_features.reset_index()

data_to_model = data_to_model_df.merge(encoded_features_df, on='index')

data_to_model = data_to_model.drop('index', axis=1)

X = data_to_model

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.4, random_state=2)

scaler = StandardScaler()
scaler.fit(X_train)

X_std_train = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [2374]:
model = LinearRegression()

model.fit(X_std_train, y_train)

y_pred_train = model.predict(X_std_train)
y_pred_test = model.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_qsec_d["R2 TRAIN"] = r2_train
errores_qsec_d["R2 TEST!"] = r2_test

print(r2_train, r2_test)

1.0 -10.855636305430204


#### RIDGE

In [2376]:
ridge = Ridge(alpha=5)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

errores_qsec_d["RIDGE (alpha = 5)"] = r2_test
print(r2_train, r2_test)

0.8543277817618067 0.7256632470977231


In [2377]:
ridge = Ridge(alpha=15)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.78053190755781 0.6884593857870829


In [2378]:
ridge = Ridge(alpha=25)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.7175469156091263 0.6290187667482221


In [2379]:
ridge = Ridge(alpha=1)

ridge.fit(X_std_train, y_train)

y_pred_train = ridge.predict(X_std_train)
y_pred_test = ridge.predict(X_test_std)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(r2_train, r2_test)

0.9000360673212355 0.6425296387803276


## COMPARACIÓN DE R2 

### MPG CON Y SIN DUMMIES

In [2382]:
errores_mpg

{'REGRESION SIMPLE': 0.869,
 'R2 REGRESIÓN TRAIN': 0.997079967049818,
 'R2 REGRESIÓN TEST': -5.411243537583586,
 'RIDGE (lambda = 25)': 0.7815667794815286}

In [2383]:
errores_mpg_d

{'REGRESION SIMPLE': 0.8930749320864843,
 'REGRESION TRAIN': 1.0,
 'REGRESION TEST': -4.2299689784580705,
 'RIDGE (alpha = 15)': 0.7569815146527613}

- Luego de haber generado los modelos de regresión para MPG como variable de salida, podemos ver que en el caso de la regresión simple, el generar dummies para modelar las variables categóricas aumentó el R2 de 86% a 89%, un incremento no muy grande pero sí significativo.
- Por otro lado, el modelo sin dummies y con dummies, para la regresión dio R2 intrigantes, ya que al ser valores negativos en la prueba con el set de test, probablemente existe un problema con las variables de entrada que están sesgando la salida de los datos. También es posible que haya alguna repetición de muestras en el test o un descalibre importante.
- Sin embargo, al hacer la regularización L2 para la regresión, el R2 se estabilizó en ambos casos, a pesar de que fue mejor el R2 en el caso del modelo sin dummies.

### QSEC CON Y SIN DUMMIES

In [2386]:
errores_qsec

{'REGRESION SIMPLE': 0.875,
 'REGRESION TRAIN': 1.0,
 'REGRESION TEST': 1.0,
 'R2 RIDGE (lambda = 1)': 0.9633187182704794}

In [2387]:
errores_qsec_d

{'REGRESION SIMPLE': 0.9082689440167555,
 'R2 TRAIN': 1.0,
 'R2 TEST!': -10.855636305430204,
 'RIDGE (alpha = 5)': 0.7256632470977231}

- Cuando modelamos QSEC como salida, vemos que el R2 para la regresión simple mejora en 5% al usar dummies,  lo cual es más significativo.
- El problema con el modelo se vuelve a presentar cuando partimos los datos en train y test, y habría que revisar detenidamente los datos para verificar donde se encuentra el sesgo.
- Similarmente a la otra variable de salida, el modelo con ridge, estabilizó el R2, pero no mejoró a la regresión lineal simple, incluso cuando comparamos con el valor para lambda más efectivo para aumentar el r2.