In [234]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [235]:
# Load the "mpg" example dataset through seaborn and display it.
mpg_df = sns.load_dataset("mpg")

mpg_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [236]:
# Summarise column dtypes and non-null counts to spot missing values.
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [237]:
# Columns to include in the summary statistics below.
relevant_columns = [
    "mpg",
    "cylinders",
    "horsepower",
    "weight",
    "model_year",
]

mpg_df.loc[:, relevant_columns].describe()

Unnamed: 0,mpg,cylinders,horsepower,weight,model_year
count,398.0,398.0,392.0,398.0,398.0
mean,23.514573,5.454774,104.469388,2970.424623,76.01005
std,7.815984,1.701004,38.49116,846.841774,3.697627
min,9.0,3.0,46.0,1613.0,70.0
25%,17.5,4.0,75.0,2223.75,73.0
50%,23.0,4.0,93.5,2803.5,76.0
75%,29.0,8.0,126.0,3608.0,79.0
max,46.6,8.0,230.0,5140.0,82.0


In [238]:
# Bar palette: a neutral gray and a red accent.
gray = "#4b5563"
red = "#be123c"

# Only the first five rows of the dataset are plotted.
mpg5 = mpg_df.head(5)

fig = px.bar(
    data_frame=mpg5,
    x="name",
    y="mpg",
    title="Miles per gallon on different car models",
)
# One colour per bar, in row order: the first and third bars get the accent.
fig.update_traces(marker_color=[red, gray, red, gray, gray])
fig.show()

We want to predict "mpg": separate the features X from the target y, then perform a train|test split using scikit-learn.

Choose a test_size of 0.2 and a random_state of 42. Check the shapes of X_train, X_test, y_train, and y_test.


In [239]:
# Drop the two object-dtype columns ("name", "origin") and every row with a
# missing value (horsepower has 392 of 398 non-null entries).
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
mpg_df = mpg_df.drop(columns=["name", "origin"], errors="ignore").dropna()

In [240]:
# Target is "mpg"; every remaining column is used as a feature.
y = mpg_df["mpg"]
X = mpg_df.drop(columns="mpg")

X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year
0,8,307.0,130.0,3504,12.0,70
1,8,350.0,165.0,3693,11.5,70
2,8,318.0,150.0,3436,11.0,70
3,8,304.0,150.0,3433,12.0,70
4,8,302.0,140.0,3449,10.5,70
...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82
394,4,97.0,52.0,2130,24.6,82
395,4,135.0,84.0,2295,11.6,82
396,4,120.0,79.0,2625,18.6,82


In [241]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Sanity-check the size of each part of the split.
print(f"{X_train.shape= }")
print(f"{y_train.shape= }")
print(f"{X_test.shape= }")
print(f"{y_test.shape= }")

X_train.shape= (313, 6)
y_train.shape= (313,)
X_test.shape= (79, 6)
y_test.shape= (79,)


In [242]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only.
scaler = MinMaxScaler().fit(X_train)
scaler

In [243]:
# Apply the training-fitted scaler to both splits.
scaled_X_train, scaled_X_test = (
    scaler.transform(split) for split in (X_train, X_test)
)

print(f"{scaled_X_train.shape = }")
print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")

scaled_X_train.shape = (313, 6)
scaled_X_train.min() = 0.0
scaled_X_train.max() = 1.0


In [244]:
# The scaler was fitted on X_train only, so scaled test values are not
# guaranteed to lie inside [0, 1].
print(f"{scaled_X_test.shape = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")

scaled_X_test.shape = (79, 6)
scaled_X_test.min() = -0.0051948051948051965
scaled_X_test.max() = 1.0


In [245]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the scaled training features.
# These two lines were commented out, so the prints below only worked when
# `model` was still alive in the kernel from an earlier run.
model = LinearRegression()
model.fit(scaled_X_train, y_train)

# Learned parameters: the intercept and one coefficient per feature column.
print(f"{model.intercept_ = }")
print(f"{model.coef_ = }")

model.intercept_ = 27.71548625915781
model.coef_ = array([ -0.580865  ,   0.39018741,  -0.41884721, -23.14066817,
         1.03715653,   9.12763733])


In [246]:
# First row of the scaled test set (one value per feature column).
scaled_X_test[0]


array([0.2       , 0.06753247, 0.125     , 0.1633116 , 0.5952381 ,
       0.16666667])

In [247]:
# Predict mpg for every row of the scaled test set.
y_pred = model.predict(scaled_X_test)
y_pred

array([25.93279618, 26.29927859, 32.96548909, 26.8544302 , 29.45372581,
       29.08699476,  7.63874768, 29.23704446, 20.87736024, 28.91582943,
       12.48503673, 23.69898277, 16.41889199, 28.19668802, 22.03967916,
       30.88152399, 21.18311073, 31.64662323, 27.91649795, 29.78148169,
       19.84374006, 34.37342564, 33.99070737, 15.02368137, 28.76984129,
       25.99468114, 20.82555389, 16.63300722, 28.50008896, 23.82955469,
       12.81866721, 23.60612254, 21.36279705, 29.88377781, 11.05575113,
       34.7003572 , 10.84515757, 26.26088449, 11.58087074,  7.71809963,
       12.77118771, 27.81143288, 34.74146456, 26.49024951, 11.3582456 ,
        9.10855962, 17.66893597, 31.4355561 , 25.43286643, 30.38576336,
       11.60450412, 25.152572  , 24.52253515, 33.49442837, 28.90462724,
       17.82504408, 20.7761521 , 23.16942494, 23.10923128, 24.6273826 ,
        7.683821  , 22.9296696 , 27.33070582, 23.09519024, 28.13950034,
       28.69699507, 26.85355066, 29.61277981, 22.23472064,  9.37

In [248]:
# True test targets as a NumPy array, in the same row order as y_pred.
y_test.to_numpy()

array([26. , 21.6, 36.1, 26. , 27. , 28. , 13. , 26. , 19. , 29. , 15. ,
       19. , 16.9, 29. , 16.2, 28. , 20. , 32.4, 27.4, 35. , 22. , 44. ,
       34.1, 18. , 26. , 26. , 21. , 16. , 26. , 22. , 15. , 19.4, 19.2,
       31.6, 13. , 38. , 14. , 25. , 13. , 10. , 13. , 24. , 36. , 26. ,
       14. , 13. , 19. , 37.3, 21. , 29. , 14. , 24. , 28. , 36. , 22. ,
       16. , 20. , 20. , 25. , 25. , 12. , 22.5, 26.8, 23. , 21.1, 25. ,
       25.1, 29. , 19.9, 15. , 22. , 13. , 22. , 18. , 17.6, 28. , 15. ,
       16. , 27. ])

In [249]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# MAE: mean absolute error between the true and predicted mpg on the test set.
mean_absolute_error(y_test, y_pred)


2.503860089776124

In [250]:
# MSE: mean squared error between the true and predicted mpg on the test set.
mean_squared_error(y_test, y_pred)

10.502370329417305

In [251]:
# RMSE: square root of the MSE, which puts the error back in mpg units.
np.sqrt(mean_squared_error(y_test, y_pred))

3.240736078334258

In [252]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluation(X_train, X_test, y_train, y_test, model=None):
    """Fit a regressor on min-max scaled features and print its test errors.

    Args:
        X_train: Training features; also used to fit the scaler.
        X_test: Test features, scaled with the parameters learned from X_train.
        y_train: Training targets.
        y_test: Test targets that the predictions are compared against.
        model: Estimator exposing ``fit`` and ``predict``. It is (re)fitted in
            place on the scaled training data. A new ``LinearRegression`` is
            used when omitted.

    Prints:
        RMSE, MSE and MAE of the predictions on the test split.
    """
    # Honour the estimator passed by the caller; the previous version
    # overwrote it with a fresh LinearRegression, so the argument was ignored.
    if model is None:
        model = LinearRegression()

    # Fit the scaler on the training data this function receives, instead of
    # relying on a notebook-global scaler that may have seen other data.
    feature_scaler = MinMaxScaler().fit(X_train)
    scaled_X_train = feature_scaler.transform(X_train)
    scaled_X_test = feature_scaler.transform(X_test)

    model.fit(scaled_X_train, y_train)
    y_pred = model.predict(scaled_X_test)

    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(y_test, y_pred)
    print(f"RMSE = {np.sqrt(mse)}")
    print(f"MSE = {mse}")
    print(f"MAE = {mean_absolute_error(y_test, y_pred)}")


# Pass a fresh estimator so this cell does not depend on a `model` variable
# left over from another cell.
evaluation(X_train, X_test, y_train, y_test, LinearRegression())


RMSE = 3.240736078334258
MSE = 10.502370329417305
MAE = 2.503860089776124
