In [4]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
mpg_df = pd.read_csv("/content/auto-mpg.csv")

In [6]:
mpg_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [7]:
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [8]:
mpg_df[mpg_df['horsepower'].str.isnumeric()==False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [9]:
mpg_df=mpg_df.replace('?',np.nan)

In [10]:
mpg_df=mpg_df.drop('car name',axis=1)

In [11]:
mpg_df=mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)

In [12]:
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


In [13]:
X = mpg_df.drop('mpg', axis=1)
y = mpg_df[['mpg']]

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [15]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cylinders is -0.3933914726185951
The coefficient for displacement is 0.0228189485626794
The coefficient for horsepower is -0.01987288429942265
The coefficient for weight is -0.007040899059965791
The coefficient for acceleration is 0.06119510660375097
The coefficient for model year is 0.7925439069034931
The coefficient for origin is 1.198885770256523


In [16]:
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -18.745559500957693


In [17]:
ridge = Ridge(alpha=.3) #coefficients are prevented to become too big by this alpha value
ridge.fit(X_train,y_train)
for i,col in enumerate(X_train.columns):
    print ("Ridge model coefficients for {} is: {}".format(col,ridge.coef_[0][i]))

Ridge model coefficients for cylinders is: -0.3910109773015278
Ridge model coefficients for displacement is: 0.02275383240832408
Ridge model coefficients for horsepower is: -0.019847121051076663
Ridge model coefficients for weight is: -0.007041049335966887
Ridge model coefficients for acceleration is: 0.0610442553551988
Ridge model coefficients for model year is: 0.7924614648057181
Ridge model coefficients for origin is: 1.1951487231119182


In [18]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
for i,col in enumerate(X_train):
    print ("Lasso model coefficients for {} is: {}".format(col,lasso.coef_[i]))

Lasso model coefficients for cylinders is: -0.0
Lasso model coefficients for displacement is: 0.013673889482069441
Lasso model coefficients for horsepower is: -0.020033878247769584
Lasso model coefficients for weight is: -0.006941906453989721
Lasso model coefficients for acceleration is: 0.01731938106285126
Lasso model coefficients for model year is: 0.7829827416740949
Lasso model coefficients for origin is: 0.8740281623154403


In [19]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))

0.8081802739111359
0.8472274567567305


In [20]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8081801786048972
0.8471949402532152


In [21]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.8070482564532553
0.8423221942478804
