In [2]:
#The "mpg" dataset, which stands for "miles per gallon". It contains information about various car models and their characteristics, such as cylinders, displacement, horsepower, weight, acceleration, model year, origin, and miles per gallon (mpg) fuel efficiency.

#Here's a brief explanation of each column:

#mpg: Miles per gallon, representing the fuel efficiency of the car.
###cylinders: Number of cylinders in the engine.
###weight: Weight of the car, often measured in pounds.
#acceleration: Acceleration of the car from 0 to 60 miles per hour (mph) in seconds.
#model year: Year of manufacturing of the car model.
#origin: Origin of the car, represented as a categorical variable (1: USA, 2: Europe, 3: Japan).
#name: The name of the car model.
#This dataset is commonly used for regression tasks, where the goal is to predict the fuel efficiency (mpg) of a car based on its other characteristics

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings 
warnings.filterwarnings('ignore')

In [4]:
df=sns.load_dataset('mpg')

In [5]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [6]:
df.drop("name",axis=1,inplace = True)

In [7]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [8]:
df['horsepower']=df['horsepower'].fillna(df['horsepower'].median())

In [9]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [11]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

In [12]:
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [13]:
df['origin']=df['origin'].map({'usa':1, 'japan':2 , 'europe':3})

In [14]:
df['origin']=df['origin'].astype(int)

In [15]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int32
dtype: object

In [16]:
#seperate X and Y
X = df.drop('mpg', axis =1)
y = df['mpg']

In [17]:
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [18]:
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [19]:
#train_test_split 
from sklearn.model_selection import train_test_split 
X_train ,X_test, y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)

In [20]:
X_train.shape , X_test.shape 

((278, 7), (120, 7))

In [21]:
from sklearn.linear_model import LinearRegression 
regression_model =LinearRegression()

In [22]:
regression_model

In [23]:
regression_model.fit(X_train,y_train)

X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
350,4,105.0,63.0,2215,14.9,81,1
59,4,97.0,54.0,2254,23.5,72,3
120,4,121.0,112.0,2868,15.5,73,3
12,8,400.0,150.0,3761,9.5,70,1
349,4,91.0,68.0,1985,16.0,81,2
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
255,4,140.0,88.0,2720,15.4,78,1
72,8,304.0,150.0,3892,12.5,72,1
235,4,97.0,75.0,2265,18.2,77,2


In [24]:


for i, col_name in enumerate(X_train.columns):
    print(f"The Coefficent for {col_name} is {regression_model.coef_[i]}")

The Coefficent for cylinders is -0.3176142302799355
The Coefficent for displacement is 0.02623748259907893
The Coefficent for horsepower is -0.018270764913124602
The Coefficent for weight is -0.007487750398361897
The Coefficent for acceleration is 0.0504067346197135
The Coefficent for model_year is 0.8470951427061368
The Coefficent for origin is 1.519095838797505


In [25]:
#coefficients are relatively smaller, if one independent variable change
#slightly there will be not much difference in prediction.
#This is sometime is called smoother model

In [26]:
from sklearn.metrics import r2_score

In [27]:
y_pred_linear=regression_model.predict(X_test)
r2_linear = r2_score(y_test , y_pred_linear)
print(f"R Square of linear regression {r2_linear}")

R Square of linear regression 0.8348001123742286


In [28]:
#Ridge regression 
from sklearn.linear_model import Ridge 
ridge_regression_model=Ridge(alpha=0.1)
ridge_regression_model.fit(X_train,y_train)

In [29]:
for i, col_name in enumerate(X_train.columns):
    print(f"The Coefficent for {col_name} is {ridge_regression_model.coef_[i]}")

The Coefficent for cylinders is -0.3170032101006609
The Coefficent for displacement is 0.026213249757982955
The Coefficent for horsepower is -0.01826325248144886
The Coefficent for weight is -0.0074873260502131105
The Coefficent for acceleration is 0.05036896947442607
The Coefficent for model_year is 0.8470062938903142
The Coefficent for origin is 1.517452828565376


In [30]:
#We don't see much variation in coeff of ridge regression as compared to  linear Regresioon 

In [31]:
from sklearn.linear_model import Lasso

In [32]:
lasso_regression_model=Lasso(alpha=0.5)
lasso_regression_model.fit(X_train, y_train)

for i,col_name in enumerate(X_train.columns):
    print(f"The coeffiecient for{col_name} is {lasso_regression_model.coef_[i]}")

The coeffiecient forcylinders is -0.0
The coeffiecient fordisplacement is 0.006208198888300381
The coeffiecient forhorsepower is -0.011058382987169605
The coeffiecient forweight is -0.00698267316802309
The coeffiecient foracceleration is 0.0
The coeffiecient formodel_year is 0.7446549520038191
The coeffiecient fororigin is 0.0


In [33]:
#3 feature coefficient are 0 , Lasso helps in feature selection 
y_pred_lasso = lasso_regression_model.predict(X_test)
r2_lasso=r2_score(y_test,y_pred_lasso)
print(f"R-Squared score for Lasso Regression :{r2_lasso}")

R-Squared score for Lasso Regression :0.8277934716635554


In [34]:
from sklearn.linear_model import ElasticNet
elastic_net_model=ElasticNet(alpha=1, l1_ratio=0.5 )
elastic_net_model.fit(X_test,y_test)

In [35]:
for i,col_name in enumerate(X_train.columns):
    print(f"The coeffiecient for{col_name} is {elastic_net_model.coef_[i]}")

The coeffiecient forcylinders is -0.0
The coeffiecient fordisplacement is -0.010700611933493196
The coeffiecient forhorsepower is 0.016689487626228995
The coeffiecient forweight is -0.006065381102730179
The coeffiecient foracceleration is 0.07184195495576955
The coeffiecient formodel_year is 0.6306886371945707
The coeffiecient fororigin is 0.0


In [36]:
#predict on the test set 
y_pred_elastic_net = elastic_net_model.predict(X_test)
#Calculate evaluation metrices 
r2_elastic_net=r2_score(y_test, y_pred_elastic_net)
print(f"R-Squared Score for Elastic Net Regression :{r2_elastic_net}")

R-Squared Score for Elastic Net Regression :0.8363728641522763


In [41]:
from sklearn.linear_model import LassoCV
lassoCV=LassoCV(cv=5)
lassoCV.fit(X_train, y_train)
y_pred =lassoCV.predict(X_test)
score=r2_score(y_test,y_pred)
print("R2_Score", score)

R2_Score 0.8082805983844751


In [45]:
from sklearn.linear_model import RidgeCV
ridgeCV=RidgeCV(cv=5)
ridgeCV.fit(X_train,y_train)
y_pred=ridgeCV.predict(X_test)
score=r2_score(y_test , y_pred)
print("R2 Score",score)

R2 Score 0.8354145247502054


In [47]:
ridgeCV.get_params()

{'alpha_per_target': False,
 'alphas': (0.1, 1.0, 10.0),
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'scoring': None,
 'store_cv_results': None,
 'store_cv_values': 'deprecated'}