In [None]:
# Numeric, data-frame and plotting libraries imported for the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas / scikit-learn deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [None]:
# Read the car listings from "cars.csv" (path is relative to the working
# directory) into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="cars.csv")

In [None]:
# Print each column's name, non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   body-style         205 non-null    object 
 5   drive-wheels       205 non-null    object 
 6   engine-location    205 non-null    object 
 7   width              205 non-null    float64
 8   height             205 non-null    float64
 9   engine-type        205 non-null    object 
 10  engine-size        205 non-null    int64  
 11  horsepower         205 non-null    object 
 12  city-mpg           205 non-null    int64  
 13  highway-mpg        205 non-null    int64  
 14  price              205 non-null    int64  
dtypes: float64(2), int64(5), object(8)
memory usage: 24.2+ KB


In [None]:
# Replace the "?" placeholder with NaN in the two columns that were read as
# strings because of it, so they can be imputed numerically afterwards
# (models cannot be trained on "?" strings or missing values).
#
# The result is assigned back to `df` instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a column selection
# (chained assignment), which is deprecated and leaves `df` unchanged when
# pandas Copy-on-Write is active.
placeholder_cols = ["normalized-losses", "horsepower"]
df[placeholder_cols] = df[placeholder_cols].replace("?", np.nan)

In [None]:
# SimpleImputer fills missing entries column by column.
from sklearn.impute import SimpleImputer

# NaN is SimpleImputer's default missing-value marker, so only the strategy
# (replace with the column mean) has to be spelled out.
si = SimpleImputer(strategy="mean")

In [None]:
# Fill the NaNs in the two affected columns with their column means and
# write the imputed values back into `df`.
# NOTE(review): the imputer is fitted on the whole dataset, before the
# train/test split made later, so test rows contribute to the means.
impute_cols = ["normalized-losses", "horsepower"]
imputed_values = si.fit_transform(df[impute_cols])
df[impute_cols] = imputed_values

In [None]:
# Handling categorical data: display the columns that still have object
# (string) dtype.
df.select_dtypes(include=object)

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc
...,...,...,...,...,...,...
200,volvo,gas,sedan,rwd,front,ohc
201,volvo,gas,sedan,rwd,front,ohc
202,volvo,gas,sedan,rwd,front,ohcv
203,volvo,diesel,sedan,rwd,front,ohc


In [None]:
# Remember the names of the object-dtype (categorical) columns so they can
# be encoded together.
catcol = df.select_dtypes(include=object).columns

In [None]:
# Display the categorical column names collected above.
catcol

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [None]:
# OrdinalEncoder maps each category of a column to a numeric code.
from sklearn.preprocessing import OrdinalEncoder

# Fit one encoder on all categorical columns and overwrite them with the codes.
# NOTE(review): the codes put a numeric order on categories such as `make`
# that have no natural order; confirm this is acceptable for a linear model.
oe = OrdinalEncoder()
encoded_cats = oe.fit_transform(df[catcol])
df[catcol] = encoded_cats

# SPLITTING DATA INTO TRAIN & TEST

In [None]:
# x holds the features (every column except the last); y holds the target
# (the last column, `price` in the df.info() listing). These are not the
# training/testing sets - that split happens in the next cell.
target_col = df.columns[-1]
x = df.drop(columns=target_col)
y = df[target_col]

In [None]:
# train_test_split shuffles the rows and divides them into two parts.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(x, y, test_size=0.3, random_state=1)
xtrain, xtest, ytrain, ytest = split_parts

In [None]:
# Initialize the baseline model.
from sklearn.linear_model import LinearRegression
# Plain linear regression with default settings (no regularization).
linreg=LinearRegression()

In [None]:
# Fit on the training split, then predict car prices for the held-out rows.
# `fit` returns the estimator itself, so the two calls can be chained.
ypred = linreg.fit(xtrain, ytrain).predict(xtest)

In [None]:
# r2_score: share of the variance in the true prices explained by the
# predictions.
from sklearn.metrics import r2_score

test_r2 = r2_score(ytest, ypred)
print(test_r2)



0.796556678039738


In [None]:
# R^2 of the linear model on the training vs. the test split
# (`score` on a regressor is R^2, not classification accuracy).
train, test = linreg.score(xtrain, ytrain), linreg.score(xtest, ytest)
print(f"training result {train}", f"testing result {test}", sep="\n")

training result 0.8504573774895473
testing result 0.796556678039738


In [None]:
# Shows feature coefficients.
# Large coefficients → possible overfitting.
# NOTE(review): no feature scaling is applied in this notebook, so the
# magnitudes also reflect each column's units, not only its importance.
linreg.coef_

array([ 4.51384957e+01,  1.53127607e+00, -2.00099087e+02, -6.22650015e+02,
       -1.70235175e+02,  1.86860719e+03,  1.64133620e+04,  7.89452171e+02,
        3.62663990e+02,  2.83174279e+02,  9.83682875e+01, -1.08169245e+01,
        3.08017854e+02, -4.17024371e+02])

## IMPLEMENT RIDGE (L2) REGULARIZATION

In [None]:
#Imports Ridge & Lasso models.
from sklearn.linear_model import Ridge,Lasso

In [None]:
# Ridge (L2) regression with regularization strength alpha = 10; the L2
# penalty discourages large coefficients. `fit` returns the estimator, so
# `l2` is the trained model.
l2 = Ridge(alpha=10).fit(xtrain, ytrain)

# Predictions for the held-out rows.
ypred = l2.predict(xtest)

In [None]:
# R^2 of the Ridge model on the training vs. the test split.
train, test = l2.score(xtrain, ytrain), l2.score(xtest, ytest)
print(f"training result {train}", f"testing result {test}", sep="\n")

training result 0.8109538582620314
testing result 0.8150222867376526


In [None]:
# Ridge coefficients: the largest ones shrink compared with the plain linear
# model, but none of them becomes exactly zero.
l2.coef_

array([ 2.08658930e+02, -5.60173023e-01, -1.86340249e+02, -9.06610516e+02,
       -6.30655861e+02,  1.56860422e+03,  2.57047785e+03,  3.64420144e+02,
        5.72916414e+02,  5.15948757e+02,  1.04441215e+02,  2.21332730e+01,
        2.11271281e+02, -2.72864381e+02])

## LASSO (L1) REGULARIZATION

1.   Lasso can make coefficients exactly zero
2.   Performs feature selection

In [None]:
# Lasso (L1) regression with a larger penalty, alpha = 169. `fit` returns
# the estimator, so `l1` is the trained model.
l1 = Lasso(alpha=169).fit(xtrain, ytrain)

# Predictions for the held-out rows.
ypred = l1.predict(xtest)

In [None]:
# R^2 of the Lasso model on the training vs. the test split.
train, test = l1.score(xtrain, ytrain), l1.score(xtest, ytest)
print(f"training result {train}", f"testing result {test}", sep="\n")

training result 0.8134398734696936
testing result 0.8132785388258221


## Hyperparameter Tuning

In [None]:
# Tunes the Lasso penalty (alpha) with 5-fold cross-validation on the
# training split only. Picking alpha by looking at the test-set score would
# leak test information into model selection and make the reported test
# score optimistic, so the test split is scored once, after alpha is fixed.
from sklearn.model_selection import cross_val_score

cv_r2_by_alpha = {}
for alpha in range(50, 200):
    # A fresh candidate per alpha; `l1` is only rebound after selection.
    candidate = Lasso(alpha=alpha)
    fold_scores = cross_val_score(candidate, xtrain, ytrain, cv=5, scoring="r2")
    cv_r2_by_alpha[alpha] = fold_scores.mean()

# Alpha with the highest mean cross-validated R^2.
best_alpha = max(cv_r2_by_alpha, key=cv_r2_by_alpha.get)
print(f"best alpha {best_alpha} mean CV R^2 {cv_r2_by_alpha[best_alpha]}")

# Refit on the full training split with the selected alpha and report the
# train/test R^2 once.
l1 = Lasso(alpha=best_alpha)
l1.fit(xtrain, ytrain)
train = l1.score(xtrain, ytrain)
test = l1.score(xtest, ytest)
print(f"training result {train}")
print(f"testing result {test}")

50 0.8470766580894457 0.8039357524111292 
51 0.8469400918997987 0.8040563116971535 
52 0.8468008217042287 0.8041758137792859 
53 0.8466588475145203 0.804294258613894 
54 0.8465141693306735 0.8044116462009783 
55 0.846366753206345 0.8045280890625282 
56 0.8462166665464133 0.8046433613894476 
57 0.8460638758901722 0.8047575764926311 
58 0.8459083812440562 0.8048707343507517 
59 0.8457501826085781 0.8049828349622445 
60 0.8455892418755834 0.8050939923124314 
61 0.8454256348053019 0.8052039775493058 
62 0.845259323750945 0.8053129055444648 
63 0.8450903087125129 0.8054207762979084 
64 0.8449185896900054 0.805527589809637 
65 0.8447441242480676 0.8056334613836723 
66 0.8445669968426667 0.8057381594093508 
67 0.8443871654497523 0.8058418002250007 
68 0.8442046300591084 0.8059443838554217 
69 0.8440193906905744 0.8060459102476979 
70 0.8438314003304445 0.8061464961035814 
71 0.8436407525659697 0.806245907076023 
72 0.8434474008300155 0.8063442608140199 
73 0.8432513451225818 0.806441557317572

In [None]:
#FINAL SUMMARY

# 🔹 What is Regularization?
# Regularization prevents overfitting by penalizing large model coefficients.

# 🔹 Why Regularization is Needed

# Dataset has many features
# High correlation
# Linear regression overfits
# Regularization improves generalization

# 🔹 Key Learning

# Ridge → keeps all features
# Lasso → removes less important features
# Alpha controls model complexity