## Regularisation
    Ridge - Apply penalty on square of coefficients
    Lasso - Apply penalty on Abosulte value of coefficients
    Elastic Net(Ridge and Lasso) - Combination of Ridge and Lasso
    Dropout - Drop some features randomly and train the model

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

In [1]:
import pandas as pd

url = (
    "https://raw.githubusercontent.com/Sindhura-tr/Datasets/refs/heads/main/Cars93.csv"
)
df = pd.read_csv(url, na_values=["", "NA"], keep_default_na=False)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  93 non-null     int64  
 1   Manufacturer        93 non-null     object 
 2   Model               93 non-null     object 
 3   Type                93 non-null     object 
 4   Min.Price           93 non-null     float64
 5   Price               93 non-null     float64
 6   Max.Price           93 non-null     float64
 7   MPG.city            93 non-null     int64  
 8   MPG.highway         93 non-null     int64  
 9   AirBags             89 non-null     object 
 10  DriveTrain          93 non-null     object 
 11  Cylinders           93 non-null     object 
 12  EngineSize          93 non-null     float64
 13  Horsepower          93 non-null     int64  
 14  RPM                 93 non-null     int64  
 15  Rev.per.mile        93 non-null     int64  
 16  Man.trans.

In [4]:
s = df.isna().sum()
s[s > 0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(0)

## Separate X and Y features

In [6]:
X = df.drop(columns=["id", "Weight"])
Y = df[["Weight"]]

In [7]:
cat = list(X.columns[X.dtypes == "object"])
con = list(X.columns[X.dtypes != "object"])

In [8]:
cat

['Manufacturer',
 'Model',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin',
 'Make']

In [9]:
con

['Min.Price',
 'Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room']

## Feature Engineering: Data preprocessing and Data cleaning

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [11]:
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [12]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

In [13]:
pre = ColumnTransformer([("con", num_pipe, con), ("cat", cat_pipe, cat)]).set_output(
    transform="pandas"
)

In [14]:
pre

In [15]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,con__Min.Price,con__Price,con__Max.Price,con__MPG.city,con__MPG.highway,con__EngineSize,con__Horsepower,con__RPM,con__Rev.per.mile,con__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.388017,1.497844,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.008658,0.998227,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.755752,1.091905,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Split the data into training and testing

In [16]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre, Y, train_size=0.75, random_state=42
)

In [17]:
xtrain.head()

Unnamed: 0,con__Min.Price,con__Price,con__Max.Price,con__MPG.city,con__MPG.highway,con__EngineSize,con__Horsepower,con__RPM,con__Rev.per.mile,con__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
33,-0.727198,-0.37572,-0.081936,-0.065407,-0.016221,-0.356418,-0.745372,-1.146804,-0.095588,-0.387687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,-0.9916,-0.635937,-0.32804,0.471312,1.30379,-0.453339,-0.649388,-0.135877,0.096786,-0.449005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,0.445367,0.207167,0.009213,-0.244313,-0.581941,0.322027,0.310451,-0.135877,-0.581585,0.56274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69,0.272931,-0.001007,-0.218661,-0.781032,-1.14766,1.097393,0.502419,-0.809828,-1.300456,1.022624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,-1.003095,-0.979424,-0.911397,0.1135,0.172352,-0.841022,-0.323043,2.054464,0.157535,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
xtest.head()

Unnamed: 0,con__Min.Price,con__Price,con__Max.Price,con__MPG.city,con__MPG.highway,con__EngineSize,con__Horsepower,con__RPM,con__Rev.per.mile,con__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
40,-0.014462,0.030219,0.073018,0.292406,0.360925,-0.356418,0.310451,0.87505,1.058655,-0.234393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,-1.060574,-1.073102,-1.029891,1.186938,0.738071,-1.131784,-0.99493,1.212025,1.929401,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,-0.060445,-0.042642,-0.018132,-0.781032,-0.959087,0.322027,0.214467,-0.472853,-0.186713,0.899988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,-1.026087,-1.09392,-1.093695,1.544751,2.246656,-1.034864,-1.340472,0.538074,1.615527,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
ytrain.head()

Unnamed: 0,Weight
33,2850
11,2490
66,3200
69,3715
31,2530


In [20]:
ytest.head()

Unnamed: 0,Weight
40,2865
22,2270
55,3735
72,2350
0,2705


## 1. Build Ridge model

In [21]:
from sklearn.linear_model import Ridge, Lasso

In [22]:
base_model = Ridge(alpha=1)
base_model.fit(xtrain, ytrain)

## Evaluation

In [25]:
base_model.score(xtrain, ytrain)

0.9979242816667886

In [24]:
base_model.score(xtest, ytest)

0.9384983120832384

## Hyperparameter tuning

In [31]:
import numpy as np

params = {"alpha": np.arange(start=0.1, stop=100, step=0.1)}
params

{'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
         1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
         2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
         3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
         4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
         5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
         6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
         7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
         8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
        10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
        11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
        13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
        14.4, 14.5, 14.6, 14.

## Gridsearch CV
![image.png](attachment:image.png)

In [27]:
from sklearn.model_selection import GridSearchCV

In [32]:
model1 = Ridge()
gscv = GridSearchCV(estimator=model1, param_grid=params, cv=5, scoring="r2")
gscv.fit(xtrain, ytrain)

In [33]:
gscv.scoring

'r2'

In [34]:
gscv.best_params_

{'alpha': np.float64(10.6)}

In [35]:
gscv.best_score_

np.float64(0.923344221449916)

In [46]:
best_ridge = gscv.best_estimator_

In [47]:
best_ridge.fit(xtrain, ytrain)

In [48]:
best_ridge.score(xtrain, ytrain)

0.9770484261937513

In [50]:
best_ridge.score(xtest, ytest)

0.9777892302298112

In [51]:
from sklearn.model_selection import cross_val_score

In [53]:
scores = cross_val_score(best_ridge, xtrain, ytrain, cv=5, scoring="r2")

In [54]:
scores

array([0.92646944, 0.91887761, 0.90101871, 0.91431437, 0.95604098])

In [56]:
score = scores.mean()
score

np.float64(0.923344221449916)

In [37]:
base_model2 = Lasso(alpha=1)

In [38]:
base_model2.fit(xtrain, ytrain)

In [39]:
base_model2.score(xtrain, ytrain)

0.9927620044082287

In [40]:
base_model2.score(xtest, ytest)

0.9272199400787936

In [41]:
model2 = Lasso()
gscv2 = GridSearchCV(estimator=model2, param_grid=params, scoring="r2", cv=5)

In [42]:
gscv2.fit(xtrain, ytrain)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [43]:
gscv2.best_params_

{'alpha': np.float64(2.0)}

In [44]:
gscv2.best_score_

np.float64(0.9147151913952687)

## Ridge model perform better than Lasso model with testing score 97%. Let's consider ridge model for final predictions

In [57]:
url2 = "https://raw.githubusercontent.com/Sindhura-tr/Datasets/refs/heads/main/sample_cars93.csv"
xnew = pd.read_csv(url2, na_values=["", "NA"], keep_default_na=False)

In [58]:
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [59]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,con__Min.Price,con__Price,con__Max.Price,con__MPG.city,con__MPG.highway,con__EngineSize,con__Horsepower,con__RPM,con__Rev.per.mile,con__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.888138,-0.875337,-0.829362,0.1135,0.360925,-0.647181,-0.649388,-0.135877,0.673908,-0.449005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.428309,-0.37572,-0.318925,-0.244313,-0.016221,-0.453339,-0.649388,-0.135877,0.532158,-0.050439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.767376,1.352122,0.966282,-0.959938,-0.770514,-1.325626,2.134145,2.054464,-0.014589,1.022624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.968608,-1.083511,-1.130155,0.471312,0.738071,-0.841022,-1.206095,0.369586,0.441034,-1.307455,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
weights_predicted = best_ridge.predict(xnew_pre)
weights_predicted = weights_predicted.round(2)
weights_predicted

array([3366.91, 2727.97, 3215.01, 3060.61, 2259.77])

In [63]:
xnew["Weights Predicted"] = weights_predicted

In [64]:
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weights Predicted
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3366.91
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2727.97
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3215.01
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,3060.61
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2259.77


In [65]:
xnew.to_csv("Ridge_Weight Predicted Results.csv", index=False)