## Feature Selection
![image-2.png](attachment:image-2.png)

## Read the Dataset

In [2]:
import pandas as pd
df = pd.read_csv(
    "https://raw.githubusercontent.com/Sindhura-tr/Datasets/refs/heads/main/Cars93.csv",
    na_values=["", "NA"],
    keep_default_na=False,
)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


## perform  Basic data quality checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  93 non-null     int64  
 1   Manufacturer        93 non-null     object 
 2   Model               93 non-null     object 
 3   Type                93 non-null     object 
 4   Min.Price           93 non-null     float64
 5   Price               93 non-null     float64
 6   Max.Price           93 non-null     float64
 7   MPG.city            93 non-null     int64  
 8   MPG.highway         93 non-null     int64  
 9   AirBags             89 non-null     object 
 10  DriveTrain          93 non-null     object 
 11  Cylinders           93 non-null     object 
 12  EngineSize          93 non-null     float64
 13  Horsepower          93 non-null     int64  
 14  RPM                 93 non-null     int64  
 15  Rev.per.mile        93 non-null     int64  
 16  Man.trans.

In [4]:
df.shape

(93, 28)

In [5]:
df.isna().sum()

id                     0
Manufacturer           0
Model                  0
Type                   0
Min.Price              0
Price                  0
Max.Price              0
MPG.city               0
MPG.highway            0
AirBags                4
DriveTrain             0
Cylinders              0
EngineSize             0
Horsepower             0
RPM                    0
Rev.per.mile           0
Man.trans.avail        0
Fuel.tank.capacity     0
Passengers             0
Length                 0
Wheelbase              0
Width                  0
Turn.circle            0
Rear.seat.room         2
Luggage.room          11
Weight                 0
Origin                 0
Make                   0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

## Seperate x & y

In [7]:
x=df.drop(columns=["Weight","id"])
y=df[["Weight"]]

In [8]:
x.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [9]:
y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


## Data Preprocessing and data cleaning

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

## Separate categorical and continuous data

In [11]:
cat = list(x.columns[x.dtypes=="object"])
con = list(x.columns[x.dtypes!="object"])

In [12]:
print(cat)

['Manufacturer', 'Model', 'Type', 'AirBags', 'DriveTrain', 'Cylinders', 'Man.trans.avail', 'Origin', 'Make']


In [13]:
print(con)

['Min.Price', 'Price', 'Max.Price', 'MPG.city', 'MPG.highway', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room']


In [14]:
num_pipe = make_pipeline(SimpleImputer(strategy="mean"),StandardScaler())

In [15]:
cat_pipe = make_pipeline(SimpleImputer(strategy="most_frequent"),OrdinalEncoder())

In [16]:
pre = ColumnTransformer([("con",num_pipe , con),("cat",cat_pipe,cat)]).set_output(
    transform="pandas"
)

In [17]:
x_pre = pre.fit_transform(x)
x_pre.head()

Unnamed: 0,con__Min.Price,con__Price,con__Max.Price,con__MPG.city,con__MPG.highway,con__EngineSize,con__Horsepower,con__RPM,con__Rev.per.mile,con__Fuel.tank.capacity,...,con__Luggage.room,cat__Manufacturer,cat__Model,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin,cat__Make
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,-1.033015,0.0,48.0,3.0,2.0,1.0,1.0,1.0,1.0,0.0
1,1.388017,1.497844,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,...,0.396643,0.0,55.0,2.0,0.0,1.0,3.0,1.0,1.0,1.0
2,1.008658,0.998227,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,...,0.039228,1.0,8.0,0.0,1.0,1.0,3.0,1.0,1.0,3.0
3,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,...,1.111472,1.0,0.0,2.0,1.0,1.0,3.0,1.0,1.0,2.0
4,0.755752,1.091905,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,...,-0.318186,2.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0,4.0


## Feature Selection => Backward ELimination

In [18]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

In [19]:
base_model = LinearRegression()
for_sel = SequentialFeatureSelector(
    base_model,direction="backward", n_features_to_select=10

)
for_sel.fit(x_pre,y)

In [20]:
imp_cols = for_sel.get_feature_names_out()
imp_cols

array(['con__MPG.highway', 'con__Horsepower', 'con__RPM',
       'con__Passengers', 'con__Length', 'con__Wheelbase',
       'cat__Manufacturer', 'cat__Type', 'cat__Cylinders', 'cat__Origin'],
      dtype=object)

In [21]:
imp_cols[0
]

'con__MPG.highway'

In [22]:

imp_cols[0].split("_")

['con', '', 'MPG.highway']

In [23]:
imp_cols[0].split("_")[-1]

'MPG.highway'

In [24]:
imp_features = []
for i in imp_cols:
    fea = i.split("_")[-1]
    imp_features.append(fea)

In [25]:
imp_features

['MPG.highway',
 'Horsepower',
 'RPM',
 'Passengers',
 'Length',
 'Wheelbase',
 'Manufacturer',
 'Type',
 'Cylinders',
 'Origin']

In [26]:
x_sel = x[imp_features]
x_sel.head(2)

Unnamed: 0,MPG.highway,Horsepower,RPM,Passengers,Length,Wheelbase,Manufacturer,Type,Cylinders,Origin
0,31,140,6300,5,177,102,Acura,Small,4,non-USA
1,25,200,5500,5,195,115,Acura,Midsize,6,non-USA


## Data Preprocessing - 2nd stage

In [27]:
x_sel_cat = list(x_sel.columns[x_sel.dtypes == "object"])
x_sel_con = list(x_sel.columns[x_sel.dtypes != "object"])

In [28]:
cat_sel_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

In [29]:
con_sel_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [30]:
pre_sel = ColumnTransformer(
    [("con", con_sel_pipe, x_sel_con), ("cat", cat_sel_pipe, x_sel_cat)]
).set_output(transform="pandas")

In [31]:
pre_sel

In [32]:
x_sel_pre = pre_sel.fit_transform(x_sel)
x_sel_pre.head(2)

Unnamed: 0,con__MPG.highway,con__Horsepower,con__RPM,con__Passengers,con__Length,con__Wheelbase,cat__Manufacturer_Acura,cat__Manufacturer_Audi,cat__Manufacturer_BMW,cat__Manufacturer_Buick,...,cat__Type_Sporty,cat__Type_Van,cat__Cylinders_3,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Origin_USA,cat__Origin_non-USA
0,0.360925,-0.073484,1.717489,-0.083243,-0.427186,-0.286932,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.770514,1.078322,0.369586,-0.083243,0.812171,1.629649,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## Train Test split

In [33]:

from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    x_sel_pre, y, train_size=0.8, random_state=21
)

In [34]:

xtrain.shape

(74, 52)

In [35]:

xtest.shape

(19, 52)

In [36]:

xtrain.head()

Unnamed: 0,con__MPG.highway,con__Horsepower,con__RPM,con__Passengers,con__Length,con__Wheelbase,cat__Manufacturer_Acura,cat__Manufacturer_Audi,cat__Manufacturer_BMW,cat__Manufacturer_Buick,...,cat__Type_Sporty,cat__Type_Van,cat__Cylinders_3,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Origin_USA,cat__Origin_non-USA
57,-0.016221,-0.265452,-0.304365,-0.083243,-0.564892,0.155356,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
31,0.172352,-0.323043,2.054464,-0.083243,-0.840305,-0.87665,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
62,-0.959087,1.116716,1.212025,-0.083243,0.467905,0.450214,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
29,-0.204794,1.347077,0.87505,0.884457,1.294143,1.334791,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
51,-0.581941,1.27029,-1.146804,0.884457,2.464647,1.924508,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [37]:

xtest.head()

Unnamed: 0,con__MPG.highway,con__Horsepower,con__RPM,con__Passengers,con__Length,con__Wheelbase,cat__Manufacturer_Acura,cat__Manufacturer_Audi,cat__Manufacturer_BMW,cat__Manufacturer_Buick,...,cat__Type_Sporty,cat__Type_Van,cat__Cylinders_3,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Origin_USA,cat__Origin_non-USA
23,-0.016221,-0.975733,-0.809828,-0.083243,-0.771452,-1.024079,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
86,-1.336233,-0.111878,-0.472853,1.852158,0.261346,1.334791,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
91,-0.204794,-0.572601,0.201098,-0.083243,0.467905,0.007926,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
21,-0.581941,0.060893,-0.809828,0.884457,1.362996,0.892502,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
17,-0.581941,0.502419,-1.820755,0.884457,2.120381,1.777079,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [38]:
ytrain.head()

Unnamed: 0,Weight
57,2920
31,2530
62,3730
29,3490
51,4055


## Build the model

In [39]:
model = LinearRegression()
model.fit(xtrain, ytrain)

## Model Evaluation

In [40]:
model.score(xtrain, ytrain)

0.9847955363381506

In [41]:

model.score(xtest, ytest)

0.9456792138598572

In [42]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ypred_train = model.predict(xtrain)
mse = mean_squared_error(ytrain, ypred_train)
mae = mean_absolute_error(ytrain, ypred_train)
r2 = r2_score(ytrain, ypred_train)

In [43]:
print("Training Scores")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

Training Scores
MSE: 5114.342115202194
MAE: 51.49019611951663
R2: 0.9847955363381506


In [44]:

ypred_test = model.predict(xtest)
mse = mean_squared_error(ytest, ypred_test)
mae = mean_absolute_error(ytest, ypred_test)
r2 = r2_score(ytest, ypred_test)

print("Testing scores")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

Testing scores
MSE: 19701.983612628825
MAE: 104.40555966529897
R2: 0.9456792138598572


## This model is performing well in training as well in testing. We can this model for out of sample predictions