**Loading and Preprocessing**

In [39]:
import pandas as pd
# Load the dataset
car_data=pd.read_csv('CarPrice_Assignment.csv')
#display the data information
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [40]:
#display first 10 rows
car_data.head(10)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0
5,6,2,audi fox,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250.0
6,7,1,audi 100ls,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710.0
7,8,1,audi 5000,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920.0
8,9,1,audi 4000,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875.0
9,10,0,audi 5000s (diesel),gas,turbo,two,hatchback,4wd,front,99.5,...,131,mpfi,3.13,3.4,7.0,160,5500,16,22,17859.167


In [41]:
#Drop irrelevant column
car_data.drop(columns=['car_ID'], inplace=True)

In [42]:
#extract car brand from car name column
car_data['CarBrand']=car_data['CarName'].apply(lambda x:x.split(' ')[0].lower())
car_data.drop(columns=['CarName'], inplace=True)

In [43]:
#Encode categorical variables using one-hot encoding
categorical_columns = [
    'fueltype', 'aspiration', 'doornumber', 'carbody',
    'drivewheel', 'enginelocation', 'enginetype', 
    'cylindernumber', 'fuelsystem', 'CarBrand'
]
car_data_encoded=pd.get_dummies(car_data, columns=categorical_columns, drop_first=True)

In [44]:
missing_values=car_data_encoded.isnull().sum()
missing_values

symboling              0
wheelbase              0
carlength              0
carwidth               0
carheight              0
                      ..
CarBrand_toyouta       0
CarBrand_vokswagen     0
CarBrand_volkswagen    0
CarBrand_volvo         0
CarBrand_vw            0
Length: 70, dtype: int64

**Model Implementation**

In [46]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
numerical_features = [
    'symboling', 'wheelbase', 'carlength', 'carwidth', 
    'carheight', 'curbweight', 'enginesize', 'boreratio', 
    'stroke', 'compressionratio', 'horsepower', 'peakrpm', 
    'citympg', 'highwaympg'
]
car_data_encoded[numerical_features]=scaler.fit_transform(car_data_encoded[numerical_features])
missing_values,car_data_encoded.shape

(symboling              0
 wheelbase              0
 carlength              0
 carwidth               0
 carheight              0
                       ..
 CarBrand_toyouta       0
 CarBrand_vokswagen     0
 CarBrand_volkswagen    0
 CarBrand_volvo         0
 CarBrand_vw            0
 Length: 70, dtype: int64,
 (205, 70))

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [49]:
# Split the data into training and testing sets
X=car_data_encoded.drop(columns=['price'])
y=car_data_encoded['price']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train_scaled=scaler.fit_transform(X_train) # Fit and transform training data
X_test_scaled = scaler.transform(X_test)

In [50]:
# Initialize models
models={"Linear Regression":LinearRegression(),
 "Decision Tree Regressor":DecisionTreeRegressor(random_state=42),
"Random Forest Regressor":RandomForestRegressor(random_state=42),
"Gradient Boosting Regressor":GradientBoostingRegressor(random_state=42),
"Support Vector Regressor":SVR()}

**Model Evaluation**

In [52]:
#Train and evaluate the models
results={}
for model_name,model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred=model.predict(X_test_scaled)
    results[model_name]={
        "R-squared":r2_score(y_test,y_pred),
        "MSE":mean_squared_error(y_test,y_pred),
        "MAE":mean_absolute_error(y_test,y_pred)
    }
results_df=pd.DataFrame(results).T
results_df.sort_values(by="R-squared",ascending=False)

Unnamed: 0,R-squared,MSE,MAE
Random Forest Regressor,0.9575088,3354425.0,1297.759
Gradient Boosting Regressor,0.9285843,5637845.0,1686.943
Decision Tree Regressor,0.8837301,9178813.0,2070.187
Support Vector Regressor,-0.1006857,86892560.0,5701.502
Linear Regression,-4.661681e+22,3.680119e+30,299598000000000.0


**Feature Importance Analysis**

In [54]:
best_model=models["Random Forest Regressor"]
feature_importances= pd.Series(best_model.feature_importances_,index=X.columns).sort_values(ascending=False)

In [55]:
top_features=feature_importances.head()
top_features

enginesize    0.556220
curbweight    0.296141
highwaympg    0.044554
horsepower    0.024115
carwidth      0.013527
dtype: float64

**Hyperparameter Tuning**

In [57]:
#Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Random Forest Regressor
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Perform Grid Search with 3-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, scoring='r2', verbose=1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the test set
y_pred_tuned = best_model.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

# Output results
best_params, r2_tuned, mse_tuned, mae_tuned


Fitting 3 folds for each of 108 candidates, totalling 324 fits


({'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 10,
  'n_estimators': 100},
 0.9431549093869716,
 4487580.020260946,
 1465.937703500198)