### 1 Loading and Preprocessing

In [1]:
# Loading Data Set
import pandas as pd
import numpy as np

data = pd.read_csv('CarPrice_Assignment.csv')

data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [4]:
# Preprocessing

data.isnull().sum()

data = data.dropna()  

data.duplicated().sum()

data = data.drop_duplicates()

data = pd.get_dummies(data)

### 2 Model Implementation

In [35]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Regressor": SVR()
}

In [7]:
# 1) Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = data.drop('price', axis=1)
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X_train, y_train)

y_pred_linear = model_LinearRegression.predict(X_test)

# Evaluate performance using R-squared, MSE, and MAE
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
r2 = r2_score(y_test, y_pred_linear)
mse = mean_squared_error(y_test, y_pred_linear)
mae = mean_absolute_error(y_test, y_pred_linear)

print(f"Linear Regression R-squared: {r2}")
print(f"Linear Regression MSE: {mse}")
print(f"Linear Regression MAE: {mae}")

Linear Regression R-squared: -1.2611894535162156
Linear Regression MSE: 178507387.43124586
Linear Regression MAE: 7036.822888063449


In [9]:
# 2) Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

model_DecisionTreeRegressor = DecisionTreeRegressor(random_state=42)
model_DecisionTreeRegressor.fit(X_train, y_train)

y_pred_tree = model_DecisionTreeRegressor.predict(X_test)

r2= r2_score(y_test, y_pred_tree)
mse = mean_squared_error(y_test, y_pred_tree)
mae = mean_absolute_error(y_test, y_pred_tree)

print(f"Decision Tree Regressor R-squared: {r2}")
print(f"Decision Tree Regressor MSE: {mse}")
print(f"Decision Tree Regressor MAE: {mae}")

Decision Tree Regressor R-squared: 0.8487346784800848
Decision Tree Regressor MSE: 11941492.700436316
Decision Tree Regressor MAE: 2078.4309024390245


In [11]:
# 3) Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

model_RandomForestRegressor = RandomForestRegressor(n_estimators=100, random_state=42)
model_RandomForestRegressor.fit(X_train, y_train)

y_pred_rf = model_RandomForestRegressor.predict(X_test)

r2 = r2_score(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
mae = mean_absolute_error(y_test, y_pred_rf)

print(f"Random Forest Regressor R-squared: {r2}")
print(f"Random Forest Regressor MSE: {mse}")
print(f"Random Forest Regressor MAE: {mae}")

Random Forest Regressor R-squared: 0.9548659727284703
Random Forest Regressor MSE: 3563061.5913066827
Random Forest Regressor MAE: 1344.8036097560976


In [13]:
# 4) Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor

model_GradientBoostingRegressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
model_GradientBoostingRegressor.fit(X_train, y_train)

y_pred_gb = model_GradientBoostingRegressor.predict(X_test)

r2 = r2_score(y_test, y_pred_gb)
mse = mean_squared_error(y_test, y_pred_gb)
mae = mean_absolute_error(y_test, y_pred_gb)

print(f"Gradient Boosting Regressor R-squared: {r2}")
print(f"Gradient Boosting Regressor MSE: {mse}")
print(f"Gradient Boosting Regressor MAE: {mae}")

Gradient Boosting Regressor R-squared: 0.9344779157332396
Gradient Boosting Regressor MSE: 5172576.7884338265
Gradient Boosting Regressor MAE: 1617.6610080371565


In [15]:
# 5) Support Vector Regressor

from sklearn.svm import SVR

model_SVR = SVR(kernel='rbf')
model_SVR.fit(X_train, y_train)

y_pred_svr = model_SVR.predict(X_test)

r2 = r2_score(y_test, y_pred_svr)
mse = mean_squared_error(y_test, y_pred_svr)
mae = mean_absolute_error(y_test, y_pred_svr)

print(f"Support Vector Regressor R-squared: {r2}")
print(f"Support Vector Regressor MSE: {mse}")
print(f"Support Vector Regressor MAE: {mae}")

Support Vector Regressor R-squared: -0.10198909696462466
Support Vector Regressor MSE: 86995450.27993043
Support Vector Regressor MAE: 5707.168361260932


### 3 Model Evaluation

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, mae, r2

In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    
    mse = mean_squared_error(y_test, y_pred)
    
    mae = mean_absolute_error(y_test, y_pred)
    
    results[model_name] = {
        "R²": r2,
        "MSE": mse,
        "MAE": mae
    }

results_df = pd.DataFrame(results)

print("Model Evaluation Results:")
print(results_df)

Model Evaluation Results:
                                R²           MSE          MAE
Linear Regression        -1.261189  1.785074e+08  7036.822888
Decision Tree             0.849184  1.190600e+07  2141.967488
Random Forest             0.953936  3.636476e+06  1356.734951
Gradient Boosting         0.935617  5.082652e+06  1625.930772
Support Vector Regressor -0.101989  8.699545e+07  5707.168361


### 4 Feature Importance Analysis 

In [26]:
from sklearn.feature_selection import RFE, SelectKBest

model = LinearRegression()
rfe = RFE(model, n_features_to_select=13) 
rfe.fit(X, y)

selected_features_rfe = X.columns[rfe.support_]

print("\n=== Selected Features for RFE ===")
print(selected_features_rfe.tolist())


=== Selected Features for RFE ===
['CarName_chevrolet impala', 'CarName_jaguar xk', 'drivewheel_rwd', 'enginelocation_rear', 'enginetype_dohc', 'enginetype_dohcv', 'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'cylindernumber_eight', 'cylindernumber_four', 'cylindernumber_twelve', 'fuelsystem_4bbl']


In [28]:
selector_reg = SelectKBest(k=5)   
X_reg_selected = selector_reg.fit_transform(X, y)
selected_fetures_reg = X.columns[selector_reg.get_support()]

print("\n=== Selected Features for Regression ===")
print(selected_features_reg)


=== Selected Features for Regression ===
Index(['cylindernumber_twelve', 'cylindernumber_two', 'fuelsystem_4bbl',
       'fuelsystem_mfi', 'fuelsystem_spfi'],
      dtype='object')


  f = msb / msw


In [31]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

feature_importance = rf_model.feature_importances_

feature_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(feature_data)

                              Feature  Importance
7                          enginesize    0.551711
6                          curbweight    0.265877
14                         highwaympg    0.064909
11                         horsepower    0.032951
0                              car_ID    0.016822
..                                ...         ...
29                     CarName_bmw x5    0.000000
132  CarName_toyota corolla 1600 (sw)    0.000000
120               CarName_subaru baja    0.000000
126            CarName_subaru tribeca    0.000000
28                     CarName_bmw x4    0.000000

[200 rows x 2 columns]


### 5 Hyperparameter Tuning

In [48]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestRegressor()

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

y_pred_best_rf = best_rf_model.predict(X_test)
print("Best Random Forest Model R2: ", r2_score(y_test, y_pred_best_rf))


Best Parameters:  {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Best Random Forest Model R2:  0.9529084881943272


In [44]:
from sklearn.model_selection import GridSearchCV

print("Best parameters for Random Forest:", grid_search_rf.best_params_)

y_pred_rf_tuned = grid_search_rf.best_estimator_.predict(X_test)
mse_rf_tuned, mae_rf_tuned, r2_rf_tuned = evaluate_model(y_test, y_pred_rf_tuned)
print(f"R2 score after tuning: {r2_rf_tuned}")

Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
R2 score after tuning: 0.9556628712146326
