In [12]:
#Loading and Preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
url = "https://drive.google.com/uc?id=1FHmYNLs9v0Enc-UExEMpitOFGsWvB2dP"
df = pd.read_csv(url)

# Display basic info and head of the dataset
print(df.info())
print(df.head())

# Handle missing values for numeric columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Convert categorical variables to numeric (including missing values if any)
df = pd.get_dummies(df, drop_first=True)

# Split the data into features and target variable
X = df.drop('price', axis=1)
y = df['price']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [14]:
#Model Implementation
from sklearn.linear_model import LinearRegression

# Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [16]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree Regressor Model
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)


In [18]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Regressor Model
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


In [20]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Regressor Model
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)


In [22]:
from sklearn.svm import SVR

# Support Vector Regressor Model
svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)


In [24]:
#Model Evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

models = {
    'Linear Regression': y_pred_lr,
    'Decision Tree Regressor': y_pred_dt,
    'Random Forest Regressor': y_pred_rf,
    'Gradient Boosting Regressor': y_pred_gbr,
    'Support Vector Regressor': y_pred_svr
}

for model_name, y_pred in models.items():
    print(f"{model_name}:")
    print(f"R-squared: {r2_score(y_test, y_pred):.4f}")
    print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.4f}")
    print("\n")


Linear Regression:
R-squared: -5092680174140201573023744.0000
Mean Squared Error: 402036650000742768564696237735936.0000
Mean Absolute Error: 11096924181087860.0000


Decision Tree Regressor:
R-squared: 0.8666
Mean Squared Error: 10532678.5297
Mean Absolute Error: 2098.3090


Random Forest Regressor:
R-squared: 0.9537
Mean Squared Error: 3652007.2005
Mean Absolute Error: 1378.8925


Gradient Boosting Regressor:
R-squared: 0.9316
Mean Squared Error: 5402849.3765
Mean Absolute Error: 1685.6164


Support Vector Regressor:
R-squared: -0.1017
Mean Squared Error: 86973995.1459
Mean Absolute Error: 5705.0610




In [26]:
#Feature Importance Analysis
# Using Random Forest to determine feature importance
importances = rf.feature_importances_
feature_names = X.columns

# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                              Feature  Importance
7                          enginesize    0.549018
6                          curbweight    0.293698
14                         highwaympg    0.041858
11                         horsepower    0.028179
0                              car_ID    0.019694
..                                ...         ...
131  CarName_toyota corolla 1600 (sw)    0.000000
60        CarName_isuzu D-Max V-Cross    0.000000
143          CarName_vokswagen rabbit    0.000000
93               CarName_nissan nv200    0.000000
119               CarName_subaru baja    0.000000

[190 rows x 2 columns]


In [28]:
#Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Predict with the best model
y_pred_best_rf = best_rf.predict(X_test)

# Evaluate the best model
print(f"Best Model R-squared: {r2_score(y_test, y_pred_best_rf):.4f}")
print(f"Best Model Mean Squared Error: {mean_squared_error(y_test, y_pred_best_rf):.4f}")
print(f"Best Model Mean Absolute Error: {mean_absolute_error(y_test, y_pred_best_rf):.4f}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Model R-squared: 0.9425
Best Model Mean Squared Error: 4541322.2821
Best Model Mean Absolute Error: 1485.9455
