In [89]:
# Import necessary libraries for data manipulation and machine learning
import numpy as np                 # For numerical operations on arrays and matrices
import pandas as pd                # For data manipulation and analysis
import matplotlib.pyplot as plt    # For creating static visualizations
import seaborn as sns              # For statistical data visualization based on matplotlib

# Import modules for model evaluation and selection
from sklearn.model_selection import train_test_split, cross_val_score    # For splitting data and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score    # For model evaluation metrics

# Import machine learning algorithms
from sklearn.linear_model import LinearRegression    # For linear regression
from sklearn.ensemble import RandomForestRegressor  # For random forest regression
import xgboost as xgb                                # For XGBoost regression
import lightgbm as lgb                                # For LightGBM regression
from catboost import CatBoostRegressor               # For CatBoost regression

# Import modules for advanced model stacking techniques
from sklearn.ensemble import StackingRegressor        # For stacking multiple regressors
from mlxtend.regressor import StackingCVRegressor     # For stacked generalization with cross-validation

# Import additional libraries for hyperparameter tuning
import optuna    # For hyperparameter optimization

# Import metrics for additional model evaluation
from sklearn import metrics
# Import category_encoders for encoding categorical features
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [90]:
# Define the path to the CSV file containing the data
path = r'C:\Users\User\Desktop\Rashad\DATA\CarPrice_Assignment.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(path)

# Set option to display all columns
pd.set_option('display.max_columns', None)

# Display the DataFrame to view the loaded data
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,four,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,four,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,six,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,six,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [91]:
# Create copies of the DataFrame for different preprocessing approaches
df_copy = df.copy()
df_boost = df.copy()
df_cat = df.copy()

In [92]:
models = ["linear_regression", "random_forest", "xgboost", "lightgbm", "catboost", "catboost_custom"]


# Iterate through each model in the list
for model in models:
    if model == "linear_regression" :
        # Preprocessing for linear regression and SVR
        
        # Fill missing values with mean for numerical columns and mode for categorical columns
        for col in df_copy.columns:
            if df_copy[col].dtype in ['int64', 'float64']:
                df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
            elif df_copy[col].dtype == 'object':
                df_copy[col] = df_copy[col].fillna(df_copy[col].mode().iloc[0])
        df_copy.drop_duplicates(inplace=True)
        df_copy = df_copy.drop(columns=['car_ID','CarName'])
        df_copy['fueltype'] = df_copy['fueltype'].map(lambda x: 1 if x == 'gas' else 0)
        df_copy['aspiration'] = df_copy['aspiration'].map(lambda x: 1 if x == 'std' else 0)
        df_copy['doornumber'] = df_copy['doornumber'].map(lambda x: 1 if x == 'four' else 0)
        df_copy['carbody'] = df_copy['carbody'].map(lambda x: 1 if x == 'sedan' else 0)
        df_copy['drivewheel'] = df_copy['drivewheel'].map(lambda x: 1 if x == 'fwd' else 0)
        df_copy['enginelocation'] = df_copy['enginelocation'].map(lambda x: 1 if x == 'front' else 0)
        df_copy['enginetype'] = df_copy['enginetype'].map(lambda x: 1 if x == 'ohc' else 0)
        df_copy['cylindernumber'] = df_copy['cylindernumber'].map(lambda x: 1 if x == 'four' else 0)
        df_copy['fuelsystem'] = df_copy['fuelsystem'].map(lambda x: 1 if x == 'mpfi' else 0)
    elif model in ["random_forest", "xgboost", "lightgbm", "catboost"]:
        for col in df_boost.columns:
            if df_boost[col].dtype in ['int64', 'float64']:
                df_boost[col] = df_boost[col].fillna(df_boost[col].mean())
            elif df_boost[col].dtype == 'object':
                df_boost[col] = df_boost[col].fillna(df_boost[col].mode().iloc[0])
        dum_data = pd.get_dummies(df_boost, drop_first=True)
             
    elif model == "catboost_custom":
        columns_to_fill = ['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
       'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber',
       'fuelsystem']  # Assume these are the categorical columns
        df_cat[columns_to_fill] = df_cat[columns_to_fill].fillna('Missing Value')
        

# Split data into features (X) and target (y) for each preprocessing approach
x_log = df_copy.drop(columns=['price'])
y_log = df_copy['price']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_log, y_log, test_size=0.3, random_state=42)

x_boost = dum_data.drop(columns=['price'])
y_boost = dum_data['price']
X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(x_boost, y_boost, test_size=0.3, random_state=42)

x_cat = df_cat.drop(columns=['price'])
y_cat = df_cat['price']
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(x_cat, y_cat, test_size=0.3, random_state=42)

In [93]:
models = []

# Define default models for each algorithm
xgb_model_def = xgb.XGBRegressor()
lgb_model_def = lgb.LGBMRegressor()
catboost_model_def = CatBoostRegressor()
catboost_model_custom = CatBoostRegressor(cat_features=['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
       'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber',
       'fuelsystem'])
lg = LinearRegression()
rf = RandomForestRegressor()
# Define models for stacking
stacking_models = [('XGBoost', xgb_model_def),
                   ('LightGBM', lgb_model_def),
                   ('CatBoost', catboost_model_def),
                   ('CatBoost_Custom', catboost_model_custom),
                   ('LinearRegression', lg),
                   ('RandomForest', rf),]  # Add SVR to stacking_models

# Extend models list with default models
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom),
    ('LinearRegression', lg),
    ('RandomForest', rf)
])

In [94]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the given model on the training and testing data.

    Parameters:
    model_name (str): Name of the model for display purposes.
    model : Machine learning model object.
    X_train : Features of the training data.
    y_train : Target labels of the training data.
    X_test : Features of the testing data.
    y_test : Target labels of the testing data.

    Returns:
    float: R-squared score calculated from the model's predictions.
    """

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels on the testing data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R-squared:', r2)

    return r2

In [95]:
r2_df = pd.DataFrame(columns=['Model', 'R2'])

# Iterate through each model in the list of models
for model_name, model in models:
    # Train and evaluate the model, and calculate the R-squared score
    if model_name == 'CatBoost_Custom':
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    elif model_name in ['LinearRegression']:
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_log, y_train_log, X_test_log, y_test_log)
    else:
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)
        
    # Add model name and R2 score to the DataFrame
    if r2_score_value is not None:
        r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2': [r2_score_value]})], ignore_index=True)

# Sort the DataFrame by R2 score in descending order
r2_df_sorted = r2_df.sort_values(by='R2', ascending=False)

Model Performance for XGBoost
Mean Absolute Error: 1669.2533233366935
Mean Squared Error: 5918019.470733487
R-squared: 0.9145836393519994
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 413
[LightGBM] [Info] Number of data points in the train set: 143, number of used features: 25
[LightGBM] [Info] Start training from score 13408.503497
Model Performance for LightGBM
Mean Absolute Error: 1848.6076182878585
Mean Squared Error: 7418125.1130507
R-squared: 0.8929322126901019
Learning rate set to 0.030111
0:	learn: 7670.3933300	total: 2.25ms	remaining: 2.25s
1:	learn: 7521.7002372	total: 4.2ms	remaining: 2.1s
2:	learn: 7377.2006599	total: 6.08ms	remaining: 2.02s
3:	learn: 7268.2462421	total: 8.11ms	remaining: 2.02s
4:	learn: 7149.1881740	total: 10.1ms	remaining: 2.02s
5:	learn: 7029.2827576	total: 12ms	remaining: 2s
6:	learn: 6898.2423096	total: 14.1m

In [96]:
r2_df_sorted

Unnamed: 0,Model,R2
5,RandomForest,0.934894
2,CatBoost,0.929786
3,CatBoost_Custom,0.91647
0,XGBoost,0.914584
1,LightGBM,0.892932
4,LinearRegression,0.864256
