In [13]:
# Import standard-library modules
import os                          # For reading the optional data-path override from the environment

# Import necessary libraries for data manipulation and machine learning
import numpy as np                 # For numerical operations on arrays and matrices
import pandas as pd                # For data manipulation and analysis
import matplotlib.pyplot as plt    # For creating static visualizations
import seaborn as sns              # For statistical data visualization based on matplotlib

# Import modules for model evaluation and selection
from sklearn.model_selection import train_test_split, cross_val_score    # For splitting data and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score    # For model evaluation metrics

# Import machine learning algorithms
import xgboost as xgb                                # For XGBoost regression
import lightgbm as lgb                                # For LightGBM regression
from catboost import CatBoostRegressor               # For CatBoost regression

# Import modules for advanced model stacking techniques
from sklearn.ensemble import StackingRegressor        # For stacking multiple regressors
from mlxtend.regressor import StackingCVRegressor     # For stacked generalization with cross-validation

# Import additional libraries for hyperparameter tuning
import optuna    # For hyperparameter optimization

# Import metrics for additional model evaluation
from sklearn import metrics
# Import category_encoders for encoding categorical features
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Define the path to the CSV file containing the data.
# The default is the original author's local file; set the SCRAP_PRICE_CSV
# environment variable to point at the CSV on another machine without editing
# this cell.
path = os.environ.get('SCRAP_PRICE_CSV', r'C:\Users\User\Desktop\Rashad\DATA\scrap price.csv')

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(path)

# Display the DataFrame to view the loaded data
df

Unnamed: 0,ID,symboling,name,fueltypes,aspiration,doornumbers,carbody,drivewheels,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [4]:
# Feature engineering and preprocessing for the models fed one-hot encoded data.
# Every non-numeric column of df is expanded into indicator columns, dropping the
# first level of each.
# NOTE(review): every column except `price` becomes a predictor here, which
# includes the `ID` column and the one-hot expansion of `name` — confirm both
# are intended features.
dum_data = pd.get_dummies(data=df, drop_first=True)

# Separate the target from the predictors
y_boost = dum_data['price']
x_boost = dum_data.drop('price', axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
(X_train_boost,
 X_test_boost,
 y_train_boost,
 y_test_boost) = train_test_split(
    x_boost,
    y_boost,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Work on a copy so the raw frame `df` is left untouched
df_cat = df.copy()

# Treat every object-dtype column as categorical and replace its missing
# entries with an explicit placeholder label
columns_to_fill = df_cat.select_dtypes(include='object').columns
df_cat[columns_to_fill] = df_cat[columns_to_fill].fillna(value='Missing Value')

# Separate the target from the predictors (categoricals stay un-encoded)
y_cat = df_cat['price']
x_cat = df_cat.drop('price', axis=1)

# Same test_size and random_state as the one-hot encoded split
(X_train_cat,
 X_test_cat,
 y_train_cat,
 y_test_cat) = train_test_split(
    x_cat,
    y_cat,
    test_size=0.3,
    random_state=42,
)

In [8]:
# List the object-dtype columns of the raw frame
df.select_dtypes(include='object').columns

Index(['name', 'fueltypes', 'aspiration', 'doornumbers', 'carbody',
       'drivewheels', 'enginelocation', 'enginetype', 'cylindernumber',
       'fuelsystem'],
      dtype='object')

In [9]:
# Estimators created with each library's default hyperparameters
xgb_model_def = xgb.XGBRegressor()
lgb_model_def = lgb.LGBMRegressor()
catboost_model_def = CatBoostRegressor()

# CatBoost variant that is told which columns to handle as categorical, by name
catboost_model_custom = CatBoostRegressor(
    cat_features=[
        'name',
        'fueltypes',
        'aspiration',
        'doornumbers',
        'carbody',
        'drivewheels',
        'enginelocation',
        'enginetype',
        'cylindernumber',
        'fuelsystem',
    ]
)

# (name, estimator) pairs intended for stacking.
# NOTE(review): CatBoost_Custom refers to raw categorical column names, while the
# evaluation loop in this notebook fits the other three estimators on the one-hot
# encoded frame — confirm a single feature matrix can serve all four before
# handing this list to a stacking regressor.
stacking_models = [
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom),
]

# The models evaluated individually are the same pairs, held in their own list
models = list(stacking_models)

In [10]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """Fit a regressor, report its errors on both splits, and return the R-squared scores.

    Parameters
    ----------
    model_name : str
        Label used in the printed report header.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Features and targets the model is fitted on and then scored against.
    X_test, y_test
        Held-out features and targets used for the testing scores.

    Returns
    -------
    tuple
        ``(train_r2, test_r2)`` as returned by ``r2_score`` for each split.
    """
    model.fit(X_train, y_train)

    # Predict on the training split first, then on the testing split
    splits = (
        ('Training Data:', y_train, model.predict(X_train)),
        ('Testing Data:', y_test, model.predict(X_test)),
    )

    # Score both splits up front so nothing is printed if a metric call fails
    reports = [
        (
            heading,
            mean_absolute_error(actual, predicted),
            mean_squared_error(actual, predicted),
            r2_score(actual, predicted),
        )
        for heading, actual, predicted in splits
    ]

    print(f'Model Performance for {model_name}')
    for heading, mae, mse, r2 in reports:
        print(heading)
        print('Mean Absolute Error:', mae)
        print('Mean Squared Error:', mse)
        print('R-squared:', r2)

    (_, _, _, train_r2), (_, _, _, test_r2) = reports
    return train_r2, test_r2

In [11]:
# Collect one result row per model and build the summary frame in a single step,
# rather than concatenating onto an initially empty DataFrame on every iteration.
r2_records = []

# Train and evaluate each model
for model_name, model in models:
    # CatBoost_Custom is fitted on the frame that keeps the raw categorical
    # columns; every other model is fitted on the one-hot encoded frame.
    if model_name == 'CatBoost_Custom':
        train_r2, test_r2 = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    else:
        train_r2, test_r2 = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)

    r2_records.append({'Model': model_name, 'Train R2': train_r2, 'Test R2': test_r2})

# Passing `columns` keeps the expected layout even when `models` is empty
r2_df = pd.DataFrame(r2_records, columns=['Model', 'Train R2', 'Test R2'])

# Best test-set R-squared first
r2_df_sorted = r2_df.sort_values(by='Test R2', ascending=False)


Model Performance for XGBoost
Training Data:
Mean Absolute Error: 11.35899256993007
Mean Squared Error: 194.11165315454656
R-squared: 0.9999968152112816
Testing Data:
Mean Absolute Error: 1669.2533233366935
Mean Squared Error: 5918019.470733487
R-squared: 0.9145836393519994
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 413
[LightGBM] [Info] Number of data points in the train set: 143, number of used features: 25
[LightGBM] [Info] Start training from score 13408.503497
Model Performance for LightGBM
Training Data:
Mean Absolute Error: 880.3236128256416
Mean Squared Error: 1909636.4765294115
R-squared: 0.9686686058877678
Testing Data:
Mean Absolute Error: 1848.6076182878585
Mean Squared Error: 7418125.1130507
R-squared: 0.8929322126901019
Learning rate set to 0.030111
0:	learn: 7670

In [12]:
# Display the per-model R-squared summary, ordered by test R-squared (highest first)
r2_df_sorted

Unnamed: 0,Model,Train R2,Test R2
2,CatBoost,0.998541,0.929786
3,CatBoost_Custom,0.980721,0.91647
0,XGBoost,0.999997,0.914584
1,LightGBM,0.968669,0.892932
