In [1]:
# Import necessary libraries for data manipulation and machine learning
import numpy as np                 # For numerical operations on arrays and matrices
import pandas as pd                # For data manipulation and analysis
import matplotlib.pyplot as plt    # For creating static visualizations
import seaborn as sns              # For statistical data visualization based on matplotlib

# Import modules for model evaluation and selection
from sklearn.model_selection import train_test_split, cross_val_score    # For splitting data and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score    # For model evaluation metrics

# Import machine learning algorithms
from sklearn.linear_model import LinearRegression    # For linear regression
from sklearn.ensemble import RandomForestRegressor  # For random forest regression
from sklearn.svm import SVR                          # For support vector regression
import xgboost as xgb                                # For XGBoost regression
import lightgbm as lgb                                # For LightGBM regression
from catboost import CatBoostRegressor               # For CatBoost regression

# Import modules for advanced model stacking techniques
from sklearn.ensemble import StackingRegressor        # For stacking multiple regressors
from mlxtend.regressor import StackingCVRegressor     # For stacked generalization with cross-validation

# Import additional libraries for hyperparameter tuning
import optuna    # For hyperparameter optimization

# Import metrics for additional model evaluation
from sklearn import metrics
# Import category_encoders for encoding categorical features
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Define the path to the CSV file containing the data
path = r'C:\Users\User\Desktop\Rashad\DATA\Housing.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(path)

# Display the DataFrame to view the loaded data
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [7]:
# Create copies of the DataFrame for different preprocessing approaches
df_copy = df.copy()
df_boost = df.copy()
df_cat = df.copy()
df_rf = df.copy()

In [18]:
models = ["linear_regression", "random_forest", "xgboost", "lightgbm", "catboost", "catboost_custom"]


# Iterate through each model in the list
for model in models:
    if model == "linear_regression":
        # Preprocessing for linear regression and SVR
        
        # Fill missing values with mean for numerical columns and mode for categorical columns
        for col in df_copy.columns:
            if df_copy[col].dtype in ['int64', 'float64']:
                df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
            elif df_copy[col].dtype == 'object':
                df_copy[col] = df_copy[col].fillna(df_copy[col].mode().iloc[0])
        df_copy.drop_duplicates(inplace=True)
        df_copy = pd.get_dummies(df_copy, drop_first=True)        
    elif model in ["xgboost", "lightgbm", "catboost"]:
        
        dum_data = pd.get_dummies(df_boost, drop_first=True)
    elif model == "random_forest":
        if df_rf[col].dtype in ['int64', 'float64']:
                df_rf[col] = df_rf[col].fillna(df_rf[col].mean())
        elif df_rf[col].dtype == 'object':
                df_rf[col] = df_rf[col].fillna(df_rf[col].mode().iloc[0])
        df_rf = pd.get_dummies(df_rf, drop_first=True)        
   
    elif model == "catboost_custom":
        columns_to_fill = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus']  # Assume these are the categorical columns
        df_cat[columns_to_fill] = df_cat[columns_to_fill].fillna('Missing Value')
        

# Split data into features (X) and target (y) for each preprocessing approach
x_log = df_copy.drop(columns=['price'])
y_log = df_copy['price']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_log, y_log, test_size=0.3, random_state=42)

x_boost = dum_data.drop(columns=['price'])
y_boost = dum_data['price']
X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(x_boost, y_boost, test_size=0.3, random_state=42)

x_rf = df_rf.drop(columns=['price'])
y_rf = df_rf['price']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(x_rf, y_rf, test_size=0.3, random_state=42)

x_cat = df_cat.drop(columns=['price'])
y_cat = df_cat['price']
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(x_cat, y_cat, test_size=0.3, random_state=42)

In [19]:
models = []

# Define default models for each algorithm
xgb_model_def = xgb.XGBRegressor()
lgb_model_def = lgb.LGBMRegressor()
catboost_model_def = CatBoostRegressor()
catboost_model_custom = CatBoostRegressor(cat_features=['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'])
lg = LinearRegression()
rf = RandomForestRegressor()

# Define models for stacking
stacking_models = [('XGBoost', xgb_model_def),
                   ('LightGBM', lgb_model_def),
                   ('CatBoost', catboost_model_def),
                   ('CatBoost_Custom', catboost_model_custom),
                   ('LinearRegression', lg),
                   ('RandomForest', rf)]  # Add SVR to stacking_models

# Extend models list with default models
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom),
    ('LinearRegression', lg),
    ('RandomForest', rf),
])

In [20]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the given model on the training and testing data.

    Parameters:
    model_name (str): Name of the model for display purposes.
    model : Machine learning model object.
    X_train : Features of the training data.
    y_train : Target labels of the training data.
    X_test : Features of the testing data.
    y_test : Target labels of the testing data.

    Returns:
    float: R-squared score calculated from the model's predictions.
    """

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels on the testing data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R-squared:', r2)

    return r2

In [21]:
r2_df = pd.DataFrame(columns=['Model', 'R2'])

# Iterate through each model in the list of models
for model_name, model in models:
    # Train and evaluate the model, and calculate the R-squared score
    if model_name == 'CatBoost_Custom':
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    elif model_name in ['LinearRegression']:
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_log, y_train_log, X_test_log, y_test_log)
    elif model_name == 'random_forest':
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_rf, y_train_rf, X_test_rf, y_test_rf)
    else:
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)
        
    # Add model name and R2 score to the DataFrame
    if r2_score_value is not None:
        r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2': [r2_score_value]})], ignore_index=True)

# Sort the DataFrame by R2 score in descending order
r2_df_sorted = r2_df.sort_values(by='R2', ascending=False)

Model Performance for XGBoost
Mean Absolute Error: 1010839.625
Mean Squared Error: 2082446308887.1836
R-squared: 0.5164289061594206
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 129
[LightGBM] [Info] Number of data points in the train set: 381, number of used features: 13
[LightGBM] [Info] Start training from score 4762406.272966
Model Performance for LightGBM
Mean Absolute Error: 961022.5848819272
Mean Squared Error: 1602977666834.8694
R-squared: 0.6277677554301107
Learning rate set to 0.035153
0:	learn: 1743325.6816056	total: 1.37ms	remaining: 1.37s
1:	learn: 1714081.0175645	total: 2.01ms	remaining: 1s
2:	learn: 1689225.4357749	total: 3.11ms	remaining: 1.03s
3:	learn: 1663274.0795653	total: 4.27ms	remaining: 1.06s
4:	learn: 1637605.0753578	total: 5.36ms	remaining: 1.07s
5:	learn

In [17]:
r2_df_sorted

Unnamed: 0,Model,R2
4,LinearRegression,0.646335
3,CatBoost_Custom,0.646055
2,CatBoost,0.643488
1,LightGBM,0.627768
5,RandomForest,0.582146
0,XGBoost,0.516429
6,SVR,-0.036173
