In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Read the modelling dataset from a CSV located relative to the working directory.
file_path = 'data_modified.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows as a bordered table, keeping the row index visible.
print(tabulate(data.head(), showindex="always", tablefmt='rounded_grid', headers='keys'))

In [None]:
# Separate the regression target ('Income') from the predictor columns.
y = data['Income']
X = data.drop(columns='Income')

In [None]:
# Partition the predictor columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
# NOTE(review): a column of any other dtype (e.g. bool, category, int32) matches
# neither selector — confirm the dataset has none.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Numerical branch: a single StandardScaler step.
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encoding, configured with handle_unknown='ignore'.
cat_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

In [None]:
# Split the RAW features first (70% train / 15% validation / 15% test) so that
# the preprocessing statistics can be learned from the training rows only.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp_raw, y_temp, test_size=0.5, random_state=42)

# Fit the scaler / encoder on the training split only, then apply the already
# fitted transform to the validation and test splits. Fitting on the whole
# dataset before splitting leaks validation/test statistics (means, variances,
# category sets) into training and makes the reported metrics optimistic.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

In [None]:
# Collect the shape of every split; the first row is the table header.
data_shapes = [["Dataset", "Shape"]]
data_shapes += [
    [label, split.shape]
    for label, split in (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_val", X_val),
        ("y_val", y_val),
        ("X_test", X_test),
        ("y_test", y_test),
    )
]

In [None]:
# Render the split-shape table; its first row supplies the column headers.
print(tabulate(data_shapes, tablefmt="grid", headers="firstrow"))

In [None]:
# Registry of candidate regressors keyed by display name. Every estimator that
# is given a `random_state` below receives the same fixed seed.
_seeded = {'random_state': 6112024}

models = {
    # Linear family
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    # Ensembles, kernels, neural net, trees and neighbours (order is preserved
    # because later cells iterate this dict in insertion order)
    'Random Forest': RandomForestRegressor(**_seeded),
    'Gradient Boosting': GradientBoostingRegressor(**_seeded),
    'Support Vector Regressor': SVR(),
    'XGBoost': XGBRegressor(**_seeded),
    'MLP Regressor': MLPRegressor(**_seeded),
    'Decision Tree': DecisionTreeRegressor(**_seeded),
    'AdaBoost': AdaBoostRegressor(**_seeded),
    'Bagging': BaggingRegressor(**_seeded),
    'Kernel Ridge': KernelRidge(),
    'Gaussian Process': GaussianProcessRegressor(),
    'KNeighbors': KNeighborsRegressor(),
    'Hist Gradient Boosting': HistGradientBoostingRegressor(**_seeded),
}

In [None]:
# Echo the model registry so the notebook displays each estimator's repr.
models

In [None]:
# Hyperparameter search spaces, keyed by the same display names used in
# `models`. Only the models listed here are tuned by the grid-search cell.
param_grids = {
    'Random Forest': dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    'Gradient Boosting': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 4, 5],
    ),
    'XGBoost': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 4, 5],
    ),
    # NOTE(review): scikit-learn documents MLP `learning_rate` as only used with
    # solver='sgd' — confirm this axis is intended with the default solver.
    'MLP Regressor': dict(
        hidden_layer_sizes=[(50, 50), (100, 50), (100, 100)],
        activation=['relu', 'tanh'],
        learning_rate=['constant', 'adaptive'],
    ),
    'Decision Tree': dict(
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    'AdaBoost': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
    ),
    'Bagging': dict(
        n_estimators=[10, 50, 100],
        max_samples=[0.5, 0.75, 1.0],
        max_features=[0.5, 0.75, 1.0],
    ),
    'Kernel Ridge': dict(
        alpha=[0.1, 1.0, 10.0],
        kernel=['linear', 'rbf', 'poly'],
        gamma=[0.01, 0.1, 1.0, None],
    ),
    'Gaussian Process': dict(
        alpha=[1e-10, 1e-2, 1.0],
        n_restarts_optimizer=[0, 1, 2],
    ),
    'KNeighbors': dict(
        n_neighbors=[3, 5, 10],
        weights=['uniform', 'distance'],
        algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    ),
    'Hist Gradient Boosting': dict(
        learning_rate=[0.01, 0.1, 0.2],
        max_iter=[100, 200, 300],
        max_depth=[None, 10, 20],
        min_samples_leaf=[20, 50, 100],
    ),
}

In [None]:
# Echo the search spaces so the notebook displays them for inspection.
param_grids

In [None]:
# Fit every candidate with its default hyperparameters and score it on the
# validation split. Results are stored per model name in `metrics`.
metrics = {}

for name, model in models.items():
    # Time the fit call only; perf_counter is monotonic, so the measured
    # duration cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    end_time = time.perf_counter()

    y_val_pred = model.predict(X_val)
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)

    metrics[name] = {
        'RMSE': rmse,
        'MAE': mae,
        'R^2': r2,
        'Time': end_time - start_time
    }

    print(f'{name} RMSE: {rmse}, MAE: {mae}, R^2: {r2}, Time: {end_time - start_time} seconds')


In [None]:
# Header row followed by one formatted row per baseline model
# (metrics to 3 decimals, fit time to 4).
metrics_table = [["Model", "RMSE", "MAE", "R^2", "Time (seconds)"]]
metrics_table.extend(
    [
        model_name,
        format(metric_values['RMSE'], ".3f"),
        format(metric_values['MAE'], ".3f"),
        format(metric_values['R^2'], ".3f"),
        format(metric_values['Time'], ".4f"),
    ]
    for model_name, metric_values in metrics.items()
)

In [None]:
# Render the baseline metrics; the first row supplies the column headers.
print(tabulate(metrics_table, tablefmt="grid", headers="firstrow"))

In [None]:
# Reshape the nested metrics dict into a frame: one row per model, one column per metric.
metrics_df = pd.DataFrame(metrics).transpose()

# Min-max scale the three quality metrics across models so they can be combined.
scaler = MinMaxScaler()
normalized_metrics = scaler.fit_transform(metrics_df[['RMSE', 'MAE', 'R^2']])
normalized_df = pd.DataFrame(
    normalized_metrics,
    index=metrics_df.index,
    columns=['RMSE', 'MAE', 'R^2'],
)

# Relative importance of each metric in the combined score.
weights = {'RMSE': 0.4, 'MAE': 0.3, 'R^2': 0.3}

# Error metrics are inverted (1 - value) because lower is better; R^2 is used
# as-is because higher is better. The terms are summed left to right.
_rmse_term = weights['RMSE'] * (1 - normalized_df['RMSE'])
_mae_term = weights['MAE'] * (1 - normalized_df['MAE'])
_r2_term = weights['R^2'] * normalized_df['R^2']
normalized_df['Score'] = _rmse_term + _mae_term + _r2_term

In [None]:
# The winner is the model with the largest weighted score.
best_model_name = normalized_df['Score'].idxmax()
best_model_instance = models[best_model_name]
best_model_metrics = metrics[best_model_name]

# Summarise the winner's raw (un-normalised) validation metrics.
best_model_table = [
    ["Metric", "Value"],
    ["Best Model", best_model_name],
]
best_model_table += [
    [label, format(best_model_metrics[key], spec)]
    for label, key, spec in (
        ("RMSE", 'RMSE', ".3f"),
        ("MAE", 'MAE', ".3f"),
        ("R^2", 'R^2', ".3f"),
        ("Time (seconds)", 'Time', ".4f"),
    )
]
print("\nBest Model Metrics:")
print(tabulate(best_model_table, tablefmt="grid", headers="firstrow"))

In [None]:
# Scatter the best baseline model's test-set predictions against the true
# values; points on the dashed diagonal are perfect predictions.
y_test_pred = best_model_instance.predict(X_test)

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y_test, y_test_pred, alpha=0.6, edgecolors='w', linewidth=0.5, color='royalblue', label='Predictions')
_diagonal = [y_test.min(), y_test.max()]
ax.plot(_diagonal, _diagonal, color='red', linestyle='--', linewidth=2, label='Perfect Prediction')
ax.set_xlabel('Actual Values', fontsize=14)
ax.set_ylabel('Predicted Values', fontsize=14)
ax.set_title(f'{best_model_name} Predictions vs Actual Values', fontsize=16)
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(fontsize=12)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.show()

In [None]:
def perform_grid_search(model, param_grid, X_train, y_train, cv=5, scoring=None, n_jobs=-1):
    """Run an exhaustive cross-validated grid search and return the winner.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV`` as the base model.
    param_grid : dict
        Mapping of parameter name to the list of values to try.
    X_train, y_train : array-like
        Training features and targets the search is fitted on.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, callable or None, default None
        Scoring forwarded to ``GridSearchCV``; ``None`` keeps the previous
        behaviour of leaving the choice to ``GridSearchCV``.
    n_jobs : int, default -1
        Parallelism setting forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` of the fitted search.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

In [None]:
# Grid-search every model that has an entry in `param_grids`, then score the
# best estimator of each search on the validation split.
metrics_param = {}
best_models = {}

for model_name in param_grids.keys():
    print(f"\nTuning {model_name}...")
    model = models[model_name]
    param_grid = param_grids[model_name]
    # Time the whole search; perf_counter is monotonic, so the measured
    # duration cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    best_model, best_params, best_score = perform_grid_search(model, param_grid, X_train, y_train)
    end_time = time.perf_counter()
    best_models[model_name] = best_model
    y_val_pred = best_model.predict(X_val)
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    # The " (Tuned)" suffix keeps these keys distinct from the baseline names.
    metrics_param[f'{model_name} (Tuned)'] = {'RMSE': rmse, 'MAE': mae, 'R^2': r2, 'Time': end_time - start_time}
    print(f'Best parameters: {best_params}')
    print(f'Best score: {best_score}')
    print(f'{model_name} (Tuned) RMSE: {rmse}, MAE: {mae}, R^2: {r2}, Time: {end_time - start_time} seconds')


In [None]:
# Header row followed by one formatted row per tuned model
# (errors to 2 decimals, R^2 and search time to 6).
metrics_table_param = [["Model", "RMSE", "MAE", "R^2", "Time (seconds)"]]
metrics_table_param.extend(
    [
        model_name,
        format(metric_values['RMSE'], ".2f"),
        format(metric_values['MAE'], ".2f"),
        format(metric_values['R^2'], ".6f"),
        format(metric_values['Time'], ".6f"),
    ]
    for model_name, metric_values in metrics_param.items()
)

In [None]:
# Render the tuned-model metrics; the first row supplies the column headers.
print(tabulate(metrics_table_param, tablefmt="grid", headers="firstrow"))

In [None]:
# Reshape the tuned-model metrics: one row per model, one column per metric.
metrics_df_param = pd.DataFrame(metrics_param).transpose()

# Min-max scale the three quality metrics in place. The raw values remain
# available in the `metrics_param` dict.
scaler = MinMaxScaler()
_scaled_values = scaler.fit_transform(metrics_df_param[['RMSE', 'MAE', 'R^2']])
metrics_df_param[['RMSE', 'MAE', 'R^2']] = _scaled_values

# Relative importance of each metric in the combined score.
weights = {'RMSE': 0.4, 'MAE': 0.3, 'R^2': 0.3}

# Error metrics are inverted (1 - value) because lower is better; R^2 is used
# as-is because higher is better. The terms are summed left to right.
_rmse_term = weights['RMSE'] * (1 - metrics_df_param['RMSE'])
_mae_term = weights['MAE'] * (1 - metrics_df_param['MAE'])
_r2_term = weights['R^2'] * metrics_df_param['R^2']
metrics_df_param['Score'] = _rmse_term + _mae_term + _r2_term

In [None]:
# The winner is the tuned model with the largest weighted score.
best_model_name_param = metrics_df_param['Score'].idxmax()
best_model_metrics_param = metrics_param[best_model_name_param]
# `best_models` is keyed by the plain model name, so drop everything from the
# " (Tuned)" marker onwards before the lookup.
_plain_name = best_model_name_param.partition(" (Tuned)")[0]
best_model_instance_param = best_models[_plain_name]

# Summarise the winner's raw (un-normalised) validation metrics.
best_model_table_param = [
    ["Metric", "Value"],
    ["Best Model", best_model_name_param],
]
best_model_table_param += [
    [label, format(best_model_metrics_param[key], spec)]
    for label, key, spec in (
        ("RMSE", 'RMSE', ".3f"),
        ("MAE", 'MAE', ".3f"),
        ("R^2", 'R^2', ".6f"),
        ("Time (seconds)", 'Time', ".6f"),
    )
]
print("\nBest Model Metrics:")
print(tabulate(best_model_table_param, tablefmt="grid", headers="firstrow"))

In [None]:
# Score the best tuned model on the held-out test split.
y_test_pred_param = best_model_instance_param.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_test_param = np.sqrt(mean_squared_error(y_test, y_test_pred_param))
mae_test_param = mean_absolute_error(y_test, y_test_pred_param)
r2_test_param = r2_score(y_test, y_test_pred_param)

# Print the test set metrics
test_metrics_table_param = [
    ["Metric", "Value"],
    ["Best Model", best_model_name_param],
    ["RMSE", f"{rmse_test_param:.3f}"],
    ["MAE", f"{mae_test_param:.3f}"],
    ["R^2", f"{r2_test_param:.6f}"]
]
print("\nTest Set Metrics:")
print(tabulate(test_metrics_table_param, headers="firstrow", tablefmt="grid"))

In [None]:
# Scatter the best tuned model's test-set predictions against the true values;
# points on the dashed diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y_test, y_test_pred_param, alpha=0.6, edgecolors='w', linewidth=0.5, label='Predictions')
_diagonal = [y_test.min(), y_test.max()]
ax.plot(_diagonal, _diagonal, color='red', linestyle='--', linewidth=2, label='Perfect Prediction')
ax.set_xlabel('Actual Values', fontsize=14)
ax.set_ylabel('Predicted Values', fontsize=14)
ax.set_title(f'{best_model_name_param} Predictions vs Actual Values', fontsize=16)
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(fontsize=12)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.show()

In [None]:
# Deep Learning Model
def build_dl_model(input_dim):
    """Build and compile the feed-forward regression network.

    Architecture: Input(input_dim) -> Dense(64, relu) -> Dropout(0.2)
    -> Dense(32, relu) -> Dense(1), compiled with the Adam optimizer and
    mean-squared-error loss.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    Sequential
        The compiled, untrained Keras model.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input layer; passing
    # `input_dim=` to the first Dense layer is deprecated under Keras 3.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    # Single linear output unit for the regression target.
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
# Build and train the Deep Learning Model
# Seed TensorFlow before the model is created so weight initialisation and
# dropout start from a fixed state, using the same seed as the scikit-learn models.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reruns are required.
tf.random.set_seed(6112024)

input_dim = X_train.shape[1]
dl_model = build_dl_model(input_dim)
# NOTE(review): if the preprocessor returned a SciPy sparse matrix, Keras may
# need a dense array here — confirm against the actual X_train type.
start_time = time.perf_counter()  # monotonic timer for the training duration
dl_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=2)
end_time = time.perf_counter()

In [None]:
# Evaluate the deep learning model on the validation split.
# The network has a single output unit, so flatten the predictions to match
# the 1-D target.
y_val_pred_dl = dl_model.predict(X_val).ravel()
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_dl = np.sqrt(mean_squared_error(y_val, y_val_pred_dl))
mae_dl = mean_absolute_error(y_val, y_val_pred_dl)
r2_dl = r2_score(y_val, y_val_pred_dl)
# Stored alongside the baseline models; the comparison cells read the
# training time back from this entry.
metrics['Deep Learning Model'] = {'RMSE': rmse_dl, 'MAE': mae_dl, 'R^2': r2_dl, 'Time': end_time - start_time}
print(f'Deep Learning Model RMSE: {rmse_dl}, MAE: {mae_dl}, R^2: {r2_dl}, Time: {end_time - start_time} seconds')

In [None]:
# Evaluate the deep learning model on the test set
# The network has a single output unit, so flatten the predictions to match
# the 1-D target.
y_test_pred_dl = dl_model.predict(X_test).ravel()
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_dl_test = np.sqrt(mean_squared_error(y_test, y_test_pred_dl))
mae_dl_test = mean_absolute_error(y_test, y_test_pred_dl)
r2_dl_test = r2_score(y_test, y_test_pred_dl)
print(f'Deep Learning Model Test RMSE: {rmse_dl_test}, MAE: {mae_dl_test}, R^2: {r2_dl_test}')

In [None]:
# Prepare comparison metrics for the best base model, best hyperparameter-tuned model, and deep learning model

def _rmse_mae_r2(y_true, y_pred):
    """Return ``(RMSE, MAE, R^2)`` for one set of predictions.

    RMSE is computed as sqrt(MSE): the ``squared=False`` argument of
    ``mean_squared_error`` is deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Evaluate the best base model on the test set
y_test_pred_base = best_model_instance.predict(X_test)
rmse_test_base, mae_test_base, r2_test_base = _rmse_mae_r2(y_test, y_test_pred_base)

# Evaluate the best hyperparameter-tuned model on the test set
y_test_pred_param = best_model_instance_param.predict(X_test)
rmse_test_param, mae_test_param, r2_test_param = _rmse_mae_r2(y_test, y_test_pred_param)

# Evaluate the deep learning model on the test set (single output unit, so
# the predictions are flattened to match the 1-D target)
y_test_pred_dl = dl_model.predict(X_test).ravel()
rmse_dl_test, mae_dl_test, r2_dl_test = _rmse_mae_r2(y_test, y_test_pred_dl)

# Print the recorded training/search times that feed the comparison table
print("Best Base Model Time:", metrics[best_model_name]['Time'])
print("Best Tuned Model Time:", metrics_param[best_model_name_param]['Time'])
print("Deep Learning Model Time:", metrics['Deep Learning Model']['Time'])

In [None]:
# One row per finalist: test-set quality metrics plus the training/search time
# recorded when each finalist was fitted.
comparison_metrics = pd.DataFrame({
    'Model': [
        f'Best Base Model ({best_model_name})',
        f'Best Tuned Model ({best_model_name_param})',
        'Deep Learning Model',
    ],
    'RMSE': [rmse_test_base, rmse_test_param, rmse_dl_test],
    'MAE': [mae_test_base, mae_test_param, mae_dl_test],
    'R^2': [r2_test_base, r2_test_param, r2_dl_test],
    'Time': [
        metrics[best_model_name]['Time'],
        metrics_param[best_model_name_param]['Time'],
        metrics['Deep Learning Model']['Time'],
    ],
})

# Show the assembled frame as plain text.
print(comparison_metrics)

In [None]:
# Min-max scale the finalists' test metrics so they can be combined.
# NOTE: this rebinds `normalized_metrics` and `normalized_df`, which an earlier
# cell also defines for the baseline models; that earlier cell must be re-run
# before its results are used again.
scaler = MinMaxScaler()
normalized_metrics = scaler.fit_transform(comparison_metrics[['RMSE', 'MAE', 'R^2']])
normalized_df = pd.DataFrame(
    normalized_metrics,
    index=comparison_metrics.index,
    columns=['RMSE', 'MAE', 'R^2'],
)

# Relative importance of each metric in the combined score.
weights = {'RMSE': 0.4, 'MAE': 0.3, 'R^2': 0.3}

# Error metrics are inverted (1 - value) because lower is better; R^2 is used
# as-is because higher is better. The terms are summed left to right.
_rmse_term = weights['RMSE'] * (1 - normalized_df['RMSE'])
_mae_term = weights['MAE'] * (1 - normalized_df['MAE'])
_r2_term = weights['R^2'] * normalized_df['R^2']
normalized_df['Score'] = _rmse_term + _mae_term + _r2_term

In [None]:
# Attach the weighted score to the comparison frame (both share the same index).
comparison_metrics['Score'] = normalized_df['Score']

# Pick the row with the largest score.
# NOTE: `best_model_name` and `best_model_metrics` are rebound here to the
# overall winner's label and row; earlier cells that use `best_model_name` as a
# key into `metrics` will not work if re-run after this cell.
best_model_idx = comparison_metrics['Score'].idxmax()
best_model_metrics = comparison_metrics.loc[best_model_idx]
best_model_name = best_model_metrics['Model']

# Show all three finalists side by side.
print("\nComparison of Best Base Model, Best Tuned Model, and Deep Learning Model:")
print(tabulate(comparison_metrics, tablefmt="grid", headers="keys"))

In [None]:
# Summarise the overall winner: name first, then each metric formatted to a
# fixed number of decimals.
best_model_table = [
    ["Metric", "Value"],
    ["Best Overall Model", best_model_name],
]
best_model_table += [
    [label, format(best_model_metrics[key], spec)]
    for label, key, spec in (
        ("RMSE", 'RMSE', ".3f"),
        ("MAE", 'MAE', ".3f"),
        ("R^2", 'R^2', ".3f"),
        ("Time (seconds)", 'Time', ".4f"),
        ("Score", 'Score', ".3f"),
    )
]
print("\nBest Overall Model Metrics:")
print(tabulate(best_model_table, tablefmt="grid", headers="firstrow"))


In [None]:
# Save a tuned model to disk. DISABLED: the snippet below is wrapped in a
# string literal, so it is never executed.
# NOTE(review): the variable is named `best_rf_model`, but the lookup key
# selects the tuned 'Gradient Boosting' model — decide which model is meant.
# `pickle` is also not among the imports at the top of this notebook, so it
# must be imported before the snippet is re-enabled.
"""
best_rf_model = best_models['Gradient Boosting']
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)
"""