In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pickle
import time

# Read the source CSV into a DataFrame
file_path = '/mnt/data/data_modified.csv'
data = pd.read_csv(file_path)

# Preview the raw data (label and frame are printed on separate lines)
print("Dataset head:", data.head(), sep="\n")

# Count the missing entries in every column and report them
missing_values = data.isna().sum()
print("\nMissing values:", missing_values, sep="\n")

# Keep things simple: discard every row that contains at least one missing value
data = data.dropna(axis=0, how='any')

# Names of the object-typed columns in the cleaned frame
categorical_cols = data.select_dtypes(include='object').columns

# Preview the data again after cleaning
print("\nCleaned dataset head:", data.head(), sep="\n")

# Define features and target
X = data.drop('income', axis=1)
y = data['income']

# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines for numerical and categorical data
num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines into a single ColumnTransformer.
# sparse_threshold=0 forces a dense array: the one-hot encoder emits sparse
# output, and a sparse matrix cannot be wrapped in a column-labelled DataFrame.
preprocessor = ColumnTransformer(
    [
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    sparse_threshold=0,
)

# Split the raw rows FIRST (70% train / 15% validation / 15% test) so that the
# scaler, encoder, feature filter and PCA below are fitted on training rows
# only; fitting them on the full dataset leaks validation/test information.
X_train_raw, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the preprocessor on the training rows, then apply it unchanged to the
# validation and test rows.
X_processed = preprocessor.fit_transform(X_train_raw)
feature_names = preprocessor.get_feature_names_out()

# Keep the original row labels so the frames stay aligned with y_train / y_val /
# y_test (rows were dropped earlier, so the labels are not a clean 0..n-1 range).
X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X_train_raw.index)
X_val_df = pd.DataFrame(preprocessor.transform(X_val_raw), columns=feature_names, index=X_val_raw.index)
X_test_df = pd.DataFrame(preprocessor.transform(X_test_raw), columns=feature_names, index=X_test_raw.index)

# Feature correlation analysis (training rows only)
correlation_matrix = X_processed_df.corr().abs()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

# Drop features with low correlation to the target (assuming a threshold, e.g., 0.1).
# The correlation is taken against y_train itself; constant columns produce NaN
# and are treated as uncorrelated.
# NOTE(review): this assumes 'income' is numeric, as the regressors below require.
threshold = 0.1
target_correlation = X_processed_df.corrwith(y_train).abs().fillna(0.0)
low_correlation_features = target_correlation[target_correlation < threshold].index.tolist()
if len(low_correlation_features) == len(target_correlation):
    # Removing every column would leave PCA with nothing to fit.
    print(f"No feature reaches |corr| >= {threshold} with the target; keeping all features.")
    low_correlation_features = []
X_processed_df.drop(columns=low_correlation_features, inplace=True)
X_val_df = X_val_df.drop(columns=low_correlation_features)
X_test_df = X_test_df.drop(columns=low_correlation_features)

# Apply PCA for feature reduction
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)  # Retain 95% of variance (measured on the training rows)
X_pca = pca.fit_transform(X_processed_df)

# Final model inputs: the train-fitted projection applied to each split
X_train = X_pca
X_val = pca.transform(X_val_df)
X_test = pca.transform(X_test_df)

# Initialize models (default hyperparameters; seeded where the estimator accepts a seed)
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'XGBoost': XGBRegressor(random_state=42),
    'MLP Regressor': MLPRegressor(random_state=42)
}

# Train models and evaluate on the validation set.
# `metrics` maps a model label to its validation RMSE / MAE / R^2 and fit time.
metrics = {}
for name, model in models.items():
    # Only the fit call is timed; prediction and scoring are excluded.
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    y_val_pred = model.predict(X_val)
    # RMSE is derived from the MSE explicitly: the `squared=False` shortcut is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    metrics[name] = {'RMSE': rmse, 'MAE': mae, 'R^2': r2, 'Time': end_time - start_time}
    print(f'{name} RMSE: {rmse}, MAE: {mae}, R^2: {r2}, Time: {end_time - start_time} seconds')

# Hyperparameter search spaces, keyed by the same labels used in `models`.
# Every list holds the candidate values tried for that estimator argument.
param_grids = {
    'Random Forest': dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    'Gradient Boosting': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 4, 5],
    ),
    'XGBoost': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 4, 5],
    ),
    # NOTE(review): MLPRegressor's `learning_rate` schedule appears to apply only
    # to the 'sgd' solver -- confirm it has any effect with the default solver.
    'MLP Regressor': dict(
        hidden_layer_sizes=[(50, 50), (100, 50), (100, 100)],
        activation=['relu', 'tanh'],
        learning_rate=['constant', 'adaptive'],
    ),
}

# Function to perform Grid Search CV
def perform_grid_search(model, param_grid, X_train, y_train, *,
                        cv=3, scoring='neg_mean_squared_error',
                        n_jobs=-1, verbose=2):
    """Run an exhaustive cross-validated grid search and return the winner.

    Args:
        model: Estimator to tune; passed to ``GridSearchCV`` as ``estimator``.
        param_grid: Mapping of parameter name to the list of values to try.
        X_train: Training feature matrix.
        y_train: Training targets.
        cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 3).
        scoring: Scoring name forwarded to ``GridSearchCV``. With the default
            ``'neg_mean_squared_error'`` the reported score is a negated MSE,
            so values closer to zero are better.
        n_jobs: Parallelism setting forwarded to ``GridSearchCV`` (default -1).
        verbose: Verbosity level forwarded to ``GridSearchCV`` (default 2).

    Returns:
        Tuple ``(best_estimator, best_params, best_score)`` taken from the
        fitted search object.
    """
    grid_search = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               cv=cv,
                               scoring=scoring,
                               n_jobs=n_jobs,
                               verbose=verbose)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

# Perform Grid Search CV for selected models.
# The models to tune are exactly those that have an entry in `param_grids`, so
# the list of names is not duplicated here. `best_models` maps each label to
# the best estimator returned by the search.
best_models = {}
for model_name, param_grid in param_grids.items():
    print(f"\nTuning {model_name}...")
    model = models[model_name]
    # The timing covers the complete grid search for this model.
    start_time = time.time()
    best_model, best_params, best_score = perform_grid_search(model, param_grid, X_train, y_train)
    end_time = time.time()
    best_models[model_name] = best_model
    y_val_pred = best_model.predict(X_val)
    # RMSE is derived from the MSE explicitly: the `squared=False` shortcut is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    metrics[f'{model_name} (Tuned)'] = {'RMSE': rmse, 'MAE': mae, 'R^2': r2, 'Time': end_time - start_time}
    print(f'Best parameters: {best_params}')
    print(f'Best score: {best_score}')
    print(f'{model_name} (Tuned) RMSE: {rmse}, MAE: {mae}, R^2: {r2}, Time: {end_time - start_time} seconds')

# Deep Learning Model
def build_dl_model(input_dim):
    """Build and compile a small fully connected regression network.

    The stack is Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) -> Dense(1),
    compiled with the 'adam' optimizer and a mean-squared-error loss.

    Args:
        input_dim: Number of input features fed to the first layer.

    Returns:
        The compiled Sequential model (not yet trained).
    """
    layer_stack = [
        Dense(64, input_dim=input_dim, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1),  # single output unit, no activation specified
    ]
    network = Sequential(layer_stack)
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

# Build and train the deep learning model
input_dim = X_train.shape[1]  # one input per column of the training matrix
dl_model = build_dl_model(input_dim)
start_time = time.time()
dl_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=2)
end_time = time.time()

# Evaluate the deep learning model on the validation set.
# The network ends in a single-unit layer, so predictions come back with a
# trailing axis of length 1; flatten them to line up with the 1-D targets.
y_val_pred_dl = dl_model.predict(X_val).ravel()
# RMSE is derived from the MSE explicitly: the `squared=False` shortcut is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_dl = np.sqrt(mean_squared_error(y_val, y_val_pred_dl))
mae_dl = mean_absolute_error(y_val, y_val_pred_dl)
r2_dl = r2_score(y_val, y_val_pred_dl)
metrics['Deep Learning Model'] = {'RMSE': rmse_dl, 'MAE': mae_dl, 'R^2': r2_dl, 'Time': end_time - start_time}
print(f'Deep Learning Model RMSE: {rmse_dl}, MAE: {mae_dl}, R^2: {r2_dl}, Time: {end_time - start_time} seconds')

# Evaluate the tuned models on the test set.
# Only the estimators stored in `best_models` are scored here.
for name, model in best_models.items():
    y_test_pred = model.predict(X_test)
    # RMSE is derived from the MSE explicitly: the `squared=False` shortcut is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae = mean_absolute_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    print(f'{name} Test RMSE: {rmse}, MAE: {mae}, R^2: {r2}')

# Evaluate the deep learning model on the test set.
# Predictions are flattened so they line up with the 1-D targets.
y_test_pred_dl = dl_model.predict(X_test).ravel()
rmse_dl_test = np.sqrt(mean_squared_error(y_test, y_test_pred_dl))
mae_dl_test = mean_absolute_error(y_test, y_test_pred_dl)
r2_dl_test = r2_score(y_test, y_test_pred_dl)
print(f'Deep Learning Model Test RMSE: {rmse_dl_test}, MAE: {mae_dl_test}, R^2: {r2_dl_test}')

# Save the best model (Random Forest in this case)
# NOTE(review): only the estimator is pickled. It was trained on the
# PCA-projected matrix, so the fitted `preprocessor` and `pca` objects are also
# needed to score new raw rows -- confirm whether they should be saved too.
best_rf_model = best_models['Random Forest']
with open('/mnt/data/best_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

# Display the collected metrics (one row per model)
metrics_df = pd.DataFrame(metrics).T
try:
    # `ace_tools` may not be installed in every environment; without this guard
    # the script would fail on its last line after all training has finished.
    import ace_tools as tools
except ImportError:
    print("\nModel Performance Metrics:")
    print(metrics_df)
else:
    tools.display_dataframe_to_user(name="Model Performance Metrics", dataframe=metrics_df)
