# In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# ============================
# 2. Set Random Seed
# ============================

def set_seed(seed):
    """Seed NumPy's global RNG and Python's `random` module with `seed`.

    Args:
        seed: Value handed unchanged to `np.random.seed` and `random.seed`.
    """
    # NumPy first, then the stdlib generator; the two are independent.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)


set_seed(42)

# ============================
# 3. Load and Inspect Data
# ============================

# Read the dataset
data_path = 'path_to_your_data.csv'  # Replace with your actual file path
data_df = pd.read_csv(data_path)

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Information:")
data_df.info()

# Display the first few rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values
if np.isnan(X).any() or np.isnan(y).any():
    print("Missing values detected. Performing imputation...")
    # Simple imputation: replace each NaN with the mean of its OWN column.
    # np.nanmean(X) without an axis is a single global mean over every feature,
    # which would fill gaps with a value unrelated to the column's scale.
    col_means = np.nanmean(X, axis=0)
    # A column with no observed values has no mean (nanmean yields NaN and
    # emits a RuntimeWarning); fill such a column with 0.0. It is constant
    # either way, so it carries no information after scaling.
    col_means = np.where(np.isnan(col_means), 0.0, col_means)
    # np.where builds a new array (col_means broadcasts across rows), so the
    # DataFrame's underlying data is not modified in place.
    X = np.where(np.isnan(X), col_means, X)
    # NOTE(review): missing targets are filled with the target mean; dropping
    # those rows may be preferable - confirm the intended handling.
    y = np.nan_to_num(y, nan=np.nanmean(y))

# Feature Scaling
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# cross-validation split further down, so validation folds contribute to the
# scaling statistics. The same fitted scaler is what gets saved for inference.
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("\nData Preprocessing Completed.")

# ============================
# 5. Define Model and Hyperparameter Search Space
# ============================

# Define the hyperparameter search space
search_space = {
    'model': ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet'],
    'alpha': [0.1, 1.0, 10.0],  # Applicable to Ridge, Lasso, ElasticNet
    'l1_ratio': [0.2, 0.5, 0.8],  # Applicable to ElasticNet
}

# Generate hyperparameter combinations.
# Every entry is ONE concrete configuration holding scalar values, e.g.
# {'model': 'Ridge', 'alpha': 1.0}. Entries must not keep list values
# ({'model': ['Ridge'], 'alpha': [...]}) because the consumer compares
# params['model'] against a string and passes params['alpha'] straight to the
# estimator; a list there never matches any model name.
param_grid = []
for model_name in search_space['model']:
    if model_name == 'LinearRegression':
        # No tunable hyperparameters: a single configuration
        param_grid.append({'model': model_name})
    elif model_name in ('Ridge', 'Lasso'):
        # One configuration per regularization strength
        param_grid.extend(
            {'model': model_name, 'alpha': alpha}
            for alpha in search_space['alpha']
        )
    elif model_name == 'ElasticNet':
        # Full cross product of alpha and l1_ratio
        param_grid.extend(
            {'model': model_name, 'alpha': alpha, 'l1_ratio': l1_ratio}
            for alpha in search_space['alpha']
            for l1_ratio in search_space['l1_ratio']
        )

# ============================
# 6. Five-Fold Cross-Validation and Model Training
# ============================

# Shuffled five-fold splitter; the fixed random_state makes the folds repeatable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One record of fold-averaged metrics per hyperparameter combination
all_results = []

print("Starting five-fold cross-validation and hyperparameter search...\n")

# Outer loop: hyperparameter combinations (wrapped in a tqdm progress bar)
for params in tqdm(param_grid, desc="Model Combinations"):
    # Maps metric name -> list of per-fold scores for this combination
    fold_metrics = {}

    # Inner loop: the five train/validation partitions
    for train_index, val_index in kf.split(X):
        # Build a fresh, unfitted estimator for this fold
        model_name = params['model']
        if model_name == 'LinearRegression':
            model = LinearRegression()
        elif model_name == 'Ridge':
            model = Ridge(alpha=params['alpha'])
        elif model_name == 'Lasso':
            model = Lasso(alpha=params['alpha'])
        elif model_name == 'ElasticNet':
            model = ElasticNet(alpha=params['alpha'], l1_ratio=params['l1_ratio'])
        else:
            raise ValueError(f"Unsupported model: {params['model']}")

        # Slice out this fold's rows
        X_train, y_train = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]

        # Fit on the training rows only, then predict both partitions
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # MSE is computed once per partition because RMSE is derived from it
        train_mse = mean_squared_error(y_train, y_train_pred)
        val_mse = mean_squared_error(y_val, y_val_pred)

        # Insertion order here fixes the column order of the results table
        fold_scores = {
            'train_mae': mean_absolute_error(y_train, y_train_pred),
            'train_mse': train_mse,
            'train_rmse': np.sqrt(train_mse),
            'train_r2': r2_score(y_train, y_train_pred),
            'val_mae': mean_absolute_error(y_val, y_val_pred),
            'val_mse': val_mse,
            'val_rmse': np.sqrt(val_mse),
            'val_r2': r2_score(y_val, y_val_pred),
        }
        for metric, score in fold_scores.items():
            fold_metrics.setdefault(metric, []).append(score)

    # Collapse the per-fold lists to their means and tag them with the parameters
    avg_metrics = {}
    for metric, scores in fold_metrics.items():
        avg_metrics[metric] = np.mean(scores)
    avg_metrics['params'] = params
    all_results.append(avg_metrics)

# ============================
# 7. Results Analysis and Saving
# ============================

# Tabulate the search: one row per hyperparameter combination
results_df = pd.DataFrame(all_results)

# Rank by mean validation R² (best first) and keep the ten highest rows
ranked_results = results_df.sort_values(by='val_r2', ascending=False)
top_results = ranked_results.head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results.loc[:, ['params', 'val_r2']])

# Write the complete, unsorted table to Excel for further analysis
results_df.to_excel('LR_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'LR_Model_AllResults.xlsx'")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Locate the row with the highest mean validation R² and read its parameters
best_index = results_df['val_r2'].idxmax()
best_result = results_df.loc[best_index]
best_params = best_result['params']
print(f"\nBest Hyperparameter Combination: {best_params}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# Build an unfitted estimator from the winning hyperparameters
best_model_name = best_params['model']
if best_model_name == 'LinearRegression':
    final_model = LinearRegression()
elif best_model_name == 'Ridge':
    final_model = Ridge(alpha=best_params['alpha'])
elif best_model_name == 'Lasso':
    final_model = Lasso(alpha=best_params['alpha'])
elif best_model_name == 'ElasticNet':
    final_model = ElasticNet(alpha=best_params['alpha'], l1_ratio=best_params['l1_ratio'])
else:
    raise ValueError(f"Unsupported model: {best_params['model']}")

# Fit on every row, then predict those same rows
final_model.fit(X, y)
y_pred = final_model.predict(X)

# In-sample scores: the model is evaluated on the data it was fitted on,
# so these describe fit quality rather than held-out performance.
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_mae = mean_absolute_error(y, y_pred)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model
# ============================

# Persist the fitted scaler first, then the final model, via joblib
for artifact, filename in ((scaler, 'LR_Scaler.pkl'), (final_model, 'LR_FinalModel.pkl')):
    joblib.dump(artifact, filename)
print("\nScaler and final model have been saved.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

# Assemble a one-row summary: the winning parameters plus the final metrics.
# Each value is wrapped in a single-element list so the table has one row.
summary_columns = ('Best Params', 'Final MAE', 'Final MSE', 'Final RMSE', 'Final R²')
summary_values = (best_params, final_mae, final_mse, final_rmse, final_r2)
results_to_save = {
    column: [value] for column, value in zip(summary_columns, summary_values)
}

# Tabulate the summary
results_summary = pd.DataFrame(results_to_save)

# Write the summary to Excel
results_summary.to_excel('LR_Model_BestResults.xlsx', index=False)
print("\nBest model results have been saved to 'LR_Model_BestResults.xlsx'")
