# In [None]:
# ============================
# 1. Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib  # For saving models
from tqdm import tqdm  # For progress bars

# Import LightGBM's LGBMRegressor
try:
    from lightgbm import LGBMRegressor
except ImportError:
    print("LightGBM is not installed. Installing now...")
    import subprocess
    import sys
    # Run pip through the interpreter executing this script so the package is
    # installed into the same environment; a bare `pip` on PATH may belong to
    # a different Python installation.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])
    from lightgbm import LGBMRegressor

# ============================
# 2. Set Random Seed for Reproducibility
# ============================

def set_seed(seed):
    """
    Seed the global random number generators used by this script.

    Both Python's built-in `random` module and NumPy's legacy global RNG are
    seeded with the same value so repeated runs draw identical numbers.

    Parameters
    ----------
    seed : int
        Value handed to `random.seed` and `np.random.seed`.
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)


set_seed(42)  # Any fixed value works; 42 matches the estimator/CV random_state below

# ============================
# 3. Load and Inspect Data
# ============================

# Define the path to your CSV data file
data_path = 'path_to_your_data.csv'  # <-- Replace with your actual file path

# Load the dataset into a Pandas DataFrame
try:
    data_df = pd.read_csv(data_path)
except FileNotFoundError as err:
    # Stop with a non-zero exit status and the message on stderr. Raising
    # SystemExit directly avoids the `exit()` helper, which is injected by the
    # `site` module and exits with status 0 when called without arguments.
    raise SystemExit(f"File not found at the specified path: {data_path}") from err
else:
    print("Dataset loaded successfully.\n")

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
print("Dataset Information:")
data_df.info()

# Display the first five rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(data_df.head())

# ============================
# 4. Data Preprocessing
# ============================

# Assume the last column is the target variable (adjust if necessary)
feature_cols = data_df.columns[:-1]
target_col = data_df.columns[-1]

# Extract features and target variable as NumPy arrays
X = data_df[feature_cols].values
y = data_df[target_col].values

# Check for missing values in features and target.
# NOTE(review): np.isnan only works on numeric arrays; this assumes every
# column in the CSV is numeric - confirm for your dataset.
if np.isnan(X).any() or np.isnan(y).any():
    print("\nMissing values detected. Performing imputation...")
    # Replace each NaN with the mean of ITS OWN column. Calling np.nanmean(X)
    # without axis=0 would return a single mean over all features and mix
    # their scales. A column that is entirely NaN has no mean and stays NaN.
    feature_means = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), feature_means, X)
    # NOTE(review): filling missing targets with the mean fabricates labels;
    # dropping those rows may be preferable - kept to preserve existing behavior.
    y = np.where(np.isnan(y), np.nanmean(y), y)
    print("Missing values imputed with column means.")
else:
    print("\nNo missing values detected.")

# ============================
# 5. Define Pipeline and Hyperparameter Grid
# ============================

# Create a pipeline that first scales the data and then applies LGBMRegressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # subsample_freq=1 turns row subsampling on at every boosting iteration.
    # With LightGBM's default of 0 the `subsample` values searched below
    # would be silently ignored.
    ('lgbmregressor', LGBMRegressor(random_state=42, subsample_freq=1))
])

# Define the hyperparameter grid for GridSearchCV.
# The grid holds 4*4*3*3*3*3*4*4 = 20,736 combinations, i.e. 103,680 fits with
# 5-fold CV; trim the lists if that is too expensive.
param_grid = {
    'lgbmregressor__num_leaves': [31, 64, 128, 256],          # Maximum number of leaves per tree
    'lgbmregressor__max_depth': [10, 20, 30, 40],            # Maximum depth of the tree
    'lgbmregressor__n_estimators': [100, 200, 300],          # Number of boosting iterations
    'lgbmregressor__learning_rate': [0.01, 0.1, 0.5],        # Boosting learning rate
    'lgbmregressor__subsample': [0.6, 0.8, 1.0],             # Fraction of rows sampled for each tree
    # colsample_bytree must be a float in (0, 1]; 'auto'/'sqrt'/'log2' are
    # scikit-learn max_features options that LightGBM rejects.
    'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],      # Fraction of features sampled for each tree
    'lgbmregressor__reg_alpha': [0.0, 0.1, 0.5, 1.0],        # L1 regularization term on weights
    'lgbmregressor__reg_lambda': [0.0, 0.1, 0.5, 1.0]        # L2 regularization term on weights
}

# ============================
# 6. Five-Fold Cross-Validation and Grid Search
# ============================

# Initialize K-Fold cross-validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline and hyperparameter grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',              # Using R² as the evaluation metric
    cv=kf,
    n_jobs=-1,                 # Utilize all available CPU cores
    verbose=2,                 # Verbosity level: 0, 1, or 2
    # Train scores are off by default; the results table built afterwards
    # reads mean_train_score / std_train_score, so they must be requested.
    return_train_score=True
)

print("Starting five-fold cross-validation and grid search...\n")

# Fit GridSearchCV to the data. With the default refit=True, the best
# parameter combination is refit on all of X, y once the search finishes.
grid_search.fit(X, y)

print("\nGrid search completed.")

# ============================
# 7. Results Analysis and Saving
# ============================

# Extract all grid search results into a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)

# Map each cv_results_ column of interest to a shorter, readable name.
# Keeping selection and renaming in one mapping prevents the two lists from
# drifting apart.
column_labels = {
    'param_lgbmregressor__num_leaves': 'num_leaves',
    'param_lgbmregressor__max_depth': 'max_depth',
    'param_lgbmregressor__n_estimators': 'n_estimators',
    'param_lgbmregressor__learning_rate': 'learning_rate',
    'param_lgbmregressor__subsample': 'subsample',
    'param_lgbmregressor__colsample_bytree': 'colsample_bytree',
    'param_lgbmregressor__reg_alpha': 'reg_alpha',
    'param_lgbmregressor__reg_lambda': 'reg_lambda',
    'mean_test_score': 'mean_val_r2',
    'std_test_score': 'std_val_r2',
    'mean_train_score': 'mean_train_r2',
    'std_train_score': 'std_train_r2'
}

# Keep only the columns that actually exist. The train-score columns are
# present only when GridSearchCV was created with return_train_score=True;
# indexing them unconditionally would raise a KeyError otherwise.
selected_columns = [col for col in column_labels if col in results_df.columns]
results_selected = results_df[selected_columns].rename(columns=column_labels)

# Sort the results by mean validation R² in descending order and select top 10
top_results = results_selected.sort_values(by='mean_val_r2', ascending=False).head(10)
print("\nTop 10 Hyperparameter Combinations Based on Average Validation R²:")
print(top_results)

# Save all grid search results to an Excel file for further analysis
results_selected.to_excel('LGBM_Model_AllResults.xlsx', index=False)
print("\nAll hyperparameter search results have been saved to 'LGBM_Model_AllResults.xlsx'.")

# ============================
# 8. Extract Best Hyperparameter Combination
# ============================

# Get the best hyperparameters
best_params = grid_search.best_params_   # parameter setting with the highest mean CV score
best_score = grid_search.best_score_     # mean cross-validated R² of that setting

# Report the winning combination and its validation score
print("\nBest Hyperparameter Combination:")
print(best_params)
print(f"Best Validation R² Score: {best_score:.4f}")

# ============================
# 9. Train Final Model with Best Hyperparameters
# ============================

# Initialize the final model with the best hyperparameters
final_pipeline = grid_search.best_estimator_

# No explicit fit here: with GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the entire dataset using the best
# hyperparameters, so fitting again would only repeat that work.

# Make predictions on the entire dataset
y_pred = final_pipeline.predict(X)

# Calculate performance metrics.
# These are in-sample figures (the model saw every row during training) and
# are therefore optimistic; the cross-validated best_score_ is the estimate
# of performance on unseen data.
final_mae = mean_absolute_error(y, y_pred)
final_mse = mean_squared_error(y, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y, y_pred)

print("\nFinal Model Performance on the Entire Dataset:")
print(f"MAE: {final_mae:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

# ============================
# 10. Save the Best Model and Scaler
# ============================

# Save the pipeline (which includes both scaler and model) using joblib
model_filename = 'LGBM_FinalModel_Pipeline.pkl'
# Persist the whole fitted pipeline so the scaler and the regressor are
# restored together by a single joblib.load call.
joblib.dump(value=final_pipeline, filename=model_filename)
print(f"\nFinal model pipeline (including scaler) has been saved as '{model_filename}'.")

# ============================
# 11. Save Best Hyperparameters and Final Results
# ============================

# Create a dictionary with the best hyperparameters and final performance metrics
results_to_save = {
    # Store the parameters as text: a raw dict inside a DataFrame cell cannot
    # be written by the Excel writer, which only accepts scalar cell values.
    'Best Params': [str(best_params)],
    'Final MAE': [final_mae],
    'Final MSE': [final_mse],
    'Final RMSE': [final_rmse],
    'Final R²': [final_r2]
}

# Convert the dictionary to a single-row DataFrame
results_summary = pd.DataFrame(results_to_save)

# Save the summary to an Excel file
summary_filename = 'LGBM_Model_BestResults.xlsx'
results_summary.to_excel(summary_filename, index=False)
print(f"\nBest model results have been saved to '{summary_filename}'.")

# ============================
# 12. Conclusion
# ============================

print("\nScript execution completed successfully.")
