# Lightgbm+XgBoost Model

In [1]:
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor


In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v16.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v16.csv")

In [3]:
# Function to drop columns with more than 40% missing values, except for 'tow' in the submission set
def drop_columns_above_threshold(df, threshold=40, preserve_columns=None):
    if preserve_columns is None:
        preserve_columns = []
    
    missing_percentage = df.isna().mean() * 100
    cols_to_keep = missing_percentage[missing_percentage <= threshold].index.tolist()
    
    # Ensure columns in preserve_columns are kept even if they exceed the threshold
    cols_to_keep.extend([col for col in preserve_columns if col in df.columns])
    
    df = df[cols_to_keep]
    return df

# Applying the function to challenge_set_updated
challenge_set_updated = drop_columns_above_threshold(challenge_set_updated)

# Applying the function to submission_set_updated, keeping 'tow'
submission_set_updated = drop_columns_above_threshold(submission_set_updated, preserve_columns=['tow'])

In [4]:
def clean_data_better(df, threshold=1e10):
    # Replace inf and -inf with NaN using vectorized operations
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Mask values above the threshold with NaN directly using vectorized operations
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].mask(df[numeric_columns].abs() > threshold)
    
    # Fill NaNs using a combined approach - first forward fill, then median
    df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())
    
    return df

# Applying the improved cleaning function
challenge_set_updated = clean_data_better(challenge_set_updated)
submission_set_updated = clean_data_better(submission_set_updated)

  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())
  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())


In [5]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']

n_jobs = os.cpu_count() // 2

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Split the data into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define parameters for LightGBM
lgb_params = {
    'subsample': 1.0,
    'reg_lambda': 0.46415888336127775,  # L2 regularization
    'reg_alpha': 0.166810053720005,     # L1 regularization
    'min_child_weight': 4,
    'max_depth': 13,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'objective': 'regression',
    'random_state': 42,
    'n_estimators': 10000,              # Large number, with early stopping
    'metric': 'rmse',
    'n_jobs': -1
}

# Define parameters for XGBoost
xgb_params = {
    'subsample': 1.0,
    'reg_lambda': 0.46415888336127775,  # L2 regularization
    'reg_alpha': 0.166810053720005,     # L1 regularization
    'min_child_weight': 4,
    'max_depth': 13,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'objective': 'reg:squarederror',
    'random_state': 42,
    'n_estimators': 10000,              # Large number, with early stopping
    'eval_metric': 'rmse',
    'n_jobs': -1
}

# Initialize the models
lgb_model = LGBMRegressor(**lgb_params, early_stopping_rounds=20)
xgb_model = XGBRegressor(**xgb_params)

# Train both models with early stopping
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=20, verbose=False)

# Update parameters with the best number of estimators found
lgb_params['n_estimators'] = lgb_model.best_iteration_
xgb_params['n_estimators'] = xgb_model.best_iteration

# Predict with both models
lgb_y_pred = lgb_model.predict(X_test)
xgb_y_pred = xgb_model.predict(X_test)

# Ensemble prediction by averaging
ensemble_pred = (lgb_y_pred + xgb_y_pred) / 2

# Evaluate ensemble model on the test set
ensemble_r2 = r2_score(y_test, ensemble_pred)
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))

print(f"Ensemble Model Performance - R^2 Score: {ensemble_r2:.4f}, RMSE: {ensemble_rmse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.200355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36673
[LightGBM] [Info] Number of data points in the train set: 236168, number of used features: 522
[LightGBM] [Info] Start training from score 79525.199947
Training until validation scores don't improve for 20 rounds


In [None]:
import numpy as np
import os
import pytz
from datetime import datetime

# Assuming you have already computed ensemble_r2 and ensemble_rmse

# Save R², RMSE, and hyperparameters
results = {
    'R2': float(ensemble_r2),
    'RMSE': float(ensemble_rmse),
    'LightGBM Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                                  if isinstance(value, np.floating) else value)
                            for key, value in lgb_params.items()},
    'XGBoost Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                                  if isinstance(value, np.floating) else value)
                           for key, value in xgb_params.items()}
}

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define logs directory, and create them if they don't exist
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Define file paths within the respective directories
results_file = os.path.join(logs_dir, f'ensemble_model_results_{timestamp}.txt')

# Save the results to a TXT file
with open(results_file, 'w') as file:
    file.write(f"R2: {results['R2']}\n")
    file.write(f"RMSE: {results['RMSE']}\n")
    file.write("LightGBM Parameters:\n")
    for param, value in results['LightGBM Parameters'].items():
        file.write(f"  {param}: {value}\n")
    file.write("XGBoost Parameters:\n")
    for param, value in results['XGBoost Parameters'].items():
        file.write(f"  {param}: {value}\n")

print(f"Results saved to {results_file}")


In [None]:
# Display evaluation metrics for the ensemble model
print(f"Ensemble Model Performance - R^2 Score: {ensemble_r2:.4f}, RMSE: {ensemble_rmse:.4f}")


In [None]:
# Define models directory and create them if they don't exist
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Initialize LightGBM and XGBoost models with final best parameters
lgb_final_model = LGBMRegressor(**lgb_params, objective='regression', random_state=42)
xgb_final_model = XGBRegressor(**xgb_params, objective='reg:squarederror', random_state=42)

# Train LightGBM model on the entire training + validation set data
lgb_final_model.fit(X, y)

# Train XGBoost model on the entire training + validation set data
xgb_final_model.fit(X, y)

print("Final models trained successfully using all available data.")

In [None]:
import joblib
import os
from datetime import datetime
import pytz

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define file paths within the respective directories for both models
lgb_model_file = os.path.join(models_dir, f'lgb_trained_model_{timestamp}.joblib')
xgb_model_file = os.path.join(models_dir, f'xgb_trained_model_{timestamp}.joblib')

# Save the trained LightGBM model to a file in the models folder
joblib.dump(lgb_final_model, lgb_model_file)
print(f"LightGBM Model saved to {lgb_model_file}")

# Save the trained XGBoost model to a file in the models folder
joblib.dump(xgb_final_model, xgb_model_file)
print(f"XGBoost Model saved to {xgb_model_file}")


In [None]:
submission_set_updated.T

In [None]:
# Assuming 'submission_set_updated' is your full dataset including features and target
submission_set_features = submission_set_updated.iloc[:, :-1]  # Exclude the target column for prediction

# Predict with both LightGBM and XGBoost models
lgb_predictions = lgb_final_model.predict(submission_set_features)
xgb_predictions = xgb_final_model.predict(submission_set_features)

# Average the predictions from both models
ensemble_predictions = (lgb_predictions + xgb_predictions) / 2

# Add the ensemble predictions to the submission_set
submission_set_updated['tow'] = ensemble_predictions

submission_set_updated


In [15]:
# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)