# Lightgbm+XgBoost Model

In [1]:
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor


In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v16.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v16.csv")

In [3]:
# Function to drop columns with more than 40% missing values, except for 'tow' in the submission set
def drop_columns_above_threshold(df, threshold=40, preserve_columns=None):
    if preserve_columns is None:
        preserve_columns = []
    
    missing_percentage = df.isna().mean() * 100
    cols_to_keep = missing_percentage[missing_percentage <= threshold].index.tolist()
    
    # Ensure columns in preserve_columns are kept even if they exceed the threshold
    cols_to_keep.extend([col for col in preserve_columns if col in df.columns])
    
    df = df[cols_to_keep]
    return df

# Applying the function to challenge_set_updated
challenge_set_updated = drop_columns_above_threshold(challenge_set_updated)

# Applying the function to submission_set_updated, keeping 'tow'
submission_set_updated = drop_columns_above_threshold(submission_set_updated, preserve_columns=['tow'])

In [4]:
def clean_data_better(df, threshold=1e10):
    # Replace inf and -inf with NaN using vectorized operations
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Mask values above the threshold with NaN directly using vectorized operations
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].mask(df[numeric_columns].abs() > threshold)
    
    # Fill NaNs using a combined approach - first forward fill, then median
    df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())
    
    return df

# Applying the improved cleaning function
challenge_set_updated = clean_data_better(challenge_set_updated)
submission_set_updated = clean_data_better(submission_set_updated)

  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())
  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())


In [5]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']

n_jobs = os.cpu_count() // 2

In [8]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Split the data into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define parameters for LightGBM
lgb_params = {
    'subsample': 1.0,
    'reg_lambda': 0.46415888336127775,  # L2 regularization
    'reg_alpha': 0.166810053720005,     # L1 regularization
    'min_child_weight': 4,
    'max_depth': 13,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'objective': 'regression',
    'random_state': 42,
    'n_estimators': 10000,              # Large number, with early stopping
    'metric': 'rmse',
    'n_jobs': -1
}

# Define parameters for XGBoost
xgb_params = {
    'subsample': 1.0,
    'reg_lambda': 0.46415888336127775,  # L2 regularization
    'reg_alpha': 0.166810053720005,     # L1 regularization
    'min_child_weight': 4,
    'max_depth': 13,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'objective': 'reg:squarederror',
    'random_state': 42,
    'n_estimators': 10000,              # Large number, with early stopping
    'eval_metric': 'rmse',
    'n_jobs': -1
}

# Initialize the models
lgb_model = LGBMRegressor(**lgb_params, early_stopping_rounds=20)
xgb_model = XGBRegressor(**xgb_params, early_stopping_rounds=20)

# Train both models with early stopping
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Update parameters with the best number of estimators found
lgb_params['n_estimators'] = lgb_model.best_iteration_
xgb_params['n_estimators'] = xgb_model.best_iteration

# Predict with both models
lgb_y_pred = lgb_model.predict(X_test)
xgb_y_pred = xgb_model.predict(X_test)

# Ensemble prediction by averaging
ensemble_pred = (lgb_y_pred + xgb_y_pred) / 2

# Evaluate ensemble model on the test set
ensemble_r2 = r2_score(y_test, ensemble_pred)
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))

print(f"Ensemble Model Performance - R^2 Score: {ensemble_r2:.4f}, RMSE: {ensemble_rmse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36673
[LightGBM] [Info] Number of data points in the train set: 236168, number of used features: 522
[LightGBM] [Info] Start training from score 79525.199947
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[10000]	valid_0's rmse: 2795.65
Ensemble Model Performance - R^2 Score: 0.9976, RMSE: 2591.6410


In [9]:
import numpy as np
import os
import pytz
from datetime import datetime

# Assuming you have already computed ensemble_r2 and ensemble_rmse

# Save R², RMSE, and hyperparameters
results = {
    'R2': float(ensemble_r2),
    'RMSE': float(ensemble_rmse),
    'LightGBM Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                                  if isinstance(value, np.floating) else value)
                            for key, value in lgb_params.items()},
    'XGBoost Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                                  if isinstance(value, np.floating) else value)
                           for key, value in xgb_params.items()}
}

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define logs directory, and create them if they don't exist
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Define file paths within the respective directories
results_file = os.path.join(logs_dir, f'ensemble_model_results_{timestamp}.txt')

# Save the results to a TXT file
with open(results_file, 'w') as file:
    file.write(f"R2: {results['R2']}\n")
    file.write(f"RMSE: {results['RMSE']}\n")
    file.write("LightGBM Parameters:\n")
    for param, value in results['LightGBM Parameters'].items():
        file.write(f"  {param}: {value}\n")
    file.write("XGBoost Parameters:\n")
    for param, value in results['XGBoost Parameters'].items():
        file.write(f"  {param}: {value}\n")

print(f"Results saved to {results_file}")


Results saved to logs/ensemble_model_results_20241018_101121.txt


In [10]:
# Display evaluation metrics for the ensemble model
print(f"Ensemble Model Performance - R^2 Score: {ensemble_r2:.4f}, RMSE: {ensemble_rmse:.4f}")


Ensemble Model Performance - R^2 Score: 0.9976, RMSE: 2591.6410


In [18]:
# Define models directory and create them if they don't exist
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Remove 'objective' from lgb_params and xgb_params if they exist to avoid duplication
lgb_params.pop('objective', None)
xgb_params.pop('objective', None)

# Initialize models with best parameters and additional arguments
lgb_final_model = LGBMRegressor(**lgb_params, objective='regression')
xgb_final_model = XGBRegressor(**xgb_params, objective='reg:squarederror')

# Train LightGBM model on the entire training + validation set data
lgb_final_model.fit(X, y)  # Ensure X and y are defined as the dataset

# Train XGBoost model on the entire training + validation set data
xgb_final_model.fit(X, y)  # Ensure X and y are defined as the dataset

print("Final models trained successfully using all available data.")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.216271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36681
[LightGBM] [Info] Number of data points in the train set: 369013, number of used features: 524
[LightGBM] [Info] Start training from score 79482.257206
Final models trained successfully using all available data.


In [19]:
import joblib
import os
from datetime import datetime
import pytz

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define file paths within the respective directories for both models
lgb_model_file = os.path.join(models_dir, f'lgb_trained_model_{timestamp}.joblib')
xgb_model_file = os.path.join(models_dir, f'xgb_trained_model_{timestamp}.joblib')

# Save the trained LightGBM model to a file in the models folder
joblib.dump(lgb_final_model, lgb_model_file)
print(f"LightGBM Model saved to {lgb_model_file}")

# Save the trained XGBoost model to a file in the models folder
joblib.dump(xgb_final_model, xgb_model_file)
print(f"XGBoost Model saved to {xgb_model_file}")


LightGBM Model saved to models/lgb_trained_model_20241018_111802.joblib
XGBoost Model saved to models/xgb_trained_model_20241018_111802.joblib


In [20]:
submission_set_updated.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,105949,105950,105951,105952,105953,105954,105955,105956,105957,105958
adep_height_1,1063.0,41.0,117.0,220.0,358.0,1688.0,42.0,1684.0,17.0,18402.0,...,193.0,638.0,53.0,12.0,87.0,938.0,25.0,975.0,311.0,311.0
adep_height_10,1288.0,491.0,667.0,595.0,808.0,1863.0,467.0,1684.0,492.0,18002.0,...,143.0,143.0,228.0,312.0,412.0,1063.0,475.0,1225.0,311.0,311.0
adep_height_2,1063.0,66.0,117.0,220.0,408.0,1688.0,67.0,1684.0,67.0,18402.0,...,193.0,713.0,78.0,37.0,87.0,988.0,25.0,975.0,311.0,311.0
adep_height_3,1063.0,141.0,217.0,295.0,458.0,1688.0,117.0,1684.0,117.0,18402.0,...,193.0,763.0,103.0,87.0,87.0,1013.0,100.0,1050.0,311.0,311.0
adep_height_4,1213.0,191.0,217.0,345.0,508.0,1688.0,167.0,1684.0,192.0,18002.0,...,193.0,763.0,128.0,87.0,212.0,1013.0,150.0,1050.0,311.0,311.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ades_geo_cluster_16,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ades_geo_cluster_17,True,False,False,False,True,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,False
ades_geo_cluster_18,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ades_geo_cluster_19,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [21]:
# Assuming 'submission_set_updated' is your full dataset including features and target
submission_set_features = submission_set_updated.iloc[:, :-1]  # Exclude the target column for prediction

# Predict with both LightGBM and XGBoost models
lgb_predictions = lgb_final_model.predict(submission_set_features)
xgb_predictions = xgb_final_model.predict(submission_set_features)

# Average the predictions from both models
ensemble_predictions = (lgb_predictions + xgb_predictions) / 2

# Add the ensemble predictions to the submission_set
submission_set_updated['tow'] = ensemble_predictions

submission_set_updated


Unnamed: 0,adep_height_1,adep_height_10,adep_height_2,adep_height_3,adep_height_4,adep_height_5,adep_height_6,adep_height_7,adep_height_8,adep_height_9,...,ades_geo_cluster_11,ades_geo_cluster_12,ades_geo_cluster_13,ades_geo_cluster_14,ades_geo_cluster_15,ades_geo_cluster_16,ades_geo_cluster_17,ades_geo_cluster_18,ades_geo_cluster_19,tow
0,1063.0,1288.0,1063.0,1063.0,1213.0,1213.0,1263.0,1288.0,1288.0,1288.0,...,False,False,False,False,False,False,True,False,False,70073.585932
1,41.0,491.0,66.0,141.0,191.0,241.0,291.0,341.0,391.0,416.0,...,False,False,False,False,False,False,False,False,False,212823.431214
2,117.0,667.0,117.0,217.0,217.0,217.0,417.0,492.0,492.0,617.0,...,False,False,True,False,False,False,False,False,False,225450.520853
3,220.0,595.0,220.0,295.0,345.0,395.0,420.0,470.0,520.0,570.0,...,False,False,False,False,False,False,False,False,True,58379.771071
4,358.0,808.0,408.0,458.0,508.0,558.0,608.0,658.0,708.0,758.0,...,False,False,False,False,False,False,True,False,False,63786.883641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105954,938.0,1063.0,988.0,1013.0,1013.0,1063.0,1063.0,1063.0,1063.0,1063.0,...,False,False,False,False,False,False,False,False,False,68863.415528
105955,25.0,475.0,25.0,100.0,150.0,150.0,275.0,325.0,400.0,425.0,...,False,False,False,False,False,False,False,False,False,177657.706982
105956,975.0,1225.0,975.0,1050.0,1050.0,1100.0,1125.0,1175.0,1200.0,1225.0,...,False,False,False,False,False,False,False,False,False,73947.149351
105957,311.0,311.0,311.0,311.0,311.0,311.0,311.0,286.0,286.0,311.0,...,False,False,False,False,False,False,False,False,False,59880.495260


In [22]:
# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)