In [None]:
import pandas as pd

In [None]:
pip install lightgbm

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error


In [None]:
# Read the training CSV into `df`, then show its first rows as the cell output
df = pd.read_csv(filepath_or_buffer='C:\\Users\\17635\\Downloads\\train_data.csv')

df.head()

In [None]:
# Training set (read from train_data.csv)
train_data = pd.read_csv(filepath_or_buffer='C:\\Users\\17635\\Downloads\\train_data.csv')
# Test set (read from test_data.csv)
test_data = pd.read_csv(filepath_or_buffer='C:\\Users\\17635\\Downloads\\test_data.csv')

In [None]:
# Target: the 'call_counts' column of the raw training frame
y_train = train_data['call_counts']
# Features: every other column of the raw training frame
X_train = train_data.drop('call_counts', axis=1)

# Data Encoding 

In [None]:
import statsmodels.api as sm

# Columns treated as categorical; each one is expanded into indicator columns below
categorical_cols = [
    'acq_method', 'bi_limit_group', 'channel', 'geo_group',
    'household_group', 'pay_type_code', 'prdct_sbtyp_grp', 'product_sbtyp',
]

# Name of the response column
dependent_variable = 'call_counts'

# One-hot encode (dropping the first level of every categorical column),
# then cast every column to float so the indicator columns become 0.0 / 1.0
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True).astype(float)

# Response vector
y = df_encoded[dependent_variable]

# Design matrix: all remaining columns, with an intercept column added by statsmodels
X = sm.add_constant(df_encoded.drop(columns=[dependent_variable]))

# Sanity check of the resulting dtypes
print(X.dtypes)
print(y.dtypes)

# LightGBM Model w/ Grid Search Optimization 

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Locations of the training and test CSV files
train_data_path, test_data_path = (
    'C:\\Users\\17635\\Downloads\\train_data.csv',
    'C:\\Users\\17635\\Downloads\\test_data.csv',
)

# Parse each file into its own DataFrame
train_data = pd.read_csv(filepath_or_buffer=train_data_path)
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

# Categorical columns that need to be encoded (the same columns are expected in train and test)
categorical_cols = ['acq_method', 'bi_limit_group', 'channel', 'geo_group',
                    'household_group', 'pay_type_code', 'prdct_sbtyp_grp', 'product_sbtyp']

# Pin the category levels of every categorical column to those seen in the TRAINING data.
# Encoding train and test independently with drop_first=True lets each frame drop its own
# first level; if the test set lacks the training baseline level, a different level is
# dropped and the affected test rows are silently read as the baseline.
# With shared levels both frames produce the same dummy columns and drop the same baseline.
# Test values never seen in training become NaN and therefore get all-zero dummies.
train_levels = {
    col: pd.CategoricalDtype(pd.Categorical(train_data[col]).categories)
    for col in categorical_cols
}

# One-hot encode both frames against the shared training levels
train_data_encoded = pd.get_dummies(
    train_data.astype(train_levels), columns=categorical_cols, drop_first=True
)
test_data_encoded = pd.get_dummies(
    test_data.astype(train_levels), columns=categorical_cols, drop_first=True
)

# Give the test frame exactly the training columns, in the same order.
# Columns present only in train (e.g. the target) are created in test and filled with 0.
train_data_encoded, test_data_encoded = train_data_encoded.align(
    test_data_encoded, join='left', axis=1, fill_value=0
)

# Cast every column (including the indicator columns) to float in both frames
train_data_encoded = train_data_encoded.astype(float)
test_data_encoded = test_data_encoded.astype(float)

# Separate features and target for training
X_train = train_data_encoded.drop(columns=['call_counts'])
y_train = train_data_encoded['call_counts']

# Test features: drop the target column (real or the zero-filled placeholder created by align)
X_test = test_data_encoded.drop(columns=['call_counts'])


# Baseline: LightGBM regressor with default hyper-parameters, fitted on the full training set
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)

# Baseline predictions for the test set
y_test_pred = model.predict(X_test)

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidates
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'num_leaves': [20, 31, 40]
}

# Exhaustive search, each candidate scored by 5-fold cross-validated negative MSE
grid_search = GridSearchCV(
    estimator=lgb.LGBMRegressor(),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)

# Run the search on the training data
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

# GridSearchCV (refit=True by default) has already refitted the best configuration on the
# full training set, so reuse that estimator instead of fitting the same model a second time.
final_model = grid_search.best_estimator_

# Predictions of the tuned model for the test set
y_test_final_pred = final_model.predict(X_test)

# Out-of-fold estimate: best_score_ is the mean negative MSE over the CV folds
cv_rmse = np.sqrt(-grid_search.best_score_)
print(f'Cross-validated RMSE (5-fold): {cv_rmse}')

# In-sample metrics: the model is scored on the rows it was fitted on, so these values
# are optimistic and should not be read as generalization performance.
y_train_pred = final_model.predict(X_train)  # computed once, reused by all three metrics
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE
r2 = r2_score(y_train, y_train_pred)

# Print the metrics
print('In-sample (training-set) metrics:')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')



# LightGBM w/ Outlier Handling & Grid Search

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


# Locations of the training and test CSV files
train_data_path, test_data_path = (
    'C:\\Users\\17635\\Downloads\\train_data.csv',
    'C:\\Users\\17635\\Downloads\\test_data.csv',
)

# Parse each file into its own DataFrame
train_data = pd.read_csv(filepath_or_buffer=train_data_path)
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

# Categorical columns that need to be encoded (the same columns are expected in train and test)
categorical_cols = ['acq_method', 'bi_limit_group', 'channel', 'geo_group',
                    'household_group', 'pay_type_code', 'prdct_sbtyp_grp', 'product_sbtyp']

# Pin the category levels of every categorical column to those seen in the TRAINING data.
# Encoding train and test independently with drop_first=True lets each frame drop its own
# first level; if the test set lacks the training baseline level, a different level is
# dropped and the affected test rows are silently read as the baseline.
# With shared levels both frames produce the same dummy columns and drop the same baseline.
# Test values never seen in training become NaN and therefore get all-zero dummies.
train_levels = {
    col: pd.CategoricalDtype(pd.Categorical(train_data[col]).categories)
    for col in categorical_cols
}

# One-hot encode both frames against the shared training levels
train_data_encoded = pd.get_dummies(
    train_data.astype(train_levels), columns=categorical_cols, drop_first=True
)
test_data_encoded = pd.get_dummies(
    test_data.astype(train_levels), columns=categorical_cols, drop_first=True
)

# Give the test frame exactly the training columns, in the same order.
# Columns present only in train (e.g. the target) are created in test and filled with 0.
train_data_encoded, test_data_encoded = train_data_encoded.align(
    test_data_encoded, join='left', axis=1, fill_value=0
)

# Step 1: Cast every column to float in both frames, so the quantile/clip arithmetic
# below operates on floats rather than on boolean indicator columns.
train_data_encoded_clamped = train_data_encoded.astype(float)
test_data_encoded_clamped = test_data_encoded.astype(float)

# Step 2: Cap/clamp outliers, but only in columns that were numeric in the raw training
# data and take more than two distinct values.
# Clipping one-hot / binary columns at the 5th-95th percentiles is destructive: an indicator
# that is set in fewer than 5% of rows has a 95th percentile of 0 and would be zeroed out
# entirely (and one set in more than 95% of rows would be forced to 1).
# The target ('call_counts') is numeric in the raw data, so it is still capped as before.
clip_cols = [
    col for col in train_data.select_dtypes(include='number').columns
    if col not in categorical_cols
    and col in train_data_encoded_clamped.columns
    and train_data[col].nunique() > 2
]

if clip_cols:
    # Caps are derived from the training data only, then applied to both frames
    lower_cap = train_data_encoded_clamped[clip_cols].quantile(0.05)  # 5th percentile
    upper_cap = train_data_encoded_clamped[clip_cols].quantile(0.95)  # 95th percentile

    train_data_encoded_clamped[clip_cols] = train_data_encoded_clamped[clip_cols].clip(
        lower=lower_cap, upper=upper_cap, axis=1
    )
    test_data_encoded_clamped[clip_cols] = test_data_encoded_clamped[clip_cols].clip(
        lower=lower_cap, upper=upper_cap, axis=1
    )
else:
    # Nothing to clip; keep the cap variables defined for any later inspection
    lower_cap = upper_cap = pd.Series(dtype=float)

# Step 3: Separate features and target from the training data
X_train = train_data_encoded_clamped.drop(columns=['call_counts'])
y_train = train_data_encoded_clamped['call_counts']

# Test features: drop the target column (real or the zero-filled placeholder created by align)
X_test = test_data_encoded_clamped.drop(columns=['call_counts'])

# Baseline: LightGBM regressor with default hyper-parameters, fitted on the full training set
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)

# Baseline predictions for the test set
y_test_pred = model.predict(X_test)

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidates
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'num_leaves': [20, 31, 40]
}

# Exhaustive search, each candidate scored by 5-fold cross-validated negative MSE
grid_search = GridSearchCV(
    estimator=lgb.LGBMRegressor(),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)

# Run the search on the training data
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

# GridSearchCV (refit=True by default) has already refitted the best configuration on the
# full training set, so reuse that estimator instead of fitting the same model a second time.
final_model = grid_search.best_estimator_

# Predictions of the tuned model for the test set
y_test_final_pred = final_model.predict(X_test)

# Out-of-fold estimate: best_score_ is the mean negative MSE over the CV folds
cv_rmse = np.sqrt(-grid_search.best_score_)
print(f'Cross-validated RMSE (5-fold): {cv_rmse}')

# In-sample metrics: the model is scored on the rows it was fitted on, so these values
# are optimistic. In this section y_train is the capped target, so the errors are measured
# against capped values and are not directly comparable with metrics on the raw target.
y_train_pred = final_model.predict(X_train)  # computed once, reused by all three metrics
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE
r2 = r2_score(y_train, y_train_pred)

# Print the metrics
print('In-sample (training-set) metrics:')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')
