In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import pickle

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df=pd.read_csv('/content/drive/MyDrive/Infosys Internship/sales_final.csv')
df.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_65_rolling_mean,m_65_rolling_sum,m_66_rolling_mean,m_66_rolling_sum,m_67_rolling_mean,m_67_rolling_sum,m_68_rolling_mean,m_68_rolling_sum,m_69_rolling_mean,m_69_rolling_sum
0,154,141,113,164,174,132,108,0,0,24,...,78.0,78.0,85.0,85.0,86.0,86.0,87.0,87.0,103.0,103.0
1,54,53,65,64,66,54,25,0,0,5,...,73.0,146.0,64.5,129.0,62.5,125.0,60.5,121.0,68.5,137.0
2,82,86,85,85,72,75,39,0,0,7,...,53.666667,161.0,52.666667,158.0,48.0,144.0,50.666667,152.0,58.666667,176.0
3,54,56,57,46,46,47,47,45,35,38,...,37.0,111.0,39.0,117.0,38.333333,115.0,38.0,114.0,44.0,132.0
4,12,25,17,21,29,23,30,29,31,28,...,18.0,54.0,29.0,87.0,29.666667,89.0,32.666667,98.0,40.0,120.0


In [None]:
bool_cols = ['Foods', 'Hobbies', 'Household', 'California', 'Texas', 'Wisconsin']
feature_cols = [f'm_{i}' for i in range(1, 70)] + \
                [f'm_{i}_lag1' for i in range(1, 70)] + \
                [f'm_{i}_lag2' for i in range(1, 70)] + \
                [f'm_{i}_lag3' for i in range(1, 70)] + \
                ['month'] + bool_cols

In [None]:
X_67 = df.drop(columns=['m_67', 'm_68', 'm_69',
                        'm_68_lag1', 'm_68_lag2', 'm_68_lag3',
                        'm_69_lag1', 'm_69_lag2', 'm_69_lag3',
                        'total_sales'] +
                       [f'm_{i}_rolling_mean' for i in range(1, 70)] +
                       [f'm_{i}_rolling_sum' for i in range(1, 70)])
y_67 = df['m_67']

In [None]:
with open('/content/drive/MyDrive/Infosys Internship/optimal_features_67.pkl', 'rb') as f:
    features_67_XGBoost = pickle.load(f)
with open('/content/drive/MyDrive/Infosys Internship/LGBM/optimal_features_67_lgbm.pkl', 'rb') as f:
    features_67_LGBM = pickle.load(f)

In [None]:
X_train_67, X_test_67, y_train_67, y_test_67 = train_test_split(X_67, y_67, test_size=0.2, shuffle=False)

In [None]:
train_data = lgb.Dataset(X_train_67[features_67_LGBM], label=y_train_67)
test_data = lgb.Dataset(X_test_67[features_67_LGBM], label=y_test_67, reference=train_data)

params = {
    'objective': 'regression',  # Regression task
    'metric': 'l2',             # MSE (mean squared error) is the default metric
    'boosting_type': 'gbdt',    # Gradient Boosting Decision Tree
    'num_leaves': 31,           # Maximum number of leaves in one tree
    'learning_rate': 0.05,      # Learning rate
    'feature_fraction': 0.9     # Fraction of features to use for each tree
}
rounds = 100
model_67_LGBM = lgb.train(params, train_data, rounds, valid_sets=[test_data])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44641
[LightGBM] [Info] Number of data points in the train set: 7317, number of used features: 178
[LightGBM] [Info] Start training from score 146.489135


In [None]:
model_67_XGBoost = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    reg_lambda=1.0,  # L2 regularization
    reg_alpha=0.0      # L1 regularization
    )
model_67_XGBoost.fit(X_train_67[features_67_XGBoost], y_train_67)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
scaler = MinMaxScaler()
X_scaled_67 = scaler.fit_transform(X_67)
X_train, X_test, y_train, y_test = train_test_split(X_scaled_67, y, test_size=0.2, shuffle=False)

model = RandomForestRegressor(
    n_estimators=100,          # Number of trees
    max_depth=None,                # Maximum depth of each tree
    min_samples_split=2, # Minimum samples to split an internal node
    min_samples_leaf=1,   # Minimum samples at each leaf node
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

In [None]:
predictions_XGBoost = model_67_XGBoost.predict(X_test_67[features_67_XGBoost])
predictions_LGBM = model_67_LGBM.predict(X_test_67[features_67_LGBM], num_iteration=model_67_LGBM.best_iteration)

In [None]:
mae_67_XGBoost = mean_absolute_error(y_test_67, predictions_XGBoost)
mse_67_XGBoost = mean_squared_error(y_test_67, predictions_XGBoost)
print(f'MAE (XGBoost) for m_67: {mae_67_XGBoost}')
print(f'MSE (XGBoost) for m_67: {mse_67_XGBoost}')
mae_67_LGBM = mean_absolute_error(y_test_67, predictions_LGBM)
mse_67_LGBM = mean_squared_error(y_test_67, predictions_LGBM)
print(f'MAE (LGBM) for m_67: {mae_67_LGBM}')
print(f'MSE (LGBM) for m_67: {mse_67_LGBM}')

MAE (XGBoost) for m_67: 11.176186027780908
MSE (XGBoost) for m_67: 551.9560368900999
MAE (LGBM) for m_67: 11.712941678743299
MSE (LGBM) for m_67: 594.9046356173657


In [None]:
stacked_predictions = np.column_stack((predictions_XGBoost, predictions_LGBM))

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
meta_model = LinearRegression()
meta_model.fit(stacked_predictions, y_test_67)

In [None]:
final_predictions = meta_model.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions))
print("MSE:",mean_squared_error(y_test_67, final_predictions))

MAE: 11.3032166667347
MSE: 520.7765885157097


In [None]:
meta_model_RF = RandomForestRegressor(n_estimators=100, random_state=42)
meta_model_RF.fit(stacked_predictions, y_test_67)
final_predictions_RF = meta_model_RF.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions_RF))
print("MSE:",mean_squared_error(y_test_67, final_predictions_RF))

MAE: 4.791816302367941
MSE: 120.236988386445


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
meta_model_GBR = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05)
meta_model_GBR.fit(stacked_predictions, y_test_67)
final_predictions_GBR = meta_model_GBR.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions_GBR))
print("MSE:",mean_squared_error(y_test_67, final_predictions_GBR))

MAE: 9.54625494663339
MSE: 233.81539714929517


In [None]:
from sklearn.svm import SVR

In [None]:
meta_model_SVR = SVR(kernel='rbf', C=1.0, epsilon=0.1)
meta_model_SVR.fit(stacked_predictions, y_test_67)
final_predictions_SVR = meta_model_SVR.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions_SVR))
print("MSE:",mean_squared_error(y_test_67, final_predictions_SVR))

MAE: 22.016015854392506
MSE: 7752.123238014582


In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
meta_model_NN = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
meta_model_NN.fit(stacked_predictions, y_test_67)
final_predictions_NN = meta_model_NN.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions_NN))
print("MSE:",mean_squared_error(y_test_67, final_predictions_NN))

MAE: 11.229551579125602
MSE: 537.4897297664694


In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
meta_model_KNN = KNeighborsRegressor(n_neighbors=5)
meta_model_KNN.fit(stacked_predictions, y_test_67)
final_predictions_KNN = meta_model_KNN.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions_KNN))
print("MSE:",mean_squared_error(y_test_67, final_predictions_KNN))

MAE: 10.372021857923498
MSE: 548.7318907103826


In [None]:
meta_model_XGBoost = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05)
meta_model_XGBoost.fit(stacked_predictions, y_test_67)
final_predictions_XGBoost = meta_model_XGBoost.predict(stacked_predictions)
print("MAE:",mean_absolute_error(y_test_67, final_predictions_XGBoost))
print("MSE:",mean_squared_error(y_test_67, final_predictions_XGBoost))

MAE: 10.008990311752903
MSE: 390.1432514529642
