In [1]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import requests, zipfile, io
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression

  "Since version 1.0, "


In [3]:
#convert the datetime column to number of days since previous order
train_df['Last_order_placed_date'] = pd.to_datetime(train_df['Last_order_placed_date'])

train_df['Days_since_last_order'] = (datetime.datetime.now() - train_df['Last_order_placed_date']).dt.days

In [4]:
#drop columns 
train_df = train_df.drop(columns=['Last_order_placed_date','Customer_ID','No_of_issues_raised'])

In [5]:
#assign categorical features with unique integer values
conversion_names = {
          "Category_of_customers" : {"Active" : 2, "Inactive":0,"Passive":1},
          "Premium_membership" : {"Yes" : 1, "No":0}
}
train_df.replace(conversion_names,inplace=True)

In [6]:
X = train_df.drop('Discount_percentage',axis=1)
y = train_df['Discount_percentage']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## Baseline Models

In [8]:
#XgBoost
xgbst_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [9]:
#HistBooster
histmodel = HistGradientBoostingRegressor(learning_rate=0.09)

In [10]:
#LightGBM
lgbmodel = LGBMRegressor(learning_rate=0.09)

## AdaBoost

Adaboost model does not handle null values internally, so i have manually handled null values in columns, by replacing with median values.

In [11]:
m1,m2,m3 = np.nanmedian(X['No_of_orders_placed']),np.nanmedian(X['Coupons_offered']),np.nanmedian(X['Coupon_consumption_status'])

In [14]:
X_train['No_of_orders_placed'] = X_train['No_of_orders_placed'].fillna(m1)

X_train['Coupons_offered'] = X_train['Coupons_offered'].fillna(m2)

X_train['Coupon_consumption_status'] = X_train['Coupon_consumption_status'].fillna(m3)

For the test set, we have to replace the median values we used for train set. This is done to ensure, that the distribution remains the same

In [12]:
X_test['No_of_orders_placed'] = X_test['No_of_orders_placed'].fillna(m1)

X_test['Coupons_offered'] = X_test['Coupons_offered'].fillna(m2)

X_test['Coupon_consumption_status'] = X_test['Coupon_consumption_status'].fillna(m3)

### AdaBoost over XgBoost

In [15]:
#Build the model
adabst_xgbst = AdaBoostRegressor(base_estimator=xgbst_model,loss='square',n_estimators=70,learning_rate=0.09)

In [16]:
#train the model
adabst_xgbst.fit(X_train,y_train)

AdaBoostRegressor(base_estimator=XGBRegressor(max_depth=5,
                                              objective='reg:squarederror'),
                  learning_rate=0.09, loss='square', n_estimators=70)

In [17]:
#Predict the output on both test and train set

yhat_adabst_xgbst = adabst_xgbst.predict(X_test)

y_adabsttrain_xgbst = adabst_xgbst.predict(X_train)

In [18]:
#Predict the r2Score for both test & train
print("The score on the test set is:",r2_score(y_test,yhat_adabst_xgbst)*100)

print("The score on the train set is:",r2_score(y_train,y_adabsttrain_xgbst)*100)

The score on the test set is: 53.32382779966927
The score on the train set is: 57.758718282483336


## AdaBoost over HistBooster

In [19]:
#Build the model
adabst_histbst = AdaBoostRegressor(base_estimator=histmodel,loss='square',n_estimators=60,learning_rate=0.1)

In [20]:
#train the model
adabst_histbst.fit(X_train,y_train)

AdaBoostRegressor(base_estimator=HistGradientBoostingRegressor(learning_rate=0.09),
                  learning_rate=0.1, loss='square', n_estimators=60)

In [21]:
#Predict the output on both test and train set

yhat_adabst_histbst = adabst_histbst.predict(X_test)

y_adabsttrain_histbst = adabst_histbst.predict(X_train)

In [22]:
#Predict the r2Score for both test & train
print("The score on the test set is:",r2_score(y_test,yhat_adabst_histbst)*100)

print("The score on the train set is:",r2_score(y_train,y_adabsttrain_histbst)*100)

The score on the test set is: 53.43940073096925
The score on the train set is: 59.44264178081149


### AdaBoost over LightGBM

In [23]:
#Build the model
adabst_lgbbst = AdaBoostRegressor(base_estimator=lgbmodel,loss='square',n_estimators=60,learning_rate=0.1)

In [24]:
#train the model
adabst_lgbbst.fit(X_train,y_train)

AdaBoostRegressor(base_estimator=LGBMRegressor(learning_rate=0.09),
                  learning_rate=0.1, loss='square', n_estimators=60)

In [25]:
#Predict the output on both test and train set

yhat_adabst_lgbbst = adabst_lgbbst.predict(X_test)

y_adabsttrain_lgbbst = adabst_lgbbst.predict(X_train)

In [26]:
#Predict the r2Score for both test & train
print("The score on the test set is:",r2_score(y_test,yhat_adabst_lgbbst)*100)

print("The score on the train set is:",r2_score(y_train,y_adabsttrain_lgbbst)*100)

The score on the test set is: 53.46232430985383
The score on the train set is: 59.54197315738573


## AdaBoost with Stacking Regressor

In [28]:
level0 = list()

level0.append((
    'adabst_xgbst',
    AdaBoostRegressor(base_estimator=xgbst_model,loss='square',n_estimators=70,learning_rate=0.09)
    ))

level0.append((
    'adabst_histbst',
    AdaBoostRegressor(base_estimator=histmodel,loss='square',n_estimators=60,learning_rate=0.1)
    ))

level0.append((
    'adabst_lgbbst',
    AdaBoostRegressor(base_estimator=lgbmodel,loss='square',n_estimators=60,learning_rate=0.1)
    ))

level1 = LinearRegression()

In [29]:
#Build the model
meta_model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

In [30]:
#train the model
meta_model.fit(X_train,y_train)

StackingRegressor(cv=5,
                  estimators=[('adabst_xgbst',
                               AdaBoostRegressor(base_estimator=XGBRegressor(max_depth=5,
                                                                             objective='reg:squarederror'),
                                                 learning_rate=0.09,
                                                 loss='square',
                                                 n_estimators=70)),
                              ('adabst_histbst',
                               AdaBoostRegressor(base_estimator=HistGradientBoostingRegressor(learning_rate=0.09),
                                                 learning_rate=0.1,
                                                 loss='square',
                                                 n_estimators=60)),
                              ('adabst_lgbbst',
                               AdaBoostRegressor(base_estimator=LGBMRegressor(learning_rate=0.09),
                   

In [31]:
#Predict the output on both test and train set

y_pred = meta_model.predict(X_test)

ylrtrain_pred = meta_model.predict(X_train)

In [32]:
#Predict the final r2Score for both test & train
print("The score on the test set is:",r2_score(y_test,y_pred)*100)

print("The score on the train set is:",r2_score(y_train,ylrtrain_pred)*100)

The score on the test set is: 53.48150362663204
The score on the train set is: 59.331059275530286
