In [1]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import requests, zipfile, io
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

  "Since version 1.0, "


In [3]:
# Parse the last-order timestamps, then derive a recency feature:
# whole days elapsed between each order and the moment this cell runs.
# NOTE(review): anchoring on datetime.now() makes Days_since_last_order depend on the
# run date; consider a fixed reference date shared by every dataset fed to the models.
order_dates = pd.to_datetime(train_df['Last_order_placed_date'])
train_df['Last_order_placed_date'] = order_dates
train_df['Days_since_last_order'] = (datetime.datetime.now() - order_dates).dt.days

In [4]:
# Discard the raw order date (its information now lives in Days_since_last_order)
# together with the customer identifier and the issue count, so none of them
# end up in the feature matrix.
train_df = train_df.drop(
    ['Last_order_placed_date', 'Customer_ID', 'No_of_issues_raised'],
    axis=1,
)

In [5]:
# Integer codes for the two categorical columns, keyed by column name.
conversion_names = {
    "Category_of_customers": {"Active": 2, "Inactive": 0, "Passive": 1},
    "Premium_membership": {"Yes": 1, "No": 0},
}

# Encode one column at a time with Series.map rather than
# DataFrame.replace(..., inplace=True): replace() relies on silently downcasting
# the resulting object column to integers, which pandas has deprecated, and
# in-place mutation hides state changes between cells.
# Values without a code (including NaN) are passed through unchanged, so the
# result matches replace() and re-running the cell leaves encoded data intact.
for column_name, codes in conversion_names.items():
    train_df[column_name] = train_df[column_name].map(
        lambda value, codes=codes: codes.get(value, value)
    )

In [6]:
# Target is the discount percentage; every other remaining column is a feature.
y = train_df['Discount_percentage']
X = train_df.drop(columns=['Discount_percentage'])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## XGBoost

In [8]:
# Gradient-boosted tree regressor with every hyperparameter spelled out.
# The deprecated aliases silent / nthread / seed (all previously passed as None)
# are dropped: verbosity, n_jobs and random_state below are their replacements.
xgbst_model = XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=6,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    objective='reg:squarederror',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)

In [9]:
# Fit XGBoost on the training split (the fitted estimator is the cell output).
xgbst_model.fit(X=X_train, y=y_train)

XGBRegressor(max_depth=6, objective='reg:squarederror')

In [10]:
# XGBoost predictions for the held-out split first, then for the training split.
yhat_xgbstval, yhat_xgbsttrain = [
    xgbst_model.predict(features) for features in (X_test, X_train)
]

In [11]:
# Report R^2 (as a percentage) for XGBoost on the test and train splits.
print(
    "The score on the test set is:",
    100 * r2_score(y_true=y_test, y_pred=yhat_xgbstval),
)

print(
    "The score on the train set is:",
    100 * r2_score(y_true=y_train, y_pred=yhat_xgbsttrain),
)

The score on the test set is: 53.317353006183765
The score on the train set is: 58.215680670064465


## HistBooster

In [12]:
# Histogram-based gradient boosting regressor.
# random_state is pinned because, on large datasets, the estimator draws a random
# validation split for automatic early stopping (and subsamples rows for binning);
# without a seed those runs are not repeatable. On small datasets it has no effect.
histmodel = HistGradientBoostingRegressor(learning_rate=0.09, random_state=0)

In [13]:
# Fit the histogram-based booster on the training split (fitted estimator is the cell output).
histmodel.fit(X=X_train, y=y_train)

HistGradientBoostingRegressor(learning_rate=0.09)

In [14]:
# Histogram-booster predictions for the held-out split first, then for the training split.
yhist_val, yhist_train = [
    histmodel.predict(features) for features in (X_test, X_train)
]

In [15]:
# Report R^2 (as a percentage) for the histogram booster on the test and train splits.
print(
    "The score on the test set is:",
    100 * r2_score(y_true=y_test, y_pred=yhist_val),
)

print(
    "The score on the train set is:",
    100 * r2_score(y_true=y_train, y_pred=yhist_train),
)

The score on the test set is: 53.431551192142216
The score on the train set is: 56.4802279771092


## LightGBM

In [16]:
# LightGBM regressor; only the learning rate departs from the library defaults.
lgbmodel = LGBMRegressor(
    learning_rate=0.09,
)

In [17]:
# Fit LightGBM on the training split (fitted estimator is the cell output).
lgbmodel.fit(X=X_train, y=y_train)

LGBMRegressor(learning_rate=0.09)

In [18]:
# LightGBM predictions for the held-out split first, then for the training split.
ylgb_val, ylgb_train = [
    lgbmodel.predict(features) for features in (X_test, X_train)
]

In [19]:
# Report R^2 (as a percentage) for LightGBM on the test and train splits.
print(
    "The score on the test set is:",
    100 * r2_score(y_true=y_test, y_pred=ylgb_val),
)

print(
    "The score on the train set is:",
    100 * r2_score(y_true=y_train, y_pred=ylgb_train),
)

The score on the test set is: 53.18077648568843
The score on the train set is: 58.02171303133687


## Stacking Regressor

Stacking is an ensemble technique built from two levels of models. The first level (level 0) can contain several base models, each trained on the training set.
The second-level (level 1) model is then trained on a new dataset whose features are the predictions produced by the first-level models.

The target variable is the same at both levels.

In [20]:
# Level 0: the three boosted-tree regressors evaluated individually above,
# given as (name, unfitted estimator) pairs.
level0 = [
    (
        'xgbst',
        # Same explicit configuration as the standalone XGBoost model; the deprecated
        # aliases silent / nthread / seed (all None) are dropped in favour of
        # verbosity / n_jobs / random_state.
        XGBRegressor(
            base_score=0.5,
            booster='gbtree',
            colsample_bylevel=1,
            colsample_bynode=1,
            colsample_bytree=1,
            gamma=0,
            importance_type='gain',
            learning_rate=0.1,
            max_delta_step=0,
            max_depth=6,
            min_child_weight=1,
            missing=None,
            n_estimators=100,
            n_jobs=1,
            objective='reg:squarederror',
            random_state=0,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            subsample=1,
            verbosity=1,
        ),
    ),
    (
        'histbst',
        # Seeded so the automatic early-stopping split used on large datasets is repeatable.
        HistGradientBoostingRegressor(learning_rate=0.09, random_state=0),
    ),
    (
        'lgbbst',
        LGBMRegressor(learning_rate=0.09),
    ),
]

# Level 1: a plain linear regression that combines the level-0 predictions.
level1 = LinearRegression()

In [21]:
# Assemble the stacked ensemble: level-0 base models feeding the level-1
# final estimator, with 5-fold cross-validation.
meta_model = StackingRegressor(
    estimators=level0,
    final_estimator=level1,
    cv=5,
)

In [22]:
# Fit the stacked ensemble on the training split (fitted estimator is the cell output).
meta_model.fit(X=X_train, y=y_train)

StackingRegressor(cv=5,
                  estimators=[('xgbst',
                               XGBRegressor(max_depth=6,
                                            objective='reg:squarederror')),
                              ('histbst',
                               HistGradientBoostingRegressor(learning_rate=0.09)),
                              ('lgbbst', LGBMRegressor(learning_rate=0.09))],
                  final_estimator=LinearRegression())

In [23]:
# Stacked-ensemble predictions for the held-out split first, then for the training split.
y_pred, ylrtrain_pred = [
    meta_model.predict(features) for features in (X_test, X_train)
]

In [24]:
# Report R^2 (as a percentage) for the stacked ensemble on the test and train splits.
print(
    "The score on the test set is:",
    100 * r2_score(y_true=y_test, y_pred=y_pred),
)

print(
    "The score on the train set is:",
    100 * r2_score(y_true=y_train, y_pred=ylrtrain_pred),
)

The score on the test set is: 53.4371118733708
The score on the train set is: 57.7485164525899
