In [1]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import requests, zipfile, io
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

  "Since version 1.0, "


In [3]:
# Parse the last-order timestamp, then store the elapsed time as whole days.
# NOTE(review): the elapsed time is measured against the wall clock at the
# moment this cell runs, so the feature values shift with the execution date.
train_df['Last_order_placed_date'] = pd.to_datetime(train_df['Last_order_placed_date'])
time_since_last_order = datetime.datetime.now() - train_df['Last_order_placed_date']
train_df['Days_since_last_order'] = time_since_last_order.dt.days

In [4]:
# Discard the columns that are not passed to the models; the raw date has
# already been turned into Days_since_last_order.
unused_columns = ['Last_order_placed_date', 'Customer_ID', 'No_of_issues_raised']
train_df = train_df.drop(columns=unused_columns)

In [5]:
# Per-column lookup of the integer code that replaces each string value.
conversion_names = dict(
    Category_of_customers={"Active": 2, "Inactive": 0, "Passive": 1},
    Premium_membership={"Yes": 1, "No": 0},
)
train_df.replace(to_replace=conversion_names, inplace=True)

In [6]:
# Separate the regression target from the predictor columns.
y = train_df['Discount_percentage']
X = train_df.drop(columns='Discount_percentage')

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Custom Loss

The cells below define a custom loss function for the XGBoost and LightGBM models. The loss is a weighted squared error: the training rows are divided into bins based on the target variable, and each row is weighted according to the bin it falls into.

In [8]:
def calcweights(dtrain, bins=(10, 20, 30, 40, 50, 60)):
    """Return one balancing weight per target value, based on histogram bins.

    The targets are counted per bin and every sample receives
    ``max(bin counts) / count of its own bin``, so samples from sparsely
    populated bins weigh more than samples from the most populated bin
    (which gets weight 1).

    Parameters
    ----------
    dtrain : array-like
        Target values. The input is flattened to one dimension.
    bins : sequence of increasing numbers, optional
        Bin edges. Bins are half-open ``[lo, hi)`` except the last one, which
        also contains its right edge (the same convention ``np.histogram``
        uses for the counts).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_samples,)``. Targets outside
        ``[bins[0], bins[-1]]`` get weight 0.
    """
    targets = np.asarray(dtrain, dtype=float).ravel()
    edges = np.asarray(bins, dtype=float)
    counts, _ = np.histogram(targets, bins=edges)

    # Only occupied bins get a ratio; dividing by an empty bin's count of zero
    # would emit a RuntimeWarning and produce inf.
    weight_bins = np.zeros(len(counts), dtype=float)
    occupied = counts > 0
    if occupied.any():
        weight_bins[occupied] = counts.max() / counts[occupied]

    # Index of the bin each target falls into. Clipping maps a target equal to
    # the last edge into the last bin, matching how np.histogram counted it.
    bin_index = np.searchsorted(edges, targets, side='right') - 1
    bin_index = np.clip(bin_index, 0, len(counts) - 1)

    in_range = (targets >= edges[0]) & (targets <= edges[-1])
    weights = np.zeros(targets.shape, dtype=float)
    weights[in_range] = weight_bins[bin_index[in_range]]
    return weights

In [9]:
def squared_err(dtrain,predt):
    """Gradient and hessian of a per-sample weighted squared error.

    ``dtrain`` holds the target values and ``predt`` the current predictions.
    The weights come from ``calcweights(dtrain)``. For the loss
    ``w * (predt - dtrain) ** 2`` the first derivative with respect to the
    prediction is ``2 * w * (predt - dtrain)`` and the second is ``2 * w``.

    Returns the tuple ``(grad, hess)``.
    """
    sample_weights = calcweights(dtrain)
    hess = 2*sample_weights
    grad = hess*(predt - dtrain)
    return grad, hess

### XGBoost

In [11]:
# XGBoost regressor that optimises the custom weighted squared-error objective.
xgbst_model = XGBRegressor(
    objective=squared_err,
    booster='gbtree',
    n_estimators=160,
    max_depth=8,
    learning_rate=0.1,
    base_score=0.5,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1.1,
    scale_pos_weight=1,
    importance_type='gain',
    missing=None,
    n_jobs=1,
    nthread=None,
    random_state=0,
    seed=None,
    silent=None,
    verbosity=1,
)

In [12]:
# Fit the weighted-objective XGBoost model on the training split.
xgbst_model.fit(X=X_train, y=y_train)



XGBRegressor(max_depth=8, n_estimators=160,
             objective=<function squared_err at 0x7fdd797619e0>,
             reg_lambda=1.1)

In [13]:
# Predict on the held-out split first, then on the training split.
yhat_xgbstval, yhat_xgbsttrain = (
    xgbst_model.predict(features) for features in (X_test, X_train)
)

In [14]:
# Report the R^2 score (scaled to a percentage) for the held-out and training splits.
xgbst_test_r2 = r2_score(y_test, yhat_xgbstval)
xgbst_train_r2 = r2_score(y_train, yhat_xgbsttrain)
print("The score on the test set is:", xgbst_test_r2*100)
print("The score on the train set is:", xgbst_train_r2*100)

The score on the test set is: 45.558302006494046
The score on the train set is: 62.452582019181314


### LightGBM

In [15]:
# LightGBM regressor that optimises the custom weighted squared-error objective.
lgbmodel = LGBMRegressor(
    objective=squared_err,
    learning_rate=0.1,
    max_depth=12,
    n_estimators=600,
)

In [16]:
# Fit the weighted-objective LightGBM model on the training split.
lgbmodel.fit(X=X_train, y=y_train)

LGBMRegressor(max_depth=12, n_estimators=600,
              objective=<function squared_err at 0x7fdd797619e0>)

In [17]:
# Predict on the held-out split first, then on the training split.
ylgb_val, ylgb_train = (
    lgbmodel.predict(features) for features in (X_test, X_train)
)

In [18]:
# Report the R^2 score (scaled to a percentage) for the held-out and training splits.
lgb_test_r2 = r2_score(y_test, ylgb_val)
lgb_train_r2 = r2_score(y_train, ylgb_train)
print("The score on the test set is:", lgb_test_r2*100)
print("The score on the train set is:", lgb_train_r2*100)

The score on the test set is: 46.051305620513986
The score on the train set is: 62.056933549551


## Stacking Regressor

In addition to the models used in the previous method (see the StackingRegressor.ipynb file), I have included the two weighted models defined above in the first level.

In [19]:
# First-level estimators as (name, estimator) pairs: three models with their
# built-in objectives plus the two models using the weighted custom objective.
level0 = [
    (
        'xgbst',
        XGBRegressor(
            objective='reg:squarederror',
            booster='gbtree',
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            base_score=0.5,
            gamma=0,
            min_child_weight=1,
            max_delta_step=0,
            subsample=1,
            colsample_bytree=1,
            colsample_bylevel=1,
            colsample_bynode=1,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            importance_type='gain',
            missing=None,
            n_jobs=1,
            nthread=None,
            random_state=0,
            seed=None,
            silent=None,
            verbosity=1,
        ),
    ),
    ('histbst', HistGradientBoostingRegressor(learning_rate=0.09)),
    ('lgbbst', LGBMRegressor(learning_rate=0.09)),
    (
        'lgbweighted',
        LGBMRegressor(
            objective=squared_err,
            learning_rate=0.1,
            max_depth=12,
            n_estimators=600,
        ),
    ),
    (
        'xgbstweighted',
        XGBRegressor(
            objective=squared_err,
            booster='gbtree',
            n_estimators=160,
            max_depth=8,
            learning_rate=0.1,
            base_score=0.5,
            gamma=0,
            min_child_weight=1,
            max_delta_step=0,
            subsample=1,
            colsample_bytree=1,
            colsample_bylevel=1,
            colsample_bynode=1,
            reg_alpha=0,
            reg_lambda=1.1,
            scale_pos_weight=1,
            importance_type='gain',
            missing=None,
            n_jobs=1,
            nthread=None,
            random_state=0,
            seed=None,
            silent=None,
            verbosity=1,
        ),
    ),
]

# Second-level estimator that combines the first-level predictions.
level1 = LGBMRegressor()

In [20]:
# Assemble the stacking regressor from the first-level list and the
# second-level estimator, using 5-fold cross-validation.
meta_model = StackingRegressor(
    estimators=level0,
    final_estimator=level1,
    cv=5,
)

In [21]:
# Fit the stacked ensemble on the training split.
meta_model.fit(X=X_train, y=y_train)



StackingRegressor(cv=5,
                  estimators=[('xgbst',
                               XGBRegressor(max_depth=6,
                                            objective='reg:squarederror')),
                              ('histbst',
                               HistGradientBoostingRegressor(learning_rate=0.09)),
                              ('lgbbst', LGBMRegressor(learning_rate=0.09)),
                              ('lgbweighted',
                               LGBMRegressor(max_depth=12, n_estimators=600,
                                             objective=<function squared_err at 0x7fdd797619e0>)),
                              ('xgbstweighted',
                               XGBRegressor(max_depth=8, n_estimators=160,
                                            objective=<function squared_err at 0x7fdd797619e0>,
                                            reg_lambda=1.1))],
                  final_estimator=LGBMRegressor())

In [22]:
# Predict on the held-out split first, then on the training split.
y_pred, ylrtrain_pred = (
    meta_model.predict(features) for features in (X_test, X_train)
)

In [23]:
# Report the R^2 score (scaled to a percentage) for the held-out and training splits.
stack_test_r2 = r2_score(y_test, y_pred)
stack_train_r2 = r2_score(y_train, ylrtrain_pred)
print("The score on the test set is:", stack_test_r2*100)
print("The score on the train set is:", stack_train_r2*100)

The score on the test set is: 53.21930585159122
The score on the train set is: 55.677070395575804
