# Gradient boosting

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import warnings 
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation and data-conversion warnings from pandas / scikit-learn.
warnings.filterwarnings("ignore")
# Seaborn defaults: "white" style, with single-letter colour codes remapped to the seaborn palette.
sns.set(style="white", color_codes=True)
# Override font family/size and make axis titles and labels large (24 pt).
sns.set_context(rc={"font.family":'sans',"font.size":10,"axes.titlesize":24,"axes.labelsize":24}) 
from sklearn import tree
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats
from sklearn.metrics import mean_squared_error,r2_score

## Load data

In [2]:
# Read the train and test tables (already normalized, judging by the file names)
# from the current working directory.
X_train_complete, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ("TrainNormalized.csv", "TestNormalized.csv")
)
#X_PCA = pd.read_csv("TrainClean10PCA.csv")
#X_train_complete.describe()

In [3]:
#X_train_complete.columns

## Regression on NumberOfCustomers

In [4]:
# Target: NumberOfCustomers as an (n_samples, 1) column vector.
y = X_train_complete['NumberOfCustomers'].values.reshape(-1, 1)

# Features: every column except the first one (presumably a saved index column --
# TODO confirm), the target itself, and NumberOfSales.
excluded_columns = [X_train_complete.columns[0], 'NumberOfCustomers', 'NumberOfSales']
x = X_train_complete.drop(columns=excluded_columns)

## Lasso feature selection

In [None]:
# Fit a 10-fold cross-validated Lasso on all features, report its training fit,
# then keep only the features whose coefficient magnitude exceeds `threshold`.
lasso = linear_model.LassoCV(cv=10)
# LassoCV documents a 1-D target of shape (n_samples,), so flatten the column vector.
lasso_fit = lasso.fit(x, y.ravel())
yp_lasso = lasso.predict(x)
r2_simple_train = r2_score(y, yp_lasso)
# mean_squared_error returns the MSE; take the square root so the number
# actually matches the "RMSE" label printed below.
rmse_simple_train = float(np.sqrt(mean_squared_error(y, yp_lasso)))
print("Linear Regression with Lasso")
print("==================================================")
print("\t                  Train R2=%.3f"%(r2_simple_train))
print("\t                  Train RMSE=%.3f"%(rmse_simple_train))
print("\nLinear Regression with Lasso - Chosen Features")
threshold = 0.5
# Pair every coefficient with its own column name. The intercept is left out on
# purpose: it is not a feature, and indexing it as position -1 would wrongly
# select (or duplicate) the last column whenever |intercept| > threshold.
selected = [
    column
    for column, coefficient in zip(x.columns, lasso_fit.coef_)
    if abs(coefficient) > threshold
]
if not selected:
    # Fail here with a clear message instead of handing an empty frame to later cells.
    raise ValueError(
        "Lasso kept no feature with |coefficient| > %s; lower the threshold." % threshold
    )
# NOTE: this overwrites `x`; re-run the cell that builds `x` before re-running this one.
x = x[selected]
x.columns

## Try different model complexities

In [None]:
# Sweep the model complexity (number of neighbours of a KNN regressor) and record,
# for each setting: training RMSE, training R2 and the mean cross-validation score.
# To study another model family, swap the estimator built inside the loop,
# e.g. tree.DecisionTreeRegressor(max_depth=i).
perf = []
cross_tot = []
n = 10
for i in range(1, n + 1):
    clf = neighbors.KNeighborsRegressor(i)
    clf.fit(x, y)
    yp = clf.predict(x).reshape(-1, 1)

    # Ask for 3 folds explicitly: the default of cross_val_score is 5 folds in
    # scikit-learn >= 0.22, so relying on it would not give the intended 3-fold CV.
    # With a dataset this large, 3 folds are enough.
    cross = cross_val_score(clf, x, y, cv=3).mean()
    cross_tot.append(cross)

    # mean_squared_error is the MSE; take the root so the column really is an RMSE.
    rmse = np.sqrt(mean_squared_error(y, yp))
    perf.append((i, rmse, r2_score(y, yp), cross))
print("Complexity , RMSE , R2 , crossval_score")
perf

In [None]:
# Mean cross-validation score for each complexity setting tried in the sweep above.
fig, ax = plt.subplots()
ax.plot(list(range(1, n + 1)), cross_tot)
ax.set_xlabel("Model complexity")
ax.set_ylabel("Mean cross-validation score")
ax.set_title("Cross-validation score vs. complexity");

## Gradient Boosting

Step 1: fit the first model directly to the target.

In [5]:
# Stage 0: fit a depth-1 tree (a decision stump) directly to the target.
# Other base learners that can be tried here:
#   linear_model.LinearRegression()
#   neighbors.KNeighborsRegressor(2)
clf = tree.DecisionTreeRegressor(max_depth=1).fit(x, y)
model_list = [clf]  # every fitted stage, in fitting order

yp = clf.predict(x).reshape(-1, 1)

# `prediction` is the running sum of all stage outputs on the training set.
prediction = np.zeros((len(y), 1))
prediction += yp

# What the ensemble still gets wrong; the next stage is trained on this.
y_residual = y - yp

# One (MSE, R2) pair per stage, both measured on the training data.
perf = [(mean_squared_error(y, prediction), r2_score(y, prediction))]

Step 2: repeatedly fit a new model to the residuals left by the models fitted so far, and add its prediction to the ensemble.

In [6]:
# Number of additional stages to fit on top of the first model.
# (Suggested value when using neighbors.KNeighborsRegressor(2) as base learner: 3.)
no_boosting_runs = 100
for i in range(no_boosting_runs):
    # Fit a fresh stump to the current residuals; linear_model.LinearRegression()
    # or neighbors.KNeighborsRegressor(2) can be substituted here.
    clf = tree.DecisionTreeRegressor(max_depth=1).fit(x, y_residual)
    model_list.append(clf)

    # Add this stage's output to the running ensemble prediction ...
    yp = clf.predict(x).reshape(-1, 1)
    prediction += yp

    # ... and recompute what is still unexplained for the next stage.
    y_residual = y - prediction

    # Track (MSE, R2) of the ensemble on the training data after this stage.
    perf.append((mean_squared_error(y, prediction), r2_score(y, prediction)))

In [7]:
# `perf` holds (mean_squared_error, r2_score) pairs, so the first column is the
# MSE, not its square root. Show the last 20 boosting stages.
print("MSE , R2")
perf[-20:]

RMSE , R2


[(142.97280237373116, 0.9942805492509996),
 (140.6786196097808, 0.9943723252049557),
 (138.84653663141896, 0.9944456154265143),
 (136.60729421188438, 0.9945351935597044),
 (134.84093760641215, 0.9946058544787215),
 (132.93768274900236, 0.994681991843584),
 (131.0059332476558, 0.9947592691015584),
 (129.43903077770486, 0.9948219511037055),
 (127.67209274552441, 0.9948926352819815),
 (125.82708357264791, 0.9949664426000204),
 (124.34213970629371, 0.9950258459492424),
 (122.76849385419376, 0.9950887977120814),
 (121.18947650547749, 0.9951519643550222),
 (119.73841204436103, 0.995210012400393),
 (118.3844453456086, 0.9952641761694507),
 (116.94281874229989, 0.9953218466649518),
 (115.30599411744551, 0.9953873258167287),
 (113.67087753232315, 0.9954527366404813),
 (112.51968594871911, 0.9954987886409721),
 (111.35044036300076, 0.9955455628695664)]

Define a function that computes the ensemble prediction by summing the predictions of all fitted models.

In [8]:
def GradientBoosting_eval(models,x):
    """Return the additive ensemble prediction for ``x``.

    Each element of ``models`` is asked for ``predict(x)``; every result is
    reshaped to a column vector and the columns are summed.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict(x)`` that returns an array.
    x : sized collection of samples
        Passed unchanged to every estimator; ``len(x)`` fixes the output length.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(x), 1)``; all zeros when ``models`` is empty.
    """
    total = np.zeros((len(x), 1))
    for stage in models:
        total += stage.predict(x).reshape(-1, 1)
    return total

# Rebuild the training-set prediction from the stored stages.
result = GradientBoosting_eval(models=model_list, x=x)

In [9]:
# Ensemble predictions for the training features, as an (n_samples, 1) array.
result

array([[495.6416648 ],
       [623.81216232],
       [656.12823231],
       ...,
       [264.0067668 ],
       [354.13636157],
       [305.76874346]])

In [25]:
# The list holds the first model plus one estimator per boosting run; printing
# all of them floods the output, so show the count and only the first few.
print("Number of fitted stages:", len(model_list))
model_list[:3]

[DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=1,