# Part 2: Data Preprocessing
1. Remove the first column of the data in data.csv , because it is just a copy of index.

2. Boss: To implement Collaborative Filtering in recommendation systems, we need a user-item table to show the number of orders for each user and each item. 
Try to construct user-item table. An example of user-item pair: (Phone_No, 标准美式)

3. Boss: Life is not like a Markov Chain, which means everyone's past behavior is correlated with his present one. And that is why we could exploit past purchase behavior to predict their future buying trends. 
Try to construct a dataset to show this past purchasing behavior trend. For convenience, several instructions are proposed as follows

    a.Two days correspond to one dimension.

    b.The last two days of the time span of the data should be the future, which means it corresponds to the target field for the following data mining models.

    c.The length of each user vector must be maximized.

    d.The dataset should be a DataFrame in Pandas, so you could customize the columns as you wish. 
    For example, if the time span is from 2019-02-01 to 2019-02-10, then there are 10 days altogether. So each user corresponds to a 5-dimensional vector, with 4 features and 1 target dimension. The vector [4, 0, 0, 0, 1] means this user bought one good between 02-09 and 02-10, and four goods between 02-01 and 02-02. Additionally, the length of each user vector MUST BE 5 because of the rule 3.

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # 中文字体设置-黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决保存图像是负号'-'显示为方块的问题
sns.set(font='SimHei')
from dateutil import rrule
from datetime import datetime
import time
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import metrics
from sklearn import svm, datasets
from sklearn.ensemble import AdaBoostClassifier

In [7]:
data=pd.read_csv(r'C:\Users\rcf\Desktop\WISERCLUB\WISERCLUB\data.csv',encoding = "UTF-8")
data.drop('Unnamed: 0',axis=1,inplace=True)
data

Unnamed: 0,Unnamed: 0.1,dt,phone_no,member_id,commodity_code,commodity_name,commodity_origin_money,coupon_id,coupon_money,one_category_name,two_category_name,commodity_income,pay_money,coffeestore_share_money
0,0,2019-01-25,13901387938,14442,SP025,榛果拿铁,27.0,7045.0,19.44,饮品,现磨咖啡,7.56,7.56,0.0
1,1,2019-01-27,13901387938,14442,SP209,NFC鲜榨橙汁,24.0,,0.00,饮品,鲜榨果蔬汁,0.00,0.00,0.0
2,2,2019-01-23,13901387938,14442,SP025,榛果拿铁,27.0,5589.0,22.14,饮品,现磨咖啡,4.86,4.86,0.0
3,3,2019-02-01,13901387938,14442,SP025,榛果拿铁,27.0,6604.0,19.44,饮品,现磨咖啡,7.56,7.56,0.0
4,4,2019-01-27,13901387938,14442,SP010,巧克力瑞纳冰,27.0,6947.0,14.04,饮品,瑞纳冰,12.96,12.96,0.0
5,5,2019-02-03,13901387938,14442,SP023,香草拿铁,27.0,6604.0,19.44,饮品,现磨咖啡,7.56,7.56,0.0
6,6,2019-01-24,13901387938,14442,SP008,拿铁,24.0,7038.0,17.28,饮品,现磨咖啡,6.72,6.72,0.0
7,7,2019-02-02,13901387938,14442,SP209,NFC鲜榨橙汁,24.0,5664.0,14.88,饮品,鲜榨果蔬汁,9.12,9.12,0.0
8,8,2019-02-02,13901387938,14442,SP344,加浓美式,24.0,6604.0,17.28,饮品,现磨咖啡,6.72,6.72,0.0
9,9,2019-01-24,17710179294,16404,SP029,焦糖拿铁,27.0,,0.00,饮品,现磨咖啡,9.00,0.00,9.0


In [8]:
#P2
data11=data.copy()
data11['number of orders']=1
pd.pivot_table(data11,values='number of orders' ,index = 'phone_no', columns = 'commodity_name', aggfunc = np.sum, fill_value = 0)

commodity_name,NFC鲜榨橙汁,NFC鲜榨芒果汁,NFC鲜榨芒果混合果汁,NFC鲜榨荔枝汁,NFC鲜榨蓝莓草莓混合果汁,依云矿泉水330ml,加浓美式,卡布奇诺,卡布奇诺瑞纳冰,土豆烧牛方饭,...,蜜汁叉烧包,金枪鱼谷物沙拉,陈皮普洱,香椰提子麦芬,香草拿铁,香蕉核桃麦芬,鱼香茄子饭,鸡蛋马铃薯双拼三明治,黑森林蛋糕,黑金气泡美式
phone_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51379898,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57047978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61120518,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62288158,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64618166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65310185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
67443044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67469370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67617677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68557104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#3 ignore the first day, treat the rest of those 40 days as 20 dimensions respectively
data12=pd.pivot_table(data11,values='number of orders' ,index = 'phone_no', columns = 'dt', aggfunc = np.sum, fill_value = 0)
for i in range(1,21):
    j=2*i-1
    k=2*i+1
    i=str(i)
    locals()['arr'+i]=pd.DataFrame(data=data12.iloc[:,j:k],index=data12.index)
    locals()['arr'+i]=pd.DataFrame(np.sum(locals()['arr'+i],axis=1),columns=[i])    
    if int(i)>1:
        locals()['arr'+i]=pd.concat([locals()['arr'+str(int(i)-1)],locals()['arr'+i]],axis=1)
arr20

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
phone_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
51379898,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
57047978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61120518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
62288158,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1
64618166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
65310185,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
67443044,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67469370,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67617677,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68557104,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Part 3: Model Training and Prediction
Boss: For the target field, 1 means he purchased in the future and 0 means he did not. Then you could use traditional classification algorithms to predict the future behaviors of all users.

1. Transform the data you got from the last section into an array in Numpy.

2. Split the data into features X and targets Y.

3. Use Adaboost, Random Forest in Sklearn to construct the model for prediction with 3-fold cross validation.

    a.(Optional) Use Xgboost.

    b.Boss: We could, but we do not use Naive Bayes or Support Vector Machine in this project. 
    True of False? Explain.

In [10]:
#1
arr1=arr20.values
arr1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 5, 0, ..., 0, 0, 3],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [11]:
#2
arr2=arr1.copy()
arr2=np.where(arr2>1,1,arr2)
np.random.shuffle(arr2)
X=arr2[:,:-1]
X
Y=arr2[:,-1]
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
#3
X_train_input1=X[:311256]
Y_train_input1=Y[:311256]
X_test_input1=X[311256:]
Y_test_input1=Y[311256:]
X_train_input2=X[155628:]
Y_train_input2=Y[155628:]
X_test_input2=X[:155628]
Y_test_input2=Y[:155628]
X_train_input3=X[np.r_[:155628,311256:]]
Y_train_input3=Y[np.r_[:155628,311256:]]
X_test_input3=X[155628:311256]
Y_test_input3=Y[155628:311256]

In [13]:
start=time.clock()
AdaBoost_clf1 = AdaBoostClassifier(n_estimators=10)
AdaBoost_clf1.fit(X_train_input1,Y_train_input1)
Y_prediction1 = AdaBoost_clf1.predict(X_test_input1)
AdaBoost_clf2 = AdaBoostClassifier(n_estimators=10)
AdaBoost_clf2.fit(X_train_input2,Y_train_input2)
Y_prediction2 = AdaBoost_clf2.predict(X_test_input2)
AdaBoost_clf3 = AdaBoostClassifier(n_estimators=10)
AdaBoost_clf3.fit(X_train_input3,Y_train_input3)
Y_prediction3= AdaBoost_clf3.predict(X_test_input3)
end=time.clock()

  """Entry point for launching an IPython kernel.


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=10, random_state=None)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=10, random_state=None)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=10, random_state=None)

  # This is added back by InteractiveShellApp.init_path()


In [14]:
S1=AdaBoost_clf1.score(X_train_input1, Y_train_input1)
S2=AdaBoost_clf2.score(X_train_input2, Y_train_input2)
S3=AdaBoost_clf3.score(X_train_input3, Y_train_input3)
S=(S1+S2+S3)/3
print("AdaBoost_train_score:{:.4f}".format(S))
s1=AdaBoost_clf1.score(X_test_input1, Y_test_input1)
s2=AdaBoost_clf2.score(X_test_input2, Y_test_input2)
s3=AdaBoost_clf3.score(X_test_input3, Y_test_input3)
s=(s1+s2+s3)/3
print("AdaBoost_test_score:{:.4f}".format(s))
(metrics.confusion_matrix(Y_test_input1,Y_prediction1)+metrics.confusion_matrix(Y_test_input2,Y_prediction2)+metrics.confusion_matrix(Y_test_input3,Y_prediction3))/3
print('AdaBoost准确率：',(metrics.accuracy_score(Y_test_input1,Y_prediction1)+metrics.accuracy_score(Y_test_input2,Y_prediction2)+metrics.accuracy_score(Y_test_input3,Y_prediction3))/3)
print('AdaBoost宏精度：',(metrics.precision_score(Y_test_input1,Y_prediction1,average = 'macro')+metrics.precision_score(Y_test_input2,Y_prediction2,average = 'macro')+metrics.precision_score(Y_test_input3,Y_prediction3,average = 'macro'))/3) 
print('Adaboost加权平均召回率:',(metrics.recall_score(Y_test_input1,Y_prediction1,average = 'weighted')+metrics.recall_score(Y_test_input2,Y_prediction2,average = 'weighted')+metrics.recall_score(Y_test_input3,Y_prediction3,average = 'weighted'))/3) 
print('Adaboost加权平均F1得分:',(metrics.f1_score(Y_test_input1,Y_prediction1,average = 'weighted')+metrics.f1_score(Y_test_input2,Y_prediction2,average = 'weighted')+metrics.f1_score(Y_test_input3,Y_prediction3,average = 'weighted'))/3)
print('Adaboost程序运行时间为',(end-start))

AdaBoost_train_score:0.8905
AdaBoost_test_score:0.8906


array([[133797.        ,   3021.33333333],
       [ 14009.33333333,   4801.        ]])

AdaBoost准确率： 0.8905685773383086
AdaBoost宏精度： 0.75948462885325
Adaboost加权平均召回率: 0.8905685773383086
Adaboost加权平均F1得分: 0.8701062670365985
Adaboost程序运行时间为 25.452839600000004


In [15]:
start=time.clock()
random_forest1= RandomForestClassifier(n_estimators=100)
random_forest1.fit(X_train_input1, Y_train_input1)
Y_prediction1= random_forest1.predict(X_test_input1)
random_forest2= RandomForestClassifier(n_estimators=100)
random_forest2.fit(X_train_input2, Y_train_input2)
Y_prediction2= random_forest2.predict(X_test_input2)
random_forest3= RandomForestClassifier(n_estimators=100)
random_forest3.fit(X_train_input3, Y_train_input3)
Y_prediction3= random_forest3.predict(X_test_input3)
end=time.clock()

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

  # This is added back by InteractiveShellApp.init_path()


In [16]:
print('RandomForest_train_score:{:.4f}'.format((random_forest1.score(X_train_input1, Y_train_input1)+random_forest2.score(X_train_input2, Y_train_input2)+random_forest3.score(X_train_input3, Y_train_input3))/3))
print('RandomForest_test_score:{:.4f}'.format((random_forest1.score(X_test_input1, Y_test_input1)+random_forest2.score(X_test_input2, Y_test_input2)+random_forest3.score(X_test_input3, Y_test_input3))/3))
print('RandomForest准确率：',(metrics.accuracy_score(Y_test_input1,Y_prediction1)+metrics.accuracy_score(Y_test_input2,Y_prediction2)+metrics.accuracy_score(Y_test_input3,Y_prediction3))/3)
print('RandomForest宏精度：',(metrics.precision_score(Y_test_input1,Y_prediction1,average = 'macro')+metrics.precision_score(Y_test_input2,Y_prediction2,average = 'macro')+metrics.precision_score(Y_test_input3,Y_prediction3,average = 'macro'))/3) 
print('RandomForest加权平均召回率:',(metrics.recall_score(Y_test_input1,Y_prediction1,average = 'weighted')+metrics.recall_score(Y_test_input2,Y_prediction2,average = 'weighted')+metrics.recall_score(Y_test_input3,Y_prediction3,average = 'weighted'))/3) 
print('RandomForest加权平均F1得分:',(metrics.f1_score(Y_test_input1,Y_prediction1,average = 'weighted')+metrics.f1_score(Y_test_input2,Y_prediction2,average = 'weighted')+metrics.f1_score(Y_test_input3,Y_prediction3,average = 'weighted'))/3)
print('RandomForest程序运行时间为：',(end-start))

RandomForest_train_score:0.9178
RandomForest_test_score:0.8926
RandomForest准确率： 0.8925626771316759
RandomForest宏精度： 0.7638916574705865
RandomForest加权平均召回率: 0.8925626771316759
RandomForest加权平均F1得分: 0.8740735292599461
RandomForest程序运行时间为： 356.7021976


In [17]:
#Xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

num_round = 100
bst =XGBClassifier(max_depth=2, learning_rate=0.1,n_estimators=num_round, silent=True)
bst.fit(X, Y)
kfold = StratifiedKFold(n_splits=3, random_state=7)
results = cross_val_score(bst, X, Y, cv=kfold)
print("Xgboost accuracy:",results.mean())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)



Xgboost accuracy: 0.8914553010700504


In [18]:
#Naive_Bays
from sklearn.naive_bayes import BernoulliNB
start=time.clock()
model=BernoulliNB()
model.fit(X_train_input1, Y_train_input1)
model.fit(X_train_input2, Y_train_input2)
model.fit(X_train_input3, Y_train_input3)
predicted1= model.predict(X_test_input1)
predicted2= model.predict(X_test_input2)
predicted3= model.predict(X_test_input3)
end=time.clock()
print('NaiveBays准确率：',(metrics.accuracy_score(Y_test_input1,predicted1)+metrics.accuracy_score(Y_test_input2,predicted2)+metrics.accuracy_score(Y_test_input3,predicted3))/3)
print('NaiveBays宏精度：',(metrics.precision_score(Y_test_input1,predicted1,average = 'macro')+metrics.precision_score(Y_test_input2,predicted2,average = 'macro')+metrics.precision_score(Y_test_input3,predicted3,average = 'macro'))/3) 
print('NaiveBays加权平均召回率:',(metrics.recall_score(Y_test_input1,predicted1,average = 'weighted')+metrics.recall_score(Y_test_input2,predicted2,average = 'weighted')+metrics.recall_score(Y_test_input3,predicted3,average = 'weighted'))/3) 
print('NaiveBays加权平均F1得分:',(metrics.f1_score(Y_test_input1,predicted1,average = 'weighted')+metrics.f1_score(Y_test_input2,predicted2,average = 'weighted')+metrics.f1_score(Y_test_input3,predicted3,average = 'weighted'))/3)
print('NaiveBays程序运行CPU时间为：',(end-start))

  This is separate from the ipykernel package so we can avoid doing imports until


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

  # This is added back by InteractiveShellApp.init_path()


NaiveBays准确率： 0.866791895088367
NaiveBays宏精度： 0.6873078226901899
NaiveBays加权平均召回率: 0.866791895088367
NaiveBays加权平均F1得分: 0.8671660372499468
NaiveBays程序运行CPU时间为： 3.5030808999999863


This is an imbalanced classification problem.We are going to use SMOTE to solve it.

In [None]:
#SVM
start=time.clock()
clf=svm.SVC()
clf.fit(X_train_input1,Y_train_input1)
predicted1=clf.predict(X_test_input1)
clf.fit(X_train_input2,Y_train_input2)
predicted1=clf.predict(X_test_input2)
clf.fit(X_train_input3,Y_train_input3)
predicted1=clf.predict(X_test_input3)
end=time.clock()
print('SVM准确率：',(metrics.accuracy_score(Y_test_input1,predicted1)+metrics.accuracy_score(Y_test_input2,predicted2)+metrics.accuracy_score(Y_test_input3,predicted3))/3)
print('SVM宏精度：',(metrics.precision_score(Y_test_input1,predicted1,average = 'macro')+metrics.precision_score(Y_test_input2,predicted2,average = 'macro')+metrics.precision_score(Y_test_input3,predicted3,average = 'macro'))/3) 
print('SVM加权平均召回率:',(metrics.recall_score(Y_test_input1,predicted1,average = 'weighted')+metrics.recall_score(Y_test_input2,predicted2,average = 'weighted')+metrics.recall_score(Y_test_input3,predicted3,average = 'weighted'))/3) 
print('SVM加权平均F1得分:',(metrics.f1_score(Y_test_input1,predicted1,average = 'weighted')+metrics.f1_score(Y_test_input2,predicted2,average = 'weighted')+metrics.f1_score(Y_test_input3,predicted3,average = 'weighted'))/3)
print('SVM程序运行CPU时间为：',(end-start))

  


The models trained by adaboost and random forest have higher accuracy scores, precision scores, recall scores and f1 scores than that trained by naive bayes, though naive bayes costs less time to do it.
SVM costs way too much time because of its high complexity in computation, so that I can't even get the output from it within a pretty long time. 

In [4]:
#4
holiday=pd.read_csv(r'C:\Users\rcf\Desktop\WISERCLUB\WISERCLUB\holiday.csv',encoding = "UTF-8").iloc[468:509].copy()
holiday=holiday.rename(index=holiday.loc[:,'dt']).drop(['dt'],axis=1)
holiday=holiday.loc[:,['type']]
holiday=pd.DataFrame(holiday)

In [19]:
j='0'
k='0'
data12=pd.pivot_table(data11,values='number of orders' ,index = 'phone_no', columns = 'dt', aggfunc = np.sum, fill_value = 0)
for i in range(data12.shape[1]):
    if holiday.loc[data12.columns[i]].values>0:
        locals()['weekend'+j]=pd.DataFrame(data=data12.iloc[:,i],index=data12.index)
        if j>'0':
            locals()['weekend'+j]=pd.concat([locals()['weekend'+str(int(j)-1)],locals()['weekend'+j]],axis=1)
        j=str(int(j)+1)     
    else:
        locals()['workday'+k]=pd.DataFrame(data=data12.iloc[:,i],index=data12.index)
        if k>'0':
            locals()['workday'+k]=pd.concat([locals()['workday'+str(int(k)-1)],locals()['workday'+k]],axis=1)
        k=str(int(k)+1)
locals()['weekend'+str(int(j)-1)]
locals()['workday'+str(int(k)-1)]

Unnamed: 0_level_0,2019-01-20,2019-01-26,2019-01-27,2019-02-04,2019-02-05,2019-02-06,2019-02-07,2019-02-08,2019-02-09,2019-02-10,2019-02-16,2019-02-17,2019-02-23,2019-02-24
phone_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
51379898,0,0,0,0,0,0,0,0,0,0,0,0,0,0
57047978,1,0,0,0,0,0,0,0,0,0,0,0,0,0
61120518,1,0,0,0,0,0,0,0,0,0,0,0,0,0
62288158,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64618166,1,0,0,0,0,0,0,0,0,0,0,0,0,0
65310185,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67443044,0,1,0,0,0,0,0,0,0,0,0,0,0,0
67469370,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67617677,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68557104,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,2019-01-21,2019-01-22,2019-01-23,2019-01-24,2019-01-25,2019-01-28,2019-01-29,2019-01-30,2019-01-31,2019-02-01,...,2019-02-18,2019-02-19,2019-02-20,2019-02-21,2019-02-22,2019-02-25,2019-02-26,2019-02-27,2019-02-28,2019-03-01
phone_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51379898,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57047978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61120518,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62288158,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1
64618166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65310185,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
67443044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67469370,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67617677,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68557104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
weekend=locals()['weekend'+str(int(j)-1)].values.copy()
workday=locals()['workday'+str(int(k)-1)].values.copy()
weekend=np.where(weekend>1,1,weekend)
workday=np.where(workday>1,1,workday)
X_weekend=weekend[:,:-1]
Y_weekend=weekend[:,-1]
X_workday=workday[:,:-1]
Y_workday=workday[:,-1]
from collections import Counter
Counter(Y_weekend)
Counter(Y_workday)

Counter({0: 450222, 1: 16664})

Counter({0: 433705, 1: 33181})

In [21]:
from imblearn.over_sampling import SMOTE
smo = SMOTE(random_state=42)
X_weekend_smo, Y_weekend_smo = smo.fit_sample(X_weekend, Y_weekend)
X_workday_smo, Y_workday_smo = smo.fit_sample(X_workday, Y_workday)

In [None]:
from imblearn.under_sampling import ClusterCentroids 
cc = ClusterCentroids(random_state=0)
X_weekend_resampled, Y_weekend_resampled = cc.fit_sample(X_weekend, Y_weekend)
Counter(Y_weekend_resampled)

In [None]:
#Xgboost for weekends
bst =XGBClassifier(max_depth=5, learning_rate=0.1,n_estimators=500, silent=True,min_child_weight=1,subsample=0.8,colsample_bytree=0.8)
bst.fit(X_weekend_smo, Y_weekend_smo)
kfold = StratifiedKFold(n_splits=3, random_state=7)
results=cross_val_score(bst, X_weekend_smo, Y_weekend_smo, cv=kfold, scoring='f1_macro')
results.mean()

In [85]:
#Random Forest for weekends
random_forest= RandomForestClassifier(n_estimators=100，min_sample_leaf=60)
random_forest.fit(X_weekend_smo, Y_weekend_smo)
cross_val_score(random_forest, X_weekend_smo, Y_weekend_smo, cv=kfold, scoring='f1_macro').mean()

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

0.6880223072894269

In [29]:
# #Adaboost for weekends
# AdaBoost_clf= AdaBoostClassifier(n_estimators=70,learning_rate=0.1)
# AdaBoost_clf.fit(X_weekend_smo,Y_weekend_smo)
# cross_val_score(AdaBoost_clf, X_weekend_smo, Y_weekend_smo, cv=kfold, scoring='f1_macro').mean()

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=70, random_state=None)

0.6816725756718413

In [None]:
#Random Forest for workdays
random_forest= RandomForestClassifier(n_estimators=100,min_samples_split=100,min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10)
random_forest.fit(X_workday_smo, Y_workday_smo)
cross_val_score(random_forest, X_workday_smo, Y_workday_smo, cv=kfold, scoring='f1_macro').mean()

In [None]:
param_test1 = {'n_estimators':range(10,101,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(X_workdays_smo,Y_workdays_smo)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [83]:
# #Xgboost for workdays
# bst.fit(X_workday_smo, Y_workday_smo)
# results=cross_val_score(bst, X_workday_smo, Y_workday_smo, cv=kfold, scoring='f1_macro')
# results.mean()

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)



0.6883781355629304

In [28]:
# #Adaboost for weekends
# AdaBoost_clf= AdaBoostClassifier(n_estimators=50,algorithm='SAMME')
# AdaBoost_clf.fit(X_workday_smo,Y_workday_smo)
# cross_val_score(AdaBoost_clf, X_workday_smo, Y_workday_smo, cv=kfold, scoring='f1_macro').mean()

AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

0.676871729308702