## Models!

In [1]:
import itertools
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline 

In [2]:
# Let pandas display up to 100 rows and 100 columns before truncating.
pd.set_option('display.max_rows', 100, 'display.max_columns', 100)

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, KFold, GridSearchCV, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve,auc,accuracy_score,recall_score,precision_score,f1_score
from xgboost import XGBClassifier

In [4]:
# Aisle lookup table: only aisle_id and aisle are read.
# `np.str` was merely an alias of the builtin `str`; the alias was deprecated
# in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
aisles = pd.read_csv(
    'data/aisles.csv',
    usecols=['aisle_id', 'aisle'],
    dtype={'aisle_id': np.uint16, 'aisle': str},
)

In [5]:
# Department lookup table: only department_id and department are read.
# `np.str` was merely an alias of the builtin `str`; the alias was deprecated
# in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
departments = pd.read_csv(
    'data/departments.csv',
    usecols=['department_id', 'department'],
    dtype={'department_id': np.uint16, 'department': str},
)

In [6]:
# Product line items from the "prior" orders file; every column is read as an
# unsigned integer to keep the frame compact.
order_products_prior = pd.read_csv(
    'data/order_products__prior.csv',
    usecols=['order_id', 'product_id', 'add_to_cart_order', 'reordered'],
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint32,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16,
    },
)

In [7]:
# Product line items from the "train" orders file; its `reordered` column is
# later merged in as the label.
order_product_train = pd.read_csv(
    'data/order_products__train.csv',
    usecols=['order_id', 'product_id', 'add_to_cart_order', 'reordered'],
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint32,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16,
    },
)

In [8]:
# Order headers. `eval_set` is read as a categorical and
# `days_since_prior_order` as float32 (presumably because it can be missing
# -- confirm against the data).
orders = pd.read_csv(
    'data/orders.csv',
    usecols=[
        'order_id',
        'user_id',
        'eval_set',
        'order_number',
        'order_dow',
        'order_hour_of_day',
        'days_since_prior_order',
    ],
    dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint16,
        'order_dow': np.uint16,
        'order_hour_of_day': np.uint8,
        'days_since_prior_order': np.float32,
    },
)

In [9]:
# Product catalogue with the aisle and department each product belongs to.
# `np.str` was merely an alias of the builtin `str`; the alias was deprecated
# in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
products = pd.read_csv(
    'data/products.csv',
    usecols=['product_id', 'product_name', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'product_name': str,
        'aisle_id': np.uint16,
        'department_id': np.uint16,
    },
)

In [10]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (the previous bare `open(...)` calls were never closed).

    NOTE(review): unpickling can execute arbitrary code; only load files that
    were produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pickled tables merged into the master frame below (presumably produced by an
# earlier feature-engineering notebook -- confirm).
priordf = _load_pickle('data/pickle_files/priordf.p')
user_info = _load_pickle('data/pickle_files/user_info.p')
product_info = _load_pickle('data/pickle_files/product_info.p')
user_product_info = _load_pickle('data/pickle_files/user_product_info.p')
ordertime_info = _load_pickle('data/pickle_files/ordertime_info.p')
user_ordertime_info = _load_pickle('data/pickle_files/user_ordertime_info.p')
product_ordertime_info = _load_pickle('data/pickle_files/product_ordertime_info.p')
aisle_info = _load_pickle('data/pickle_files/aisle_info.p')
department_info = _load_pickle('data/pickle_files/department_info.p')

In [11]:
# Left join: keep every row of user_product_info and attach the user_info
# columns that match on user_id.
master_prior_df = user_product_info.merge(user_info, how='left', on='user_id')

In [12]:
# Attach the product_info columns to each row via its product_id.
master_prior_df = master_prior_df.merge(product_info, how='left', on='product_id')

In [13]:
# Attach the aisle_info columns via aisle_id; a column name that already exists
# on the left keeps its name, and the copy coming from aisle_info gets "_y".
master_prior_df = master_prior_df.merge(
    aisle_info,
    how='left',
    on='aisle_id',
    suffixes=('', '_y'),
)

In [14]:
# Remove the "_y"-suffixed department_id column (presumably the duplicate
# brought in by the aisle_info merge -- confirm).
master_prior_df = master_prior_df.drop(columns=['department_id_y'])

In [15]:
# Attach the department_info columns via department_id; clashing column names
# coming from department_info get a "_y" suffix.
master_prior_df = master_prior_df.merge(
    department_info,
    how='left',
    on='department_id',
    suffixes=('', '_y'),
)

In [16]:
# user_id of every order whose eval_set is 'train'.
train_users = orders.loc[orders['eval_set'] == 'train', 'user_id']

In [17]:
# Keep only the feature rows that belong to users with a 'train' order.
train_df = master_prior_df.loc[master_prior_df['user_id'].isin(train_users)]

In [18]:
# Preview the first rows of the training-user feature table.
train_df.head()

Unnamed: 0,user_id,product_id,user_product_order_num,user_product_last_purchase_day,user_product_order_interval_mean,user_product_order_interval_std,user_product_rank,user_product_add_order_mean,user_product_add_order_std,user_product_reorder_ratio,user_product_order_interval_mean_NaN,user_product_order_interval_std_NaN,user_product_add_order_std_NaN,user_order_num,user_order_interval_mean,user_order_interval_std,user_basket_size_mean,user_basket_size_std,user_history,product_user_num,product_order_num,product_order_interval_mean,product_order_interval_std,product_reorder_num,product_reorder_user_num,product_reorder_ratio,product_reorder_user_ratio,product_add_to_cart_order_mean,product_add_to_cart_order_std,product_name,aisle_id,department_id,product_order_interval_mean_NaN,product_order_interval_std_NaN,product_add_to_cart_order_std_NaN,aisle_prod_user_num_sum,aisle_prod_user_num_mean,aisle_prod_user_num_std,aisle_prod_order_num_sum,aisle_prod_order_num_mean,aisle_prod_order_num_std,aisle_prod_order_interval_mean_mean,aisle_prod_order_interval_mean_std,aisle_prod_reorder_num_sum,aisle_prod_reorder_num_mean,aisle_prod_reorder_num_std,aisle_prod_reorder_user_num_sum,aisle_prod_reorder_user_num_mean,aisle_prod_reorder_user_num_std,aisle_prod_add_to_cart_order_mean,aisle_prod_add_to_cart_order_std,aisle_reorder_ratio,aisle_user_reorder_ratio,department_prod_user_num_sum,department_prod_user_num_mean,department_prod_user_num_std,department_prod_order_num_sum,department_prod_order_num_mean,department_prod_order_num_std,department_prod_add_to_cart_order_mean,department_prod_add_to_cart_order_std,department_prod_reorder_num_sum,department_prod_reorder_num_mean,department_prod_reorder_num_std,department_prod_reorder_user_num_sum,department_prod_reorder_user_num_mean,department_prod_reorder_user_num_std
0,1,196,10,176,19.555555,9.395625,1.5,1.4,0.966092,1.0,0,0,0,10,19.555555,9.395625,5.9,1.523884,176,8000,35791,22.623835,26.001011,27791.0,4660.0,0.77648,0.5825,3.721774,4.110813,Soda,77,7,0,0,0,129131,278.900648,736.904549,357537,772.218143,2415.057166,25.996077,14.911694,228406.0,493.317495,1726.669834,55534.0,119.943844,354.541349,7.647766,2.189116,0.638832,0.430059,932237,228.512015,192.511727,2690129,690.856125,753.685494,8.079681,0.705206,1757892.0,462.34411,562.454967,408690.0,103.938473,108.233336
1,1,10258,9,176,20.125,9.876922,3.0,3.333333,1.322876,0.9,0,0,0,10,19.555555,9.395625,5.9,1.523884,176,557,1946,24.268539,26.848476,1389.0,308.0,0.713772,0.552962,4.277493,3.567502,Pistachios,117,19,0,0,0,147368,253.209622,676.108087,306487,526.609966,1478.642046,31.982382,18.95903,159119.0,273.400344,845.712074,50103.0,86.087629,248.092359,9.712533,2.255376,0.51917,0.339986,1229577,184.277748,78.117275,2887550,429.355375,201.13544,9.605942,0.644157,1657973.0,245.077627,125.815728,483357.0,70.903977,35.028622
2,1,10326,1,93,42.677544,25.335363,14.5,5.0,4.219923,0.1,1,1,1,10,19.555555,9.395625,5.9,1.523884,176,1923,5526,26.605606,36.763191,3603.0,1003.0,0.652009,0.521581,4.191097,3.611701,Organic Fuji Apples,24,4,0,0,0,1026719,2687.746073,8357.649783,3642188,9534.52356,40105.378548,24.10531,12.450514,2615469.0,6846.777487,32174.918772,560354.0,1466.895288,5410.202963,8.672981,2.014458,0.718104,0.545772,3318581,2190.172202,683.52471,9479291,6286.046039,2799.902559,8.410426,1.865486,6160710.0,4095.873837,2222.922021,1598097.0,1047.023084,403.342103
3,1,12427,10,176,19.555555,9.395625,1.5,3.3,2.406011,1.0,0,0,0,10,19.555555,9.395625,5.9,1.523884,176,1679,6476,19.953512,25.316103,4797.0,889.0,0.740735,0.529482,4.760037,4.78245,Original Beef Jerky,23,19,0,0,0,66720,211.139241,565.300465,163524,517.481013,1572.546061,31.39245,14.166268,96804.0,306.341772,1018.486136,27658.0,87.525316,265.427758,9.348668,2.149298,0.591986,0.414538,1229577,184.277748,78.117275,2887550,429.355375,201.13544,9.605942,0.644157,1657973.0,245.077627,125.815728,483357.0,70.903977,35.028622
4,1,13032,3,176,80.5,51.618793,5.5,6.333333,1.527525,0.3,0,0,0,10,19.555555,9.395625,5.9,1.523884,176,1286,3751,31.215416,37.182076,2465.0,617.0,0.657158,0.479782,5.622767,5.345184,Cinnamon Toast Crunch,121,14,0,0,0,161764,356.30837,824.174526,377586,831.687225,1973.034402,30.660225,10.067678,215822.0,475.378855,1157.061839,66813.0,147.165198,359.088257,9.421302,2.247966,0.571584,0.413028,311556,252.92822,78.002364,709569,580.795337,177.463013,9.460799,0.09138,398013.0,327.867117,105.55097,122600.0,99.369176,32.916165


In [19]:
# (rows, columns) of train_df before the order and label columns are merged in.
train_df.shape

(8474661, 67)

In [20]:
# Attach the columns of each user's 'train' order (order_id, order_dow, ...)
# to all of that user's rows.
train_df = train_df.merge(
    orders[orders['eval_set'] == 'train'],
    how='left',
    on='user_id',
)

In [21]:
# Bring in add_to_cart_order and reordered for (order_id, product_id) pairs
# present in order_product_train; unmatched pairs get NaN.
train_df = train_df.merge(
    order_product_train,
    how='left',
    on=['order_id', 'product_id'],
)

In [22]:
# Pairs with no match in order_product_train have NaN here; treat them as 0.
# Assign the result back explicitly: `train_df.reordered.fillna(..., inplace=True)`
# operates on an intermediate Series and is not guaranteed to update train_df
# (it warns in pandas 2.x and is a no-op under Copy-on-Write).
train_df['reordered'] = train_df['reordered'].fillna(0)

In [23]:
# From here on the label column is called `y`.
train_df = train_df.rename(columns={'reordered': 'y'})

In [24]:
# Design matrix: everything except identifiers, order bookkeeping columns,
# the product name and the label.
X = train_df.drop(
    columns=[
        'aisle_id',
        'department_id',
        'order_id',
        'eval_set',
        'order_number',
        'add_to_cart_order',
        'user_id',
        'product_id',
        'product_name',
        'y',
    ],
)

In [25]:
# Label vector aligned with X.
y = train_df['y']

In [26]:
def new_f1_score(df, pred_col='XGB_pred_y'):
    """Return the mean per-user F1 score of binary predictions.

    For every user the distinct products are counted as true positives
    (y == 1, prediction == 1), false positives (y == 0, prediction == 1) and
    false negatives (y == 1, prediction == 0), and the user's score is
    ``2*TP / (2*TP + FP + FN)``. A user whose three counts are all zero
    (only true negatives) scores 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``user_id``, ``product_id``, ``y`` and ``pred_col``.
        Rows whose label or prediction is neither 0 nor 1 (e.g. NaN) are
        ignored.
    pred_col : str, default 'XGB_pred_y'
        Name of the column holding the predicted label.

    Returns
    -------
    float
        Mean of the per-user F1 scores; NaN when no row can be scored.
    """
    # De-duplicating on all four columns reproduces "distinct products per
    # user and confusion-matrix cell".
    pairs = (
        df[['user_id', 'product_id', 'y', pred_col]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    actual_pos = pairs['y'] == 1
    actual_neg = pairs['y'] == 0
    pred_pos = pairs[pred_col] == 1
    pred_neg = pairs[pred_col] == 0
    scorable = (actual_pos | actual_neg) & (pred_pos | pred_neg)

    flags = pd.DataFrame({
        'user_id': pairs['user_id'],
        'TP': (actual_pos & pred_pos).astype(int),
        'FP': (actual_neg & pred_pos).astype(int),
        'FN': (actual_pos & pred_neg).astype(int),
    })
    # Users with only true negatives stay in the result with all-zero counts.
    per_user = flags[scorable].groupby('user_id').sum()

    denominator = 2 * per_user['TP'] + per_user['FP'] + per_user['FN']
    f1 = (2 * per_user['TP'] / denominator).where(denominator != 0, 1.0)
    return f1.mean()

In [97]:
def man_cross_val(masterdf, learning_rate=0.2, max_depth=6, n_estimators=100, gamma=0, min_child_weight=1, subsample=1, threshold=0.2):
    """Collect out-of-fold XGBoost probabilities with 5-fold, user-grouped CV.

    Folds are built with GroupKFold on ``user_id``, so all rows of one user
    fall into the same fold.

    Parameters
    ----------
    masterdf : pandas.DataFrame
        Training frame containing the feature columns plus the identifier,
        bookkeeping and label columns dropped below.
    learning_rate, max_depth, n_estimators, gamma, min_child_weight, subsample
        Passed straight through to ``XGBClassifier``.
    threshold : float
        Unused here; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per input row with ``y``, ``user_id``, ``product_id``,
        ``True`` (column 1 of ``predict_proba`` -- presumably P(y == 1),
        confirm against ``classes_``) and ``set`` (the fold number).
    """
    non_feature_cols = ['aisle_id', 'department_id', 'order_id', 'eval_set', 'order_number', 'add_to_cart_order', 'user_id', 'product_id', 'product_name', 'y']
    X = masterdf.drop(columns=non_feature_cols)
    y = masterdf['y']
    splitter = GroupKFold(n_splits=5)

    # Collect the folds in a list and concatenate once instead of growing a
    # DataFrame inside the loop.
    fold_frames = []
    for fold, (train_index, test_index) in enumerate(splitter.split(masterdf, groups=masterdf['user_id'].values)):
        xgb_model = XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, gamma=gamma, min_child_weight=min_child_weight, subsample=subsample, n_jobs=-1)
        xgb_model.fit(X.iloc[train_index], y.iloc[train_index])
        probabilities = xgb_model.predict_proba(X.iloc[test_index])

        # Explicit copy: the new columns must not be written into a view of masterdf.
        fold_df = masterdf.iloc[test_index][['y', 'user_id', 'product_id']].copy()
        fold_df['True'] = probabilities[:, 1]  # 1-D column instead of an (n, 1) slice
        fold_df['set'] = fold
        fold_frames.append(fold_df)
    return pd.concat(fold_frames)

In [82]:
def optimize_threshold(result_df, threshold_list=(0.2,)):
    """Pick the probability threshold with the best mean per-fold F1.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Out-of-fold predictions with the columns ``True`` (probability),
        ``set`` (fold id), ``y``, ``user_id`` and ``product_id``. The column
        ``XGB_pred_y`` is (over)written in place; after the call it holds the
        labels for the LAST threshold tried, not the best one.
    threshold_list : iterable of float
        Candidate thresholds; a row is predicted 1 when its probability is
        strictly greater than the threshold.

    Returns
    -------
    tuple
        ``(best_threshold, best_score)``; ``(0.2, 0)`` if no candidate scores
        above 0.
    """
    threshold_opt = 0.2
    max_score = 0
    # Use the folds actually present rather than assuming exactly five.
    fold_ids = result_df['set'].unique()
    for threshold in threshold_list:
        # A direct comparison also labels probability 0.0 (which pd.cut with an
        # open lower edge turned into NaN) and accepts any threshold value.
        result_df['XGB_pred_y'] = (result_df['True'] > threshold).astype(np.float32)
        fold_scores = [new_f1_score(result_df[result_df['set'] == fold]) for fold in fold_ids]
        score = np.mean(fold_scores)
        if score > max_score:
            max_score = score
            threshold_opt = threshold
    return threshold_opt, max_score

In [122]:
def tune_para_xgb(masterdf, learning_rate_list=(0.2,), max_depth_list=(6,), n_estimators_list=(100,), gamma_list=(0,), min_child_weight_list=(1,), subsample_list=(1,), threshold_list=(0.2,)):
    """Exhaustive grid search over XGBoost settings and the decision threshold.

    Every combination of the given parameter lists is cross-validated with
    ``man_cross_val``; ``optimize_threshold`` then picks the best threshold
    for that combination.

    Parameters
    ----------
    masterdf : pandas.DataFrame
        Training frame passed unchanged to ``man_cross_val``.
    learning_rate_list, max_depth_list, n_estimators_list, gamma_list,
    min_child_weight_list, subsample_list : iterable
        Candidate values for the corresponding ``XGBClassifier`` arguments.
    threshold_list : iterable of float
        Candidate probability thresholds.

    Returns
    -------
    tuple
        ``(max_score, parameters, parameter_df)``: the best score, the list
        ``[lr, max_depth, n_estimators, gamma, min_child_weight, subsample,
        threshold]`` that achieved it (the defaults if nothing scored above 0),
        and a table with one row per combination tried.
    """
    max_score = 0
    parameters = [0.2, 6, 100, 0, 1, 1, 0.2]
    # Column names (including their historical spelling) are kept as they are
    # so previously saved result tables stay compatible.
    cols = ['learning_rate', 'max_depth', 'n_estimators', 'gamma', 'min_child_weight_list', 'subsapmle', 'opt_threshold', 'score']

    rows = []
    # itertools.product iterates in the same order as the equivalent nested
    # loops (leftmost argument varies slowest).
    grid = itertools.product(learning_rate_list, max_depth_list, n_estimators_list, gamma_list, min_child_weight_list, subsample_list)
    for lr, md, n, g, w, s in grid:
        print(lr, md, n, g, w, s)
        cv_predictions = man_cross_val(masterdf, learning_rate=lr, max_depth=md, n_estimators=n, gamma=g, min_child_weight=w, subsample=s)
        threshold_opt, score = optimize_threshold(cv_predictions, threshold_list=threshold_list)
        rows.append([lr, md, n, g, w, s, threshold_opt, score])
        print(score, threshold_opt)
        if score > max_score:
            max_score = score
            parameters = [lr, md, n, g, w, s, threshold_opt]

    # Build the table once; DataFrame.append was removed in pandas 2.0.
    parameter_df = pd.DataFrame(rows, columns=cols)
    return max_score, parameters, parameter_df

In [84]:
# Alternative, smaller search grid (disabled; the active grid is defined in the next cell):
# learning_rate_list = np.arange(0.05,0.3,0.05)
# max_depth_list = [3]
# n_estimators_list = np.arange(20,100,20)
# gamma_list = [0, 0.05]
# min_child_weight_list = [1]
# threshold_list = np.arange(0.1,0.4,0.001)
# subsample_list = [0.8]

In [102]:
# Search space for the XGBoost grid search.

# Tree-shape settings
max_depth_list = [3, 6, 9]
min_child_weight_list = [1, 3]
gamma_list = [0]

# Boosting settings
learning_rate_list = np.arange(0.1, 0.3, 0.05)
n_estimators_list = np.arange(20, 100, 20)
subsample_list = [0.8, 1]

# Candidate probability thresholds, in steps of 0.001 starting at 0.1
threshold_list = np.arange(0.1, 0.4, 0.001)

In [116]:
import copy
# Independent (deep) copy of the first 1000 rows of train_df, used as the
# sample for the parameter search.
masterdf = train_df.iloc[:1000].copy(deep=True)

In [123]:
# Run the grid search on the sample frame with the grids defined above.
parameter_optimize_result = tune_para_xgb(
    masterdf,
    learning_rate_list=learning_rate_list,
    max_depth_list=max_depth_list,
    n_estimators_list=n_estimators_list,
    gamma_list=gamma_list,
    min_child_weight_list=min_child_weight_list,
    subsample_list=subsample_list,
    threshold_list=threshold_list,
)

0.1 6 60 0 1 1
0.277056957378 0.105


In [124]:
# Third element of the tuple returned by tune_para_xgb: the table with one row
# per parameter combination that was tried.
parameter_optimize_result[2]

Unnamed: 0,learning_rate,max_depth,n_estimators,gamma,min_child_weight_list,subsapmle,opt_threshold,score
0,0.1,6,60,0,1,1,0.105,0.277057


In [None]:
# Persist the search result. The `with` block closes (and thereby flushes) the
# file; the previous bare `open(...)` left the handle open.
with open('parameter_optimize_result.p', 'wb') as result_file:
    pickle.dump(parameter_optimize_result, result_file)

## Prediction

In [None]:
# Second element of the search result is the best parameter list, in the order
# learning rate, max depth, n_estimators, gamma, min_child_weight, subsample,
# threshold.
(lr, md, n, g, w, s, threshold) = parameter_optimize_result[1][:7]

In [None]:
# Echo the selected hyper-parameters and decision threshold.
print(lr, md, n, g, w, s, threshold)

In [None]:
# Final classifier configured with the parameters selected by the grid search.
xgb_optimized_model = XGBClassifier(
    learning_rate=lr,
    max_depth=md,
    n_estimators=n,
    gamma=g,
    min_child_weight=w,
    subsample=s,
    n_jobs=-1,
)

In [None]:
# Fit on the full training design matrix X and labels y; the object returned
# by fit() is what the prediction cells below call predict_proba on.
xgb_optimized_fit = xgb_optimized_model.fit(X,y)

In [None]:
# user_id of every order whose eval_set is 'test'.
test_users = orders.loc[orders['eval_set'] == 'test', 'user_id']

In [None]:
# Keep only the feature rows that belong to users with a 'test' order.
test_df = master_prior_df.loc[master_prior_df['user_id'].isin(test_users)]

In [None]:
# (rows, columns) of the test-user feature table.
test_df.shape

In [None]:
# Attach the columns of each user's 'test' order to all of that user's rows.
test_df = test_df.merge(
    orders[orders['eval_set'] == 'test'],
    how='left',
    on='user_id',
)

In [None]:
# Same merge as for the training frame, so test_df ends up with the same
# column layout (add_to_cart_order, reordered). NOTE(review): test orders are
# presumably absent from order_product_train, which would make both added
# columns all-NaN -- confirm.
test_df = test_df.merge(
    order_product_train,
    how='left',
    on=['order_id', 'product_id'],
)

In [None]:
# The label column is not needed on the test frame.
test_df = test_df.drop(columns=['reordered'])

In [None]:
# Test design matrix: drop the same identifier/bookkeeping columns that were
# dropped to build X (there is no label column to drop here).
X_test = test_df.drop(
    columns=[
        'aisle_id',
        'department_id',
        'order_id',
        'eval_set',
        'order_number',
        'add_to_cart_order',
        'user_id',
        'product_id',
        'product_name',
    ],
)

In [None]:
# Store column 1 of predict_proba as the predicted probability. The slice
# [:, 1:] keeps it as an (n, 1) array. Presumably this is P(reordered == 1);
# confirm against the model's classes_.
test_df['True'] = xgb_optimized_fit.predict_proba(X_test)[:,1:]

In [None]:
# Probabilities up to `threshold` map to label 0, those above it to label 1.
bins, group_names = [0, threshold, 1], [0, 1]

In [None]:
# Turn probabilities into 0/1 labels using the tuned threshold.
# include_lowest=True closes the first interval on the left, so a probability
# of exactly 0 is labelled 0 instead of falling outside the bins and becoming NaN.
test_df['XGB_pred_y'] = pd.cut(test_df['True'], bins, labels=group_names, include_lowest=True)

In [None]:
# Number of distinct order_id values present in test_df.
test_df.order_id.nunique()

In [None]:
# Preview the test frame including the probability and predicted-label columns.
test_df.head()

In [None]:
# (order_id, product_id) pairs that were predicted as 1, re-indexed from 0.
result = test_df.loc[
    test_df['XGB_pred_y'] == 1,
    ['order_id', 'product_id'],
].reset_index(drop=True)

In [None]:
# Collapse to one row per order_id, gathering that order's predicted
# product_ids into a tuple; reset_index turns order_id back into a column.
submission = pd.DataFrame(result.groupby('order_id').aggregate(lambda x: tuple(x)), columns = ['product_id']).reset_index()

In [None]:
# Render each tuple of product ids as one space-separated string.
submission['products'] = submission['product_id'].apply(
    lambda product_ids: ' '.join(map(str, product_ids))
)

In [None]:
# The tuple column is no longer needed once `products` exists.
submission = submission.drop(columns=['product_id'])

In [None]:
# Preview the per-order product strings.
submission.head()

In [None]:
# Start from every 'test' order so that orders without any predicted product
# are still present (their `products` value is NaN after the left join).
new_submission = orders[orders['eval_set'] == 'test'].merge(
    submission,
    how='left',
    on='order_id',
)

In [None]:
# Keep only the two columns that go into the submission file.
new_submission = new_submission.loc[:, ['order_id', 'products']]

In [None]:
# Order the rows by order_id and renumber the index from 0.
new_submission = new_submission.sort_values(by='order_id')
new_submission = new_submission.reset_index(drop=True)

In [None]:
# Write order_id and products to submission.csv without the index column.
# NOTE(review): orders with no predicted product carry NaN in `products`
# (left merge above), which to_csv writes as an empty field -- confirm this is
# what the consumer of the file expects.
new_submission.to_csv('submission.csv', index=False)

In [None]:
# (rows, columns) of the submission table that was written.
new_submission.shape