In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import eli5

In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
# #Merged User info, activity log and training and testing data on user-merchant pairs
# df = pd.read_csv("/Users/Vishal/Project_1/final.csv")

# #Replace null values in columns with corresponding values

# df['age_range'] = df['age_range'].fillna(0)
# df['label'] = df['label'].fillna(2)
# df['gender'] = df['gender'].fillna(2)
# df['brand_id'] = df['brand_id'].fillna(0)

# # Convert time stamp to Year-Month-Date format
# df['time_stamp'] = pd.to_datetime(df['time_stamp'], format='%m%d', errors='coerce')

In [4]:
#Training data from format 1
df_train = pd.read_csv("/Users/Vishal/Project_1/data_format1/train_format1.csv")

# LOAD FEATURES

In [55]:
df_combined = pd.read_csv("/Users/Vishal/Project_1/df_combined.csv")

In [56]:
#Sort
df_combined = df_combined.sort_values(['user_id','seller_id'])
df_train = df_train.sort_values(['user_id','seller_id'])

final_feat = (df_combined.drop(['user_id', 'seller_id', 'label','age_gender', 'user_merchant_pair'], axis=1))

final_label = df_train['label']

# MODEL

# 5-FOLD STRATIFIED SPLIT 

In [57]:
def model_xgboost(x, y):
    
    model = xgb.XGBClassifier(max_depth = 6)
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2)
    
    AUC_valid = []
    AUC_train = []
    
    for train_index, valid_index in skf.split(x, y):
    
        #print("TRAIN:", train_index, "VALID:", valid_index)
    
        X_train, X_valid = x.iloc[train_index], x.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    
        model.fit(X_train, y_train, eval_metric='auc')
        
        auc_valid = roc_auc_score(y_valid, model.predict_proba(X_valid)[:,1])
        #print(auc_valid)
        AUC_valid.append(auc_valid)
    
        auc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
        #print(auc_train)
        AUC_train.append(auc_train)
        
        print('Iteration Complete!!')
        
    def mean(z):
        return sum(z)/len(z)
    
    print("Mean auc for validation = ", mean(AUC_valid), "\t Mean auc for valid baseline = 0.6498")
    print("Mean auc for train = ", mean(AUC_train), "\t Mean auc for train baseline = 0.6629")
        
    feat_imp_abs = model.feature_importances_
    feat_imp_abs = pd.Series(data=feat_imp_abs,dtype='float')
    feat_imp_norm = feat_imp_abs.div(feat_imp_abs.max())*100    
        
    feature_importance = list(final_feat)
    feature_importance = pd.DataFrame(feature_importance)
    feature_importance['importance_abs'] = feat_imp_abs
    feature_importance['importance_norm'] = feat_imp_norm
    feature_importance = feature_importance.rename(columns={0: "feature"})
    
    print(feature_importance.sort_values('importance_norm'))

In [58]:
# xgb_model = xgb.XGBClassifier(max_depth=3)

In [59]:
# # Max depth 3
# xgb_model.fit(X_train, y_train)

In [60]:
# roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1])

In [61]:
# # Max depth 4
# xgb_model = xgb.XGBClassifier(max_depth=4)

# xgb_model.fit(X_train, y_train)


In [62]:
# roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1])

In [63]:
# # Max depth 5
# xgb_model = xgb.XGBClassifier(max_depth=5)

# xgb_model.fit(X_train, y_train)


In [64]:
# roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1])

In [65]:
# # Max depth 6
# xgb_model = xgb.XGBClassifier(max_depth=6)

# xgb_model.fit(X_train, y_train)


In [66]:
# roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1])

# MODEL RESULTS

In [67]:
model_xgboost(final_feat,final_label)

Iteration Complete!!
Iteration Complete!!
Iteration Complete!!
Iteration Complete!!
Iteration Complete!!
Mean auc for validation =  0.6784846625584459 	 Mean auc for valid baseline = 0.6498
Mean auc for train =  0.7938491879476313 	 Mean auc for train baseline = 0.6629
                                            feature  importance_abs  importance_norm
137                               userselleronsale1        0.000000         0.000000
138                               userselleronsale2        0.000000         0.000000
134                           usersellerbeforesale1        0.000000         0.000000
79                         unique_users_gender_norm        0.000000         0.000000
151                         seller_count_of_label-1        0.000000         0.000000
78                              gender_avg_purchase        0.000000         0.000000
164         ratio_item_int_union_user_before_seller        0.000000         0.000000
152          brand_intersection_user_before_on_sal

In [None]:
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import roc_curve
# import seaborn as sns
# import matplotlib.pyplot as plt

# def roc_auc(X_test,y_test,y_pred,model):
# #    confusion_mat = confusion_matrix(y_test, y_pred)
# #    print('Confusion_Matrix: ',confusion_mat,sep='\n')
# #    print()
# #    print('Accuracy of classifier on test set: {:.2f}'.format(model.score(X_test, y_test)))
# #    print()
   
# #    # Classification Report
# #    print('Classification Report: ')
# #    print(classification_report(y_test, y_pred))
# #    y_score = model.predict_proba(X_test)[:,1]              # Predicted probability score
# #    fpr, tpr, thresholds = roc_curve(y_test, y_score)
# #    plt.figure()
   
#    # AuC Score & plotting of AuC curve
#    plt.plot(fpr, tpr, label='AuC score (area = %0.2f)' % roc_auc_score(y_test, y_score))
#    plt.plot([0, 1], [0, 1],'r--')
#    plt.xlim([0.0, 1.0])
#    plt.ylim([0.0, 1.05])
#    plt.xlabel('False Positive Rate')
#    plt.ylabel('True Positive Rate')
#    plt.title('Receiver operating characteristic')
#    plt.legend(loc="lower right")

In [None]:
# roc_auc(xval1,yval1,y_pred,regressor)

# LightGBM

In [18]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [19]:
X_train, X_test, y_train, y_test = train_test_split(final_feat, final_label, stratify=final_label, test_size=.2, random_state=42)


In [20]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [21]:
params = {'max_depth' : 10,
    'num_leaves': 5,
    'metric': ['l1', 'l2'],
    'verbose': -1
}

In [22]:
# Training

In [23]:
evals_result = {}  # to record eval results for plotting
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                evals_result=evals_result,
                verbose_eval=10)

[10]	training's l1: 0.113915	training's l2: 0.0567144	valid_1's l1: 0.113904	valid_1's l2: 0.0567737
[20]	training's l1: 0.113414	training's l2: 0.0564414	valid_1's l1: 0.113399	valid_1's l2: 0.0565236
[30]	training's l1: 0.113078	training's l2: 0.056262	valid_1's l1: 0.11309	valid_1's l2: 0.0563785
[40]	training's l1: 0.11284	training's l2: 0.0561205	valid_1's l1: 0.112849	valid_1's l2: 0.0562719
[50]	training's l1: 0.112619	training's l2: 0.0559909	valid_1's l1: 0.112623	valid_1's l2: 0.0561551
[60]	training's l1: 0.112427	training's l2: 0.0558787	valid_1's l1: 0.112424	valid_1's l2: 0.0560675
[70]	training's l1: 0.112265	training's l2: 0.0557857	valid_1's l1: 0.112274	valid_1's l2: 0.0559988
[80]	training's l1: 0.112124	training's l2: 0.0557013	valid_1's l1: 0.112161	valid_1's l2: 0.0559422
[90]	training's l1: 0.112011	training's l2: 0.0556372	valid_1's l1: 0.112073	valid_1's l2: 0.0559084
[100]	training's l1: 0.111906	training's l2: 0.0555748	valid_1's l1: 0.111998	valid_1's l2: 0.

In [24]:
y_pred = gbm.predict(X_test)

In [25]:
print(roc_auc_score(y_test, gbm.predict(X_test)))

0.6623238745039624


In [26]:
def render_metric(metric_name):
    ax = lgb.plot_metric(evals_result, metric=metric_name, figsize=(10, 5))
    plt.show()

In [27]:
import matplotlib.pyplot as plt

%matplotlib inline

try:
    # To enable interactive mode you should install ipywidgets
    # https://github.com/jupyter-widgets/ipywidgets
    from ipywidgets import interact, SelectMultiple
    INTERACTIVE = True
except ImportError:
    INTERACTIVE = False

In [28]:
if INTERACTIVE:
    # create widget to switch between metrics
    interact(render_metric, metric_name=params['metric'])
else:
    render_metric(params['metric'][0])

interactive(children=(Dropdown(description='metric_name', options=('l1', 'l2'), value='l1'), Output()), _dom_c…

In [29]:
def render_plot_importance(importance_type, max_features=10,
                           ignore_zero=True, precision=3):
    ax = lgb.plot_importance(gbm, importance_type=importance_type,
                             max_num_features=max_features,
                             ignore_zero=ignore_zero, figsize=(12, 8),
                             precision=precision)
    plt.show()

In [30]:

if INTERACTIVE:
    # create widget for interactive feature importance plot
    interact(render_plot_importance,
             importance_type=['split', 'gain'],
             max_features=(1, X_train.shape[-1]),
             precision=(0, 10))
else:
    render_plot_importance(importance_type='split')

interactive(children=(Dropdown(description='importance_type', options=('split', 'gain'), value='split'), IntSl…

# Model without seller features

In [31]:
# Without Seller monthly features

df_combined.iloc[:,pd.np.r_[0:93, 121:169]]

Unnamed: 0,user_id,seller_id,label,age_range,gender,userbeforesale0,userbeforesale1,userbeforesale2,userbeforesale3,useronsale0,useronsale1,useronsale2,useronsale3,user_ratio_0_to_2_before_sale,user_ratio_3_to_2_before_sale,user_ratio_0_to_2_on_sale,user_ratio_3_to_2_on_sale,active_days,purchase_days,useractivity5,useractivity6,useractivity7,useractivity8,useractivity9,useractivity10,useractivity11,userpurchase5,userpurchase6,userpurchase7,userpurchase8,userpurchase9,userpurchase10,userpurchase11,useravgactivity5,useravgactivity6,useravgactivity7,useravgactivity8,useravgactivity9,useravgactivity10,useravgactivity11,useravgpurchase5,useravgpurchase6,useravgpurchase7,useravgpurchase8,useravgpurchase9,useravgpurchase10,useravgpurchase11,user_unique_brands_before_sale,user_unique_categories_before_sale,user_unique_items_before_sale,user_unique_sellers_before_sale,user_unique_brands_purchased_before_sale,user_unique_categories_purchased_before_sale,user_unique_items_purchased_before_sale,user_unique_sellers_purchased_before_sale,user_unique_brands_on_sale,user_unique_categories_on_sale,user_unique_items_on_sale,user_unique_sellers_on_sale,user_unique_brands_purchased_on_sale,user_unique_categories_purchased_on_sale,user_unique_items_purchased_on_sale,user_unique_sellers_purchased_on_sale,user_all_month_purchase,user_purchase_only_11,sellerbeforesale0,sellerbeforesale1,sellerbeforesale2,sellerbeforesale3,selleronsale0,selleronsale1,selleronsale2,selleronsale3,userseller0,userseller1,userseller2,userseller3,age_avg_activity,age_avg_purchase,unique_users_age_norm,gender_avg_activity,gender_avg_purchase,unique_users_gender_norm,age_gender,age_gender_avg_activity,age_gender_avg_purchase,unique_users_age_gender_norm,age_gender_active_days_norm,age_gender_purchase_days_norm,seller_ratio_0_to_2_before_sale,seller_ratio_3_to_2_before_sale,seller_ratio_0_to_2_on_sale,seller_ratio_3_to_2_on_sale,seller_unique_brands_before_sale,seller_unique_categories_before_sale,seller_unique_items_before_sale,seller_unique_users_before_sale,seller_unique_brands_purchased_before_sale,seller_unique_categories_purchased_before_sale,seller_unique_items_purchased_before_sale,seller_unique_users_purchased_before_sale,seller_unique_brands_on_sale,seller_unique_categories_on_sale,seller_unique_items_on_sale,seller_unique_users_on_sale,seller_unique_brands_purchased_on_sale,seller_unique_categories_purchased_on_sale,seller_unique_items_purchased_on_sale,seller_unique_users_purchased_on_sale,user_merchant_pair,usersellerbeforesale0,usersellerbeforesale1,usersellerbeforesale3,userselleronsale0,userselleronsale1,userselleronsale2,userselleronsale3,um_unique_brands_on_sale,um_unique_categories_on_sale,um_unique_items_on_sale,um_unique_brands_purchased_on_sale,um_unique_categories_purchased_on_sale,um_unique_items_purchased_on_sale,um_activity_before_sale,activity_diff_from_1111,activity_diff_from_second_last,purchase_diff_from_1111,purchase_diff_from_second_last,seller_count_of_label-1,brand_intersection_user_before_on_sale,brand_intersection_user_before_and_seller,ratio_brand_int_union_user_before_on_sale,ratio_brand_int_union_user_before_seller,cat_intersection_user_before_on_sale,cat_intersection_user_before_and_seller,ratio_cat_int_union_user_before_on_sale,ratio_cat_int_union_user_before_seller,item_intersection_user_before_on_sale,item_intersection_user_before_and_seller,ratio_item_int_union_user_before_on_sale,ratio_item_int_union_user_before_seller
130649,1,1019,1,3.0,1.0,14.0,0.0,2.0,0.0,13.0,0.0,4.0,0.0,7.000000,0.00,3.250000,0.0,4,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.484848,0.515152,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,0.666667,0.0,0.000000,0.000000,0.000000,0.000000,4.000000,17.000000,0.0,0.0,0.000000,0.0,0.0,1.0,4.0,5,3,8,5,2.0,2.0,2.0,2.0,4,3,4,4,1,1,1,1,0.0,0.000000,4836.0,4.0,707.0,447.0,1167.0,1.0,234.0,37.0,10.0,0.0,4.0,0.0,132.987954,7.753050,1.000000,99.741350,6.668382,0.425959,3.0_1.0,102.861818,6.632899,0.445214,0.316205,0.363071,6.840170,0.632249,4.987179,0.158120,2,8,143,2466,1.0,8.0,51.0,387.0,2,4,62,534,1,4,24,134,1_1019,0.0,0.0,0.0,10.0,0.0,4.0,0.0,1,1,1,1,1,1,0.0,0.179348,0.065574,0.168478,0.054645,387.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
131623,4,1186,0,0.0,0.0,42.0,0.0,0.0,0.0,7.0,0.0,1.0,0.0,0.000000,0.00,7.000000,0.0,9,0,0.240000,0.000000,0.240000,0.000000,0.140000,0.060000,0.320000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,12.0,0.000000,4.000000,0.000000,3.500000,3.000000,5.333333,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,10,11,26,10,0.0,0.0,0.0,0.0,3,3,3,3,1,1,1,1,0.0,0.023810,1122.0,5.0,53.0,53.0,343.0,1.0,75.0,2.0,4.0,0.0,1.0,0.0,105.742450,6.229536,0.852016,141.133806,8.156852,1.000000,0.0_0.0,117.407391,6.637107,0.895509,0.725957,0.730747,21.169811,1.000000,4.573333,0.026667,2,7,43,367,1.0,5.0,12.0,39.0,2,7,32,152,1,5,15,65,4_1186,0.0,0.0,0.0,4.0,0.0,1.0,0.0,1,1,1,1,1,1,0.0,0.913043,0.912568,0.000000,0.000000,39.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
45535,6,1356,0,4.0,0.0,211.0,0.0,15.0,15.0,6.0,0.0,2.0,0.0,14.066667,1.00,3.000000,0.0,55,11,0.024096,0.084337,0.184739,0.052209,0.369478,0.132530,0.152610,0.000000,0.176471,0.235294,0.000000,0.352941,0.058824,0.176471,3.0,5.250000,5.111111,1.444444,8.363636,2.357143,5.428571,0.0,1.5,1.333333,0.0,1.5,1.0,1.5,77,63,165,78,11.0,11.0,11.0,10.0,4,4,5,5,2,2,2,2,0.0,0.000000,10514.0,20.0,1002.0,710.0,3293.0,4.0,867.0,62.0,2.0,0.0,1.0,1.0,147.542249,9.620345,0.716419,141.133806,8.156852,1.000000,4.0_0.0,154.453702,9.947552,0.697396,0.743744,0.852931,10.493014,0.708583,3.798155,0.071511,3,16,168,3257,2.0,13.0,98.0,727.0,2,11,60,1218,2,7,35,820,6_1356,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,1,1,1,1,1,2.0,0.989130,0.983607,0.847826,0.836066,727.0,0.0,0.0,0.000000,0.00,0.0,1.0,0.0,0.043478,0.0,0.0,0.0,0.0
45534,6,4249,0,4.0,0.0,211.0,0.0,15.0,15.0,6.0,0.0,2.0,0.0,14.066667,1.00,3.000000,0.0,55,11,0.024096,0.084337,0.184739,0.052209,0.369478,0.132530,0.152610,0.000000,0.176471,0.235294,0.000000,0.352941,0.058824,0.176471,3.0,5.250000,5.111111,1.444444,8.363636,2.357143,5.428571,0.0,1.5,1.333333,0.0,1.5,1.0,1.5,77,63,165,78,11.0,11.0,11.0,10.0,4,4,5,5,2,2,2,2,0.0,0.000000,9343.0,17.0,368.0,878.0,1246.0,1.0,232.0,21.0,5.0,0.0,1.0,1.0,147.542249,9.620345,0.716419,141.133806,8.156852,1.000000,4.0_0.0,154.453702,9.947552,0.697396,0.743744,0.852931,25.388587,2.385870,5.370690,0.090517,2,2,152,3527,1.0,2.0,55.0,278.0,2,2,93,513,1,1,37,217,6_4249,4.0,0.0,1.0,1.0,0.0,1.0,0.0,1,1,1,1,1,1,5.0,0.989130,0.983607,0.847826,0.836066,278.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
132612,7,1162,0,4.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,7.0,0.0,6.000000,0.00,0.000000,0.0,5,1,0.000000,0.285714,0.142857,0.000000,0.000000,0.000000,0.571429,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,1.333333,2.000000,0.000000,0.000000,0.000000,4.000000,0.0,0.0,0.000000,0.0,0.0,0.0,4.0,4,4,6,4,1.0,1.0,1.0,1.0,3,5,7,3,3,5,7,3,0.0,0.190476,12943.0,7.0,485.0,741.0,2811.0,3.0,510.0,33.0,0.0,0.0,4.0,0.0,147.542249,9.620345,0.716419,141.133806,8.156852,1.000000,4.0_0.0,154.453702,9.947552,0.697396,0.743744,0.852931,26.686598,1.527835,5.511765,0.064706,2,31,296,3163,1.0,22.0,107.0,246.0,2,22,230,616,1,16,134,241,7_1162,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1,2,4,1,2,4,0.0,0.842391,0.841530,0.005435,0.000000,246.0,1.0,0.0,0.333333,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120318,424162,1679,0,4.0,0.0,208.0,0.0,6.0,6.0,37.0,0.0,1.0,0.0,34.666667,1.00,37.000000,0.0,23,4,0.023256,0.027132,0.003876,0.031008,0.104651,0.108527,0.701550,0.285714,0.000000,0.000000,0.142857,0.428571,0.000000,0.142857,3.0,7.000000,1.000000,2.666667,5.400000,4.000000,36.200000,1.0,0.0,0.000000,1.0,3.0,0.0,1.0,62,48,150,63,5.0,6.0,6.0,5.0,16,14,30,17,1,1,1,1,0.0,0.000000,40271.0,90.0,1130.0,3432.0,19347.0,28.0,2914.0,153.0,20.0,0.0,1.0,0.0,147.542249,9.620345,0.716419,141.133806,8.156852,1.000000,4.0_0.0,154.453702,9.947552,0.697396,0.743744,0.852931,35.638053,3.037168,6.639327,0.052505,2,8,244,5778,1.0,7.0,154.0,613.0,2,7,127,3116,1,6,84,1893,424162_1679,19.0,0.0,0.0,1.0,0.0,1.0,0.0,1,1,1,1,1,1,19.0,0.978261,0.978142,0.978261,0.732240,613.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
207581,424163,3826,0,2.0,0.0,130.0,0.0,4.0,2.0,2.0,0.0,2.0,0.0,32.500000,0.50,1.000000,0.0,21,2,0.035714,0.164286,0.035714,0.028571,0.000000,0.185714,0.550000,0.500000,0.166667,0.000000,0.000000,0.000000,0.000000,0.333333,2.5,3.833333,5.000000,1.000000,0.000000,8.666667,12.833333,3.0,1.0,0.000000,0.0,0.0,0.0,2.0,33,25,104,34,2.0,3.0,4.0,2.0,3,4,4,3,2,2,2,2,0.0,0.000000,10521.0,60.0,867.0,848.0,12072.0,35.0,2590.0,226.0,3.0,0.0,1.0,0.0,101.852055,5.584574,0.473525,141.133806,8.156852,1.000000,2.0_0.0,116.004126,5.871127,0.451581,0.361705,0.325968,12.134948,0.978085,4.661004,0.087259,3,7,41,4012,2.0,6.0,25.0,655.0,3,6,29,3613,2,5,16,2362,424163_3826,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1,1,1,1,3.0,0.978261,0.978142,0.978261,0.191257,655.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
34156,424164,606,0,7.0,0.0,102.0,7.0,0.0,4.0,19.0,0.0,3.0,0.0,0.000000,0.00,6.333333,0.0,22,0,0.007407,0.125926,0.066667,0.111111,0.140741,0.140741,0.407407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.0,3.400000,4.500000,5.000000,4.750000,4.750000,13.750000,0.0,0.0,0.000000,0.0,0.0,0.0,3.0,35,17,94,36,0.0,0.0,0.0,0.0,6,7,15,6,3,2,3,3,0.0,0.071429,222822.0,269.0,3003.0,16727.0,37025.0,47.0,3225.0,650.0,17.0,0.0,1.0,1.0,150.495566,7.713387,0.062622,141.133806,8.156852,1.000000,7.0_0.0,169.742670,8.183084,0.063545,0.074476,0.063931,74.199800,5.570097,11.480620,0.201550,3,33,2169,33125,1.0,26.0,728.0,1987.0,3,24,579,8102,1,22,380,2510,424164_606,9.0,0.0,1.0,8.0,0.0,1.0,0.0,1,5,6,1,1,1,10.0,0.989130,0.989071,0.000000,0.000000,1987.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
35121,424167,1200,0,3.0,1.0,20.0,0.0,4.0,1.0,8.0,0.0,2.0,0.0,5.000000,0.25,4.000000,0.0,7,4,0.200000,0.000000,0.000000,0.000000,0.057143,0.257143,0.485714,0.166667,0.000000,0.000000,0.000000,0.000000,0.333333,0.500000,7.0,0.000000,0.000000,0.000000,2.000000,3.000000,5.666667,1.0,0.0,0.000000,0.0,0.0,1.0,1.5,8,9,11,9,4.0,4.0,4.0,4.0,2,3,5,4,1,1,1,1,0.0,0.000000,26932.0,34.0,9676.0,884.0,5765.0,14.0,1923.0,44.0,1.0,0.0,2.0,0.0,132.987954,7.753050,1.000000,99.741350,6.668382,0.425959,3.0_1.0,102.861818,6.632899,0.445214,0.316205,0.363071,2.783382,0.091360,2.997920,0.022881,2,17,169,9425,1.0,10.0,48.0,3628.0,2,16,113,2248,1,12,73,1061,424167_1200,0.0,0.0,0.0,1.0,0.0,2.0,0.0,1,1,1,1,1,1,0.0,0.902174,0.890710,0.902174,0.890710,3628.0,1.0,1.0,0.250000,0.25,0.0,1.0,0.0,0.125000,0.0,0.0,0.0,0.0


In [32]:
#Sort
df_combined = df_combined.sort_values(['user_id','seller_id'])
df_train = df_train.sort_values(['user_id','seller_id'])

final_feat = (df_combined.iloc[:,pd.np.r_[0:93, 121:169]]\
              .drop(['user_id', 'seller_id', 'label','age_gender', 'user_merchant_pair'], axis=1))

final_label = df_train['label']

In [33]:
model_xgboost(final_feat,final_label)

Iteration Complete!!
Iteration Complete!!
Iteration Complete!!
Iteration Complete!!
Iteration Complete!!
Mean auc for validation =  0.6747761287680629 	 Mean auc for valid baseline = 0.6498
Mean auc for train =  0.7782986218836969 	 Mean auc for train baseline = 0.6629
                                            feature  importance_abs  importance_norm
135         ratio_item_int_union_user_before_seller        0.000000         0.000000
79                         unique_users_gender_norm        0.000000         0.000000
106                           usersellerbeforesale1        0.000000         0.000000
109                               userselleronsale1        0.000000         0.000000
110                               userselleronsale2        0.000000         0.000000
123                         seller_count_of_label-1        0.000000         0.000000
125       brand_intersection_user_before_and_seller        0.000000         0.000000
128            cat_intersection_user_before_on_sal

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import seaborn as sns
import matplotlib.pyplot as plt

def roc_auc(X_test,y_test,y_pred,model):
#    confusion_mat = confusion_matrix(y_test, y_pred)
#    print('Confusion_Matrix: ',confusion_mat,sep='\n')
#    print()
#    print('Accuracy of classifier on test set: {:.2f}'.format(model.score(X_test, y_test)))
#    print()
   
#    # Classification Report
#    print('Classification Report: ')
#    print(classification_report(y_test, y_pred))
#    y_score = model.predict_proba(X_test)[:,1]              # Predicted probability score
#    fpr, tpr, thresholds = roc_curve(y_test, y_score)
#    plt.figure()
   
   # AuC Score & plotting of AuC curve
   plt.plot(fpr, tpr, label='AuC score (area = %0.2f)' % roc_auc_score(y_test, y_score))
   plt.plot([0, 1], [0, 1],'r--')
   plt.xlim([0.0, 1.0])
   plt.ylim([0.0, 1.05])
   plt.xlabel('False Positive Rate')
   plt.ylabel('True Positive Rate')
   plt.title('Receiver operating characteristic')
   plt.legend(loc="lower right")

# FOR PREDICTION VALUES

In [34]:
STOP

NameError: name 'STOP' is not defined

In [35]:
def model_xgboost_pred(x, y):
    
    model = xgb.XGBClassifier()
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2)
    
    AUC_valid = []
    AUC_train = []
    
    for train_index, valid_index in skf.split(x, y):
    
        #print("TRAIN:", train_index, "VALID:", valid_index)
    
        X_train1, X_valid1 = x.iloc[train_index], x.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        X_train = X_train1.drop(['user_id', 'seller_id', 'label','age_gender', 'user_merchant_pair'], axis=1)
        X_valid = X_valid1.drop(['user_id', 'seller_id', 'label','age_gender', 'user_merchant_pair'], axis=1)
    
        model.fit(X_train, y_train, eval_metric='auc')
        
        auc_valid = roc_auc_score(y_valid, model.predict_proba(X_valid)[:,1])
        #print(auc_valid)
        AUC_valid.append(auc_valid)
        Å
        X_train1['prediction'] = model.predict_proba(X_train)[:,1]
        X_valid1['prediction'] = model.predict_proba(X_valid)[:,1]
        
        auc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
        #print(auc_train)
        AUC_train.append(auc_train)
        
        break
        
    def mean(z):
        return sum(z)/len(z)
    
    print("Mean auc for validation = ", mean(AUC_valid), "\t Mean auc for valid baseline = 0.6498")
    print("Mean auc for train = ", mean(AUC_train), "\t Mean auc for train baseline = 0.6629")
        
    feat_imp_abs = model.feature_importances_
    feat_imp_abs = pd.Series(data=feat_imp_abs,dtype='float')
    feat_imp_norm = feat_imp_abs.div(feat_imp_abs.max())*100    
        
    feature_importance = list(final_feat)
    feature_importance = pd.DataFrame(feature_importance)
    feature_importance['importance_abs'] = feat_imp_abs
    feature_importance['importance_norm'] = feat_imp_norm
    feature_importance = feature_importance.rename(columns={0: "feature"})
    
    print(feature_importance.sort_values('importance_norm'))
    
    return X_train1, X_valid1

In [None]:
#Sort
df_combined = df_combined.sort_values(['user_id','seller_id'])
df_train = df_train.sort_values(['user_id','seller_id'])

final_feat = (df_combined.drop(['user_id', 'seller_id', 'label','age_gender', 'user_merchant_pair'], axis=1))

final_label = df_train['label']

# To add prediction to features df

In [None]:
# X_train1,X_valid1 = model_xgboost(df_combined, final_label)

# prediction = pd.concat([X_train1,X_valid1],ignore_index = True)

# cols = prediction.columns.tolist()

# cols = cols[-1:] + cols[:-1]

# prediction = prediction[cols]

# prediction = prediction.sort_values('prediction', ascending = False)

# prediction.to_csv("prediction1.csv",index=False)

# Grid Search for Parameter Tuning

In [None]:
# gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
#                               "model__min_child_weight": [5, 10],
#                               "model__n_estimators": [25]},
#                   n_jobs=-1, cv=5, scoring="accuracy")

# gs.fit(X_train, y_train)

In [None]:
# print(gs.best_params_)
# print(gs.best_score_)
# xgb_model.set_params(**gs.best_params_)
# xgb_model.fit(X_train, y_train)

In [None]:
roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1])

In [None]:
eli5.show_weights(xgb_model, top = 200)

In [50]:
model_eli5 = xgb.XGBClassifier(max_depth = 6)

skf = StratifiedKFold(n_splits = 2, random_state = 2)

In [51]:
for train_index, valid_index in skf.split(final_feat, final_label):
    
    X_train, X_valid = final_feat.iloc[train_index], final_feat.iloc[valid_index]
    y_train, y_valid = final_label.iloc[train_index], final_label.iloc[valid_index]

In [52]:
model_eli5.fit(X_train, y_train, eval_metric='auc')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [53]:
features = list(final_feat.columns)

In [54]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[4]],
                     feature_names=features, show_feature_values=True, top = 50)

NameError: name 'xgb_model' is not defined

In [None]:
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import roc_curve
# import seaborn as sns
# import matplotlib.pyplot as plt

# def roc_auc(x,y,y_pred,model):
#    confusion_mat = confusion_matrix(y, y_pred)
#    print('Confusion_Matrix: ',confusion_mat,sep='\n')
#    print()
#    print('Accuracy of classifier on test set: {:.2f}'.format(model.score(x, y)))
#    print()
   
#    # Classification Report
#    print('Classification Report: ')
#    print(classification_report(y, y_pred))
#    y_score = model.predict_proba(X_test)[:,1]              # Predicted probability score
#    fpr, tpr, thresholds = roc_curve(y, y_score)
#    plt.figure()
   
#    # AuC Score & plotting of AuC curve
#    plt.plot(fpr, tpr, label='AuC score (area = %0.2f)' % roc_auc_score(y, y_score))
#    plt.plot([0, 1], [0, 1],'r--')
#    plt.xlim([0.0, 1.0])
#    plt.ylim([0.0, 1.05])
#    plt.xlabel('False Positive Rate')
#    plt.ylabel('True Positive Rate')
#    plt.title('Receiver operating characteristic')
#    plt.legend(loc="lower right")



In [None]:
roc_auc(xval1,yval1,y_pred,regressor)

In [None]:
features = list(final_feat.columns)

In [49]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[4]],
                     feature_names=features, show_feature_values=True, top = 20)

NameError: name 'xgb_model' is not defined

In [None]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[10]],
                     feature_names=features, show_feature_values=True, top = 20)

In [None]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[78]],
                     feature_names=features, show_feature_values=True, top = 20)

In [None]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[21]],
                     feature_names=features, show_feature_values=True, top = 50)

In [None]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[1570]],
                     feature_names=features, show_feature_values=True, top = 20)

In [None]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[5570]],
                     feature_names=features, show_feature_values=True, top = 20)

In [None]:
eli5.show_prediction(xgb_model, 
                     X_test.iloc[[5570]],
                     feature_names=features, show_feature_values=True, top = 20)

In [None]:
X_test

In [None]:
from lime.lime_tabular import LimeTabularExplainer

In [None]:
xgb_predict_proba = xgb_model.predict_proba(X_test)[:,1]

In [None]:
explanation = explainer.explain_instance(final_feat, xgb_predict_proba, num_features=5)

In [None]:
pip install shap

In [None]:
import shap
# Need to load JS vis in the notebook
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(xgb_model)

In [None]:
explainer

In [None]:
shap_values = explainer.shap_values(X_train, y_train)

In [None]:
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])

In [None]:
shap.force_plot(explainer.expected_value, shap_values[5,:], X_train.iloc[5,:])

In [None]:
shap.force_plot(explainer.expected_value, shap_values[1000,:], X_train.iloc[1000,:])

In [None]:
shap.force_plot(explainer.expected_value, shap_values[50,:], X_train.iloc[50,:])

# Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression

In [None]:
# model = LogisticRegression(random_state=0)

In [None]:
# skf = StratifiedKFold(n_splits = 5, random_state = 0)

# for train_index, valid_index in skf.split(final_feat, final_label):
#     print("TRAIN:", train_index, "VALID:", valid_index)
#     X_train, X_valid = final_feat.iloc[train_index], final_feat.iloc[valid_index]
#     y_train, y_valid = final_label.iloc[train_index], final_label.iloc[valid_index]
    
#     model.fit(X_train, y_train)
    
#     print( roc_auc_score(y_valid, model.predict_proba(X_valid)[:,1]) )
    
#     #accuracy = model.score(X_valid, y_valid)
    
#     #y_pred = model.predict(X_valid)
    
#     #print(accuracy)

In [None]:
# y_pred = model.predict(X_valid)

In [None]:
# confusion_matrix(y_valid, y_pred)
