# first submission based on purchase history (events.csv) only

# PART-1: Data importing

In [21]:
import pandas as pd
import numpy as np
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
df_events = pd.read_csv('events.csv',  encoding = "UTF-8")

In [3]:
df_events[df_events['event']=='8'].count()

app_id             265034
session_id         265034
event              265034
event_timestamp    265034
event_value        265034
user_id_hash       265034
dtype: int64

# PART-2: feature engineering: compute weekly-x

In [4]:
df_events['if_purchase'] = df_events['event'].map(lambda x: 1 if x == "8" else 0)
df_events['date'] = pd.to_datetime(df_events['event_timestamp'],unit='ms')

In [5]:
df_events['day']=df_events['date'].apply(lambda dt: datetime.datetime(dt.year, 
                                                                      dt.month, dt.day))

In [6]:
df_event_tiny=df_events[["user_id_hash","day",
                         "if_purchase",
                         "event_value"]].copy()

In [7]:
all_entries = df_event_tiny[df_event_tiny['if_purchase']==1]
all_entries.day = all_entries.day.apply(lambda x: datetime.datetime.strftime(x, '%m%d'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [8]:
all_entries.head(3)

Unnamed: 0,user_id_hash,day,if_purchase,event_value
279,9943447915df3a45fd6720a026af905b6da6b56a37701b...,1111,1,3.493
2898,deaf54103e439789b069a95c4650dc6cdc24d6c9d700f1...,1127,1,3.493
2970,deaf54103e439789b069a95c4650dc6cdc24d6c9d700f1...,1124,1,3.493


In [9]:
all_entries_sum = all_entries.groupby(['user_id_hash','day'], as_index=False)[['if_purchase','event_value']].sum()
all_entries_sum.head(2)

Unnamed: 0,user_id_hash,day,if_purchase,event_value
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,1111,7,19.551
1,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,1112,1,3.493


In [10]:
df_x = all_entries_sum.pivot(index='user_id_hash', 
                             columns='day', 
                             values='if_purchase').fillna(0)
df_x.head(5)

day,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214
user_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000db35682058bb7916cb90f85709f54c1a0f7a3b6de247d94b1bc20c36b97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000059859ec188af6035870faf885c3038cedda05b3a5480a8223649629d951e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000d99c8e82878915b33ffe27ac3585ce9fb7cd4b82ace9f684c5a7900cc3536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
00124c21b3ec87a2f17f884c5eee25462b67d489ebad09497a7a158ed9d7c4c1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0012e392350f0f6408b8b1a03bc5ee292e29dc735ee24dd942f22432e22e8020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### convert pd.df to np.array to improve data manipulation efficiency

In [11]:
np_x = np.array(df_x)
np_x = np_x[:,5:75]

In [12]:
i=0
np_x_week = np.ones(34200,)
for i in range(10):
    np_x_week = np.vstack((np_x_week, np.array(np_x[:,(i*7):(i+1)*7].sum(axis=1))))
np_x_week = np_x_week.T
np_x_week = np.delete(np_x_week, 0, axis=1)

In [13]:
df_x_new=pd.DataFrame(data=np_x_week, index=df_x.index)


In [14]:
df_x_full_id=df_event_tiny[["user_id_hash"]].copy()
df_x_full_id = df_x_full_id.drop_duplicates(subset=["user_id_hash"])
df_x_full_id = pd.merge(df_x_full_id, df_x_new, how='left', on='user_id_hash').fillna(0)
df_x_full_id.head(5)

Unnamed: 0,user_id_hash,0,1,2,3,4,5,6,7,8,9
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def running_clfs(x_train,y_train,x_val,y_val):
    headers=["Model","AUC_training", "AUC_validation"]
    records=[]
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(random_state=0).fit(x_train, y_train)
    from sklearn import metrics
    y_train_hat = clf.predict_proba(x_train) [:,1]
    y_val_hat = clf.predict_proba(x_val) [:,1]
    records.append(("LogisticRegression",metrics.roc_auc_score(y_train, y_train_hat),
                                                              metrics.roc_auc_score(y_val, y_val_hat)))
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    clf.fit(x_train, y_train)
    y_train_hat = clf.predict_proba(x_train) [:,1]
    y_val_hat = clf.predict_proba(x_val) [:,1]
    records.append(("DecisionTree",metrics.roc_auc_score(y_train, y_train_hat),
                                                              metrics.roc_auc_score(y_val, y_val_hat)))
 
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(max_depth=10, n_estimators=3, max_features=1)
    clf.fit(x_train, y_train)
    y_train_hat = clf.predict_proba(x_train) [:,1]
    y_val_hat = clf.predict_proba(x_val) [:,1]
    metrics.roc_auc_score(y_train, y_train_hat), metrics.roc_auc_score(y_val, y_val_hat)
    records.append(("RandomForest",metrics.roc_auc_score(y_train, y_train_hat),
                                                              metrics.roc_auc_score(y_val, y_val_hat)))    
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier()
    clf.fit(x_train, y_train)
    y_train_hat = clf.predict_proba(x_train) [:,1]
    y_val_hat = clf.predict_proba(x_val) [:,1] 
    metrics.roc_auc_score(y_train, y_train_hat), metrics.roc_auc_score(y_val, y_val_hat)
    records.append(("AdaBoost",metrics.roc_auc_score(y_train, y_train_hat),
                                                              metrics.roc_auc_score(y_val, y_val_hat)))
    
    df_metrics = pd.DataFrame.from_records(records, columns=headers).set_index('Model')
    return df_metrics

In [16]:
np_x_full_id = np.array(df_x_full_id)
np_x_full_id = np.delete(np_x_full_id, 0, axis=1)

# PART-3: compute_target
# do training and validation: model one, for the following 7 days 

In [17]:
x_train = np_x_full_id [:,0:8]
y_train = np_x_full_id [:,8].astype('int')
y_train[np.nonzero(y_train)]=1
x_val = np_x_full_id [:,1:9]
y_val = np_x_full_id [:,9].astype('int')
y_val[np.nonzero(y_val)]=1
x_train.shape

(621001, 8)

In [18]:
running_clfs(x_train,y_train,x_val,y_val)



Unnamed: 0_level_0,AUC_training,AUC_validation
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
LogisticRegression,0.868436,0.91901
DecisionTree,0.875901,0.644903
RandomForest,0.869985,0.910144
AdaBoost,0.869409,0.918937


## making predictions for the following 7 days

In [19]:
x_test = np_x_full_id [:,2:10]

x_train = np.append(x_train,x_val,axis=0 )
y_train = np.append(y_train,y_val,axis=0 )

In [22]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
y_test_hat_7_log = clf.predict_proba(x_test) [:,1]

clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
y_test_hat_7_tree = clf.predict_proba(x_test) [:,1]

clf = RandomForestClassifier(max_depth=10, n_estimators=3, max_features=1).fit(x_train, y_train)
y_test_hat_7_rf = clf.predict_proba(x_test) [:,1]

clf = AdaBoostClassifier().fit(x_train, y_train)
y_test_hat_7_ada = clf.predict_proba(x_test) [:,1]



## do training and validation: model one, for the following 14 days

In [23]:
x_train = np_x_full_id [:,0:7]
y_train = np.sum(np_x_full_id [:,7:9],axis=1).astype('int')
y_train[np.nonzero(y_train)]=1
x_val = np_x_full_id [:,1:8]
y_val = np.sum(np_x_full_id [:,8:10],axis=1).astype('int')
y_val[np.nonzero(y_val)]=1
x_train.shape

(621001, 7)

In [24]:
running_clfs(x_train,y_train,x_val,y_val)



Unnamed: 0_level_0,AUC_training,AUC_validation
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
LogisticRegression,0.71905,0.856802
DecisionTree,0.726292,0.704576
RandomForest,0.720526,0.851242
AdaBoost,0.719834,0.856563


# PART-4: models
## making predictions for the following 14 days

In [25]:
x_test = np_x_full_id [:,3:10]

x_train = np.append(x_train,x_val,axis=0 )
y_train = np.append(y_train,y_val,axis=0 )

In [26]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
y_test_hat_14_log = clf.predict_proba(x_test) [:,1]

clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
y_test_hat_14_tree = clf.predict_proba(x_test) [:,1]

clf = RandomForestClassifier(max_depth=10, n_estimators=3, max_features=1).fit(x_train, y_train)
y_test_hat_14_rf = clf.predict_proba(x_test) [:,1]

clf = AdaBoostClassifier().fit(x_train, y_train)
y_test_hat_14_ada = clf.predict_proba(x_test) [:,1]




# PART-5: save to .csv files (following Kaggle's format)

In [27]:
df_x_full_id['log_7'] =  y_test_hat_7_log
df_x_full_id['log_14'] =  y_test_hat_14_log 
df_x_full_id['tree_7'] =  y_test_hat_7_tree
df_x_full_id['tree_14'] =  y_test_hat_14_tree  
df_x_full_id['rf_7'] =  y_test_hat_7_rf
df_x_full_id['rf_14'] =  y_test_hat_14_rf
df_x_full_id['ada_7'] =  y_test_hat_7_ada
df_x_full_id['ada_14'] =  y_test_hat_14_ada 

In [37]:
df_x_full_id.head(3)

Unnamed: 0,user_id_hash,0,1,2,3,4,5,6,7,8,9,log_7,log_14,tree_7,tree_14,rf_7,rf_14,ada_7,ada_14
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.004513,0.009895,0.008114,0.014599,0.009742,0.016438,0.471974,0.476909
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004261,0.009284,0.001362,0.005539,0.001374,0.005557,0.468139,0.474432
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004261,0.009284,0.001362,0.005539,0.001374,0.005557,0.468139,0.474432


In [29]:
df_sample = pd.read_csv('sample_submission_2.csv', encoding = "UTF-8")

In [30]:
df_sub_full_id = pd.DataFrame.merge( df_sample,
                                 df_sub_full_id, how='left', on='user_id_hash').fillna(0)
df_sub_full_id.head(3)

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,0,1,2,3,4,5,6,...,8,9,log_7,log_14,tree_7,tree_14,rf_7,rf_14,ada_7,ada_14
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004261,0.009284,0.001362,0.005539,0.001374,0.005557,0.468139,0.474432
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004261,0.009284,0.001362,0.005539,0.001374,0.005557,0.468139,0.474432
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004261,0.009284,0.001362,0.005539,0.001374,0.005557,0.468139,0.474432


In [32]:
df = df_sub_full_id[[ 'user_id_hash','log_7','log_14']]
df.rename(columns={'log_7': 'user_purchase_binary_7_days', 
                   'log_14': 'user_purchase_binary_14_days'},inplace=True)
df.to_csv('sub_log.csv',  encoding='utf-8',index=False)

In [36]:
df = df_sub_full_id[[ 'user_id_hash','tree_7','tree_14']]
df.rename(columns={'tree_7': 'user_purchase_binary_7_days', 
                   'tree_14': 'user_purchase_binary_14_days'},inplace=True)
df.to_csv('sub_tree.csv', encoding='utf-8',index=False)

In [34]:
df = df_sub_full_id[[ 'user_id_hash','rf_7','rf_14']]
df.rename(columns={'rf_7': 'user_purchase_binary_7_days', 
                   'rf_14': 'user_purchase_binary_14_days'},inplace=True)
df.to_csv('sub_rf.csv', encoding='utf-8',index=False)

In [35]:
df = df_sub_full_id[[ 'user_id_hash','ada_7','ada_14']]
df.rename(columns={'ada_7': 'user_purchase_binary_7_days', 
                   'ada_14': 'user_purchase_binary_14_days'},inplace=True)
df.to_csv('sub_ada.csv', encoding='utf-8',index=False)