In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.cross_validation import ShuffleSplit, KFold

import xgboost as xgb

In [2]:
# Input files: the raw transaction log and the training gender labels.
transactions_path = 'transactions.csv'
gender_labels_path = 'customers_gender_train.csv'

transactions = pd.read_csv(transactions_path)
customers_gender = pd.read_csv(gender_labels_path)

In [3]:
# Preview the first rows of the raw transactions table.
transactions.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-2245.92,
1,39026145,1 10:19:29,6011,7010,56147.89,
2,39026145,1 10:20:56,4829,2330,-56147.89,
3,39026145,1 10:39:54,5499,1010,-1392.47,
4,39026145,2 15:33:42,5499,1010,-920.83,


In [4]:
# Preview the first rows of the gender label table.
customers_gender.head()

Unnamed: 0,customer_id,gender
0,75562265,0
1,10928546,1
2,69348468,1
3,84816985,1
4,61009479,0


In [5]:
# Per-customer MCC-code counts: for every customer_id, count how many of its
# transactions carry each mcc_code, pivot the codes into columns (one row per
# customer) and fill code/customer pairs that never occur with 0.
# NOTE(review): this is the same pattern as the tr_type feature cell below;
# candidate for a shared helper taking the column name.
X1 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['mcc_code']].unstack().value_counts()) \
                    .unstack() \
                    .fillna(0)

In [6]:
# Prefix every MCC-code column so it cannot clash with the other feature
# blocks once they are concatenated.
X1.columns = ["mcc_code_" + str(code) for code in X1.columns]
X1.head()

Unnamed: 0_level_0,mcc_code_742,mcc_code_1711,mcc_code_1731,mcc_code_1799,mcc_code_2741,mcc_code_3000,mcc_code_3351,mcc_code_3501,mcc_code_4111,mcc_code_4112,...,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
31385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Per-customer transaction-type counts: for every customer_id, count how many
# of its transactions have each tr_type, pivot the types into columns (one row
# per customer) and fill type/customer pairs that never occur with 0.
X2 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['tr_type']].unstack().value_counts()) \
                    .unstack() \
                    .fillna(0)

In [8]:
# Prefix every transaction-type column so it cannot clash with the other
# feature blocks once they are concatenated.
X2.columns = ["tr_type_" + str(kind) for kind in X2.columns]

In [10]:
# Preview the per-customer transaction-type count features.
X2.head()

Unnamed: 0_level_0,tr_type_1000,tr_type_1010,tr_type_1030,tr_type_1100,tr_type_1110,tr_type_1200,tr_type_1210,tr_type_1310,tr_type_1410,tr_type_1510,...,tr_type_7040,tr_type_7041,tr_type_7044,tr_type_7070,tr_type_7071,tr_type_7074,tr_type_7075,tr_type_8100,tr_type_8145,tr_type_8146
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,0.0,42.0,90.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22899,0.0,6.0,47.0,0.0,27.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0
27914,0.0,0.0,58.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28753,0.0,89.0,25.0,3.0,65.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0
31385,0.0,75.0,125.0,0.0,75.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Mean transaction amount per customer: the one-column 'amount' frame of each
# customer_id group is flattened with unstack() and averaged, giving one value
# per customer.
X3 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['amount']].unstack().mean())

In [12]:
# Wrap the per-customer means in a one-column DataFrame named "Mean_amount"
# so it can be concatenated with the count features.
# NOTE(review): presumably relies on X3 being an unnamed Series at this point
# (a differently named Series would not line up with the requested column) -
# confirm if the previous cell is ever changed.
X3 = pd.DataFrame(X3, columns=["Mean_amount"])

In [13]:
# Full feature matrix: MCC-code counts, transaction-type counts and the mean
# amount, placed side by side and aligned on the customer_id index.
feature_blocks = [X1, X2, X3]
X = pd.concat(feature_blocks, axis=1)

In [14]:
# Index the label table by customer_id so labels can be looked up per customer.
# Guarded so that re-running this cell is a no-op: after the first run
# 'customer_id' is the index rather than a column, and an unconditional
# set_index('customer_id') would raise KeyError.
if 'customer_id' in customers_gender.columns:
    customers_gender = customers_gender.set_index('customer_id')

In [15]:
# Preview the label table after re-indexing it by customer_id.
customers_gender.head()

Unnamed: 0_level_0,gender
customer_id,Unnamed: 1_level_1
75562265,0
10928546,1
69348468,1
84816985,1
61009479,0


In [16]:
# Build the training labels in the same row order as the feature matrix X.
#
# reindex() is used instead of .loc[X.index]: X also contains customers that
# have no label, and label-based .loc with missing keys is deprecated/removed
# in newer pandas (KeyError), whereas reindex() yields NaN for them on every
# version.
# NOTE(review): assumes customer ids are unique in customers_gender - confirm.
Y_tr = customers_gender.reindex(X.index)[['gender']]

# Replace customer_id by the row position within X; the next cell uses these
# positions to pick the matching feature rows.
Y_tr = Y_tr.reset_index(drop=True)

# Keep labelled customers only (axis passed by keyword; the positional form
# is not accepted by recent pandas).
Y_tr = Y_tr.dropna(axis=0)

In [17]:
# Y_tr is indexed by row position within X, so the labelled feature rows can
# be taken positionally; the customer_id index of X is carried over unchanged.
X_tr = X.iloc[Y_tr.index]

In [18]:
# Generate four random 70/30 train/validation splits and keep the index
# arrays of the last one (itr = train positions, ite = validation positions).
splitter = ShuffleSplit(len(X_tr), n_iter=4, train_size=0.7, test_size=0.3, random_state=0)
itr, ite = list(splitter)[-1]

In [19]:
# Materialise the split: features and labels are sliced with the same
# positional indices so their rows stay aligned.
X_train, X_val = X_tr.iloc[itr], X_tr.iloc[ite]
Y_train, Y_val = Y_tr.iloc[itr], Y_tr.iloc[ite]

### Sklearn GradientBoostingClassifier

In [20]:
# Baseline: scikit-learn gradient boosting with default hyper-parameters and a
# fixed seed. The estimator wants a 1-D label array, taken here from the
# 'gender' column (the only column of Y_train).
clf = GradientBoostingClassifier(random_state=13)
clf.fit(X_train, Y_train.gender.values)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=13, subsample=1.0, verbose=0,
              warm_start=False)

In [21]:
# Keep only the second probability column (class 1 of the 0/1 gender target).
gbc_val_proba = clf.predict_proba(X_val)
predicted = gbc_val_proba[:, 1]

In [22]:
# Validation ROC AUC of the scikit-learn gradient boosting baseline.
auc_score = roc_auc_score(Y_val, predicted)
print("Auc score %s" % auc_score)

Auc score 0.860300049102


## XGBoost

In [23]:
%%time
# XGBoost through its scikit-learn wrapper: deeper trees than the baseline,
# with row and column subsampling.
model = xgb.XGBClassifier(
    max_depth=10,
    learning_rate=0.1,
    n_estimators=250,
    silent=True,
    objective='binary:logistic',
    nthread=24,
    subsample=0.6,
    colsample_bytree=0.6,
    colsample_bylevel=0.7,
)
model.fit(X_train, Y_train.gender)

CPU times: user 23.5 s, sys: 318 ms, total: 23.8 s
Wall time: 24.4 s


In [24]:
# Keep only the second probability column (class 1 of the 0/1 gender target).
xgb_val_proba = model.predict_proba(X_val)
predicted = xgb_val_proba[:, 1]

In [25]:
# Validation ROC AUC of the XGBClassifier model.
auc_score = roc_auc_score(Y_val, predicted)
print("Auc score %s" % auc_score)

Auc score 0.862324885762


In [26]:
# Booster parameters for the native xgb.train API.
param = {
    'max_depth': 5,
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.05,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.8,
}

# Number of boosting rounds.
numround = 600

In [27]:
# Wrap both splits in DMatrix containers for the native training API.
Xdatatrain = xgb.DMatrix(X_train, label=Y_train)
Xdatatest = xgb.DMatrix(X_val, label=Y_val)

# The parameters are handed to xgb.train as a list of (name, value) pairs.
plst = list(param.items())

# Both sets are scored during training under the names 'train' and 'eval'.
watchlist = [
    (Xdatatrain, 'train'),
    (Xdatatest, 'eval'),
]

bst = xgb.train(
    plst,
    Xdatatrain,
    numround,
    evals=watchlist,
    verbose_eval=10,  # log the watchlist metric every 10 rounds
)
y_pred = bst.predict(Xdatatest)

[0]	train-auc:0.776978	eval-auc:0.752292
[10]	train-auc:0.843755	eval-auc:0.812922
[20]	train-auc:0.85895	eval-auc:0.823606
[30]	train-auc:0.871632	eval-auc:0.831486
[40]	train-auc:0.8822	eval-auc:0.838524
[50]	train-auc:0.890477	eval-auc:0.844791
[60]	train-auc:0.898612	eval-auc:0.850778
[70]	train-auc:0.904538	eval-auc:0.854115
[80]	train-auc:0.909746	eval-auc:0.857374
[90]	train-auc:0.914461	eval-auc:0.860119
[100]	train-auc:0.918391	eval-auc:0.862408
[110]	train-auc:0.922153	eval-auc:0.863388
[120]	train-auc:0.925279	eval-auc:0.865129
[130]	train-auc:0.927993	eval-auc:0.866475
[140]	train-auc:0.930011	eval-auc:0.867718
[150]	train-auc:0.932544	eval-auc:0.869073
[160]	train-auc:0.934725	eval-auc:0.869629
[170]	train-auc:0.936677	eval-auc:0.870107
[180]	train-auc:0.938954	eval-auc:0.870889
[190]	train-auc:0.940765	eval-auc:0.871286
[200]	train-auc:0.942377	eval-auc:0.8719
[210]	train-auc:0.943865	eval-auc:0.87257
[220]	train-auc:0.945551	eval-auc:0.873226
[230]	train-auc:0.947186	eva

In [28]:
# Validation ROC AUC of the booster trained with the native API.
print("Auc score %s" % roc_auc_score(Y_val, y_pred))

Auc score 0.876203910384


In [29]:
# Refit the booster on every labelled customer (train + validation rows).
Xdatatrain = xgb.DMatrix(X_tr, label=Y_tr)
bst = xgb.train(plst, Xdatatrain, numround, verbose_eval=10)

# The prediction set is every customer in X that does not appear in the label
# table. A boolean mask is used instead of X.drop(customers_gender.index):
# drop() raises when a labelled customer has no row in X (i.e. no
# transactions), while the mask simply ignores such ids and selects the same
# rows otherwise.
X_test = X.loc[~X.index.isin(customers_gender.index)]
Xtest = xgb.DMatrix(X_test)

# Predicted probability of gender == 1 for each unlabelled customer.
res = bst.predict(Xtest)

In [30]:
# Submission table: one row per unlabelled customer with its predicted
# probability, in the same order as X_test.
result = pd.DataFrame(
    {'customer_id': X_test.index, 'gender': res},
    columns=['customer_id', 'gender'],
)

In [31]:
# Preview the submission table (customer_id, predicted probability).
result.head()

Unnamed: 0,customer_id,gender
0,49101,0.35971
1,114348,0.622834
2,127946,0.813793
3,137367,0.717857
4,174467,0.646192


In [32]:
# Write the submission file; the positional row index is not part of it.
submission_path = 'baseline2_xgb.csv'
result.to_csv(submission_path, index=False)

In [None]:
# Size of the full feature matrix as (rows, columns).
X.shape

In [47]:
from sklearn.svm import SVC

In [48]:
# Integer 0/1 labels for the SVM: 1 where gender > 0, else 0.
# Vectorised comparison instead of a per-element apply(lambda ...).
Y_train_svm = (Y_train.gender > 0).astype(int)

In [None]:
%%time
# Linear-kernel SVM; probability=True is what makes predict_proba available
# on the fitted model.
svm = SVC(
    kernel="linear",
    probability=True,
)
svm.fit(X_train, list(Y_train_svm))

In [50]:
# Class-1 probability for each validation row, sliced the same way as in the
# gradient boosting cells. An array slice replaces map(lambda x: x[1], pred),
# which yields a lazy iterator on Python 3 that roc_auc_score cannot consume.
# NOTE(review): requires `svm` to be the SVC(probability=True) fitted in the
# previous cell; the saved traceback shows the name was bound to a LinearSVC
# (no predict_proba) when this cell was last executed.
pred = svm.predict_proba(X_val)
y_pred = pred[:, 1]

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [45]:
# Validation ROC AUC for whatever y_pred currently holds.
# NOTE(review): this cell's execution count (45) is lower than that of the
# SVM cells above, so the saved score may come from an earlier y_pred -
# re-run after the SVM prediction cell to confirm.
print("Auc score %s" % roc_auc_score(Y_val, y_pred))

Auc score 0.509377160744
