In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.cross_validation import ShuffleSplit, KFold

import xgboost as xgb

In [2]:
transactions = pd.read_csv('transactions.csv')
customers_gender = pd.read_csv('customers_gender_train.csv')

In [21]:
transactions.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-200.0,
1,39026145,1 10:19:29,6011,7010,5000.0,
2,39026145,1 10:20:56,4829,2330,-5000.0,
3,39026145,1 10:39:54,5499,1010,-124.0,
4,39026145,2 15:33:42,5499,1010,-82.0,


In [15]:
gr_amount = transactions.groupby("amount").count().sort_values("customer_id", ascending=False)

In [16]:
gr_amount.head()

Unnamed: 0_level_0,customer_id,tr_datetime,mcc_code,tr_type,term_id
amount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-2245.92,395265,395265,395265,395265,226809
-1122.96,201159,201159,201159,201159,114635
-11229.58,196764,196764,196764,196764,114181
-22459.16,194881,194881,194881,194881,112800
-4491.83,174184,174184,174184,174184,100088


1 ruble = 11.2296

In [18]:
transactions.amount = transactions.amount.apply(lambda x: round(x/11.2296))

In [75]:
customers_gender.head()

Unnamed: 0,customer_id,gender
0,75562265,0
1,10928546,1
2,69348468,1
3,84816985,1
4,61009479,0


In [23]:
X1 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['mcc_code']].unstack().value_counts()) \
                    .unstack() \
                    .fillna(0)

In [24]:
X1.columns = map(lambda x: "mcc_code_"+str(x), X1.columns)
X1.head()

Unnamed: 0_level_0,mcc_code_742,mcc_code_1711,mcc_code_1731,mcc_code_1799,mcc_code_2741,mcc_code_3000,mcc_code_3351,mcc_code_3501,mcc_code_4111,mcc_code_4112,...,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
31385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
X2 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['tr_type']].unstack().value_counts()) \
                    .unstack() \
                    .fillna(0)

In [26]:
X2.columns = map(lambda x: "tr_type_"+str(x), X2.columns)

In [27]:
X2.head()

Unnamed: 0_level_0,tr_type_1000,tr_type_1010,tr_type_1030,tr_type_1100,tr_type_1110,tr_type_1200,tr_type_1210,tr_type_1310,tr_type_1410,tr_type_1510,...,tr_type_7040,tr_type_7041,tr_type_7044,tr_type_7070,tr_type_7071,tr_type_7074,tr_type_7075,tr_type_8100,tr_type_8145,tr_type_8146
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,0.0,42.0,90.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22899,0.0,6.0,47.0,0.0,27.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0
27914,0.0,0.0,58.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28753,0.0,89.0,25.0,3.0,65.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0
31385,0.0,75.0,125.0,0.0,75.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
X3 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['amount']].unstack().mean())
    
X3 = pd.DataFrame(X3, columns=["Mean_amount"])

In [56]:
X4 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['amount']].unstack().min())
    
X4 = pd.DataFrame(X4, columns=["Min_amount"])

In [59]:
X5 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['amount']].unstack().max())
    
X5 = pd.DataFrame(X5, columns=["Max_amount"])

In [60]:
X6 = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['amount']].unstack().std())
    
X6 = pd.DataFrame(X6, columns=["Std_amount"])

In [61]:
X = pd.concat([X1,X2,X3,X4,X5,X6], axis=1)

In [72]:
X = X.fillna(0)

In [76]:
customers_gender = customers_gender.set_index('customer_id')

In [77]:
customers_gender.head()

Unnamed: 0_level_0,gender
customer_id,Unnamed: 1_level_1
75562265,0
10928546,1
69348468,1
84816985,1
61009479,0


In [78]:
Y_tr = customers_gender.loc[X.index].gender
Y_tr = Y_tr.reset_index()
del Y_tr['customer_id']
Y_tr = Y_tr.dropna(0)

In [79]:
X_tr = X.reset_index()
X_tr = X_tr.loc[Y_tr.index].set_index('customer_id')

In [80]:
for itr, ite in ShuffleSplit(X_tr.shape[0], n_iter=4, train_size=0.7, test_size=0.3, random_state=0):
    pass

In [81]:
X_train = X_tr.iloc[itr]
X_val = X_tr.iloc[ite]

Y_train = Y_tr.iloc[itr]
Y_val = Y_tr.iloc[ite]

### Sklearn GradientBoostingClassifier

In [82]:
clf = GradientBoostingClassifier(random_state=13)
clf.fit(X_train, Y_train.values[:, 0])

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=13, subsample=1.0, verbose=0,
              warm_start=False)

In [83]:
predicted = clf.predict_proba(X_val)[:, 1]

In [84]:
auc_score = roc_auc_score(Y_val, predicted)
print "Auc score %s" % str(auc_score) 

Auc score 0.863536781506


## XGboost

In [85]:
%%time
model = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=250, silent=True, \
                          objective='binary:logistic', nthread=24, \
                          subsample=0.6, colsample_bytree=0.6, colsample_bylevel=0.7)
model.fit(X_train, Y_train.gender)

CPU times: user 23.8 s, sys: 435 ms, total: 24.3 s
Wall time: 24.9 s


In [86]:
predicted = model.predict_proba(X_val)[:, 1]

In [87]:
auc_score = roc_auc_score(Y_val, predicted)
print "Auc score %s" % str(auc_score) 

Auc score 0.866569026074


In [88]:
param = {}
param['max_depth'] = 5
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'auc'
param['eta'] = 0.05
param['subsample'] = 0.7
param['colsample_bytree'] = 0.8
param['colsample_bylevel'] = 0.8

numround = 600

In [89]:
Xdatatrain = xgb.DMatrix(X_train, label = Y_train)
Xdatatest = xgb.DMatrix(X_val, label = Y_val)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgb.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
y_pred_xgb = bst.predict(Xdatatest)

[0]	train-auc:0.779976	eval-auc:0.752391
[10]	train-auc:0.847361	eval-auc:0.813838
[20]	train-auc:0.860297	eval-auc:0.822576
[30]	train-auc:0.872389	eval-auc:0.832364
[40]	train-auc:0.883247	eval-auc:0.839739
[50]	train-auc:0.890146	eval-auc:0.844347
[60]	train-auc:0.898731	eval-auc:0.848924
[70]	train-auc:0.905589	eval-auc:0.853967
[80]	train-auc:0.911341	eval-auc:0.857442
[90]	train-auc:0.916414	eval-auc:0.860341
[100]	train-auc:0.920956	eval-auc:0.863877
[110]	train-auc:0.924446	eval-auc:0.86595
[120]	train-auc:0.927173	eval-auc:0.867362
[130]	train-auc:0.930266	eval-auc:0.868896
[140]	train-auc:0.933039	eval-auc:0.869721
[150]	train-auc:0.935625	eval-auc:0.870575
[160]	train-auc:0.93771	eval-auc:0.871188
[170]	train-auc:0.939455	eval-auc:0.871646
[180]	train-auc:0.941017	eval-auc:0.872259
[190]	train-auc:0.943381	eval-auc:0.872724
[200]	train-auc:0.944885	eval-auc:0.872714
[210]	train-auc:0.947182	eval-auc:0.873231
[220]	train-auc:0.948562	eval-auc:0.873601
[230]	train-auc:0.949838

In [90]:
print "Auc score %s" % str(roc_auc_score(Y_val, y_pred_xgb)) 

Auc score 0.877219929954


### Fitting xgb on all train set

In [91]:
Xdatatrain = xgb.DMatrix(X_tr, label = Y_tr)
bst = xgb.train(plst, Xdatatrain, numround, verbose_eval = 10)

X_test = X.drop(customers_gender.index)
Xtest = xgb.DMatrix(X_test)

res_xgb = bst.predict(Xtest)

In [92]:
result = pd.DataFrame(X_test.index, columns=['customer_id'])
result['gender'] = res_xgb

In [93]:
result.head()

Unnamed: 0,customer_id,gender
0,49101,0.373475
1,114348,0.711062
2,127946,0.807596
3,137367,0.682659
4,174467,0.7265


In [31]:
result.to_csv('baseline2_xgb.csv', index=False)

In [None]:
X.shape

## Try SVM

In [None]:
"""from sklearn.svm import SVC
Y_train_svm = Y_train.gender.apply(lambda x : int (x>0))

svm = SVC(kernel="linear", probability=True)
svm.fit(X_train, list(Y_train_svm)) 
pred = svm.predict_proba(X_val)
y_pred = map(lambda x: x[1], pred)
print "Auc score %s" % str(roc_auc_score(Y_val, y_pred)) """

## Try logreg

In [94]:
from sklearn.linear_model import LogisticRegression

In [119]:
logreg = LogisticRegression(max_iter=300, n_jobs=-1, C=1.0)
logreg.fit(X_train, Y_train.gender)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=300, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [120]:
pred_logreg = logreg.predict_proba(X_val)
y_pred_logreg = map(lambda x: x[1], pred_logreg)
print "Auc score %s" % str(roc_auc_score(Y_val, y_pred_logreg)) 

Auc score 0.82196061558


### Fit logreg on all train set

In [121]:
logreg.fit(X_tr, Y_tr.gender)
pred_logreg = logreg.predict_proba(X_test)
res_logreg = map(lambda x: x[1], pred_logreg)

In [153]:
result = pd.DataFrame(X_test.index, columns=['customer_id'])
result['gender'] = res_logreg
result.head()

Unnamed: 0,customer_id,gender
0,49101,0.4645
1,114348,0.868011
2,127946,0.525154
3,137367,0.407955
4,174467,0.50247


In [154]:
result.to_csv('baseline_logreg.csv', index=False)

## Try composition on validation

In [129]:
y_pred_logreg = np.array(y_pred_logreg)
y_pred_rf = np.array(y_pred_rf)
y_pred_knn = np.array(y_pred_knn)

In [181]:
koef_xgb = 0.73
koef_rf = 0.2
koef_knn = 0.00

In [182]:
y_pred_comp = koef_xgb*y_pred_xgb + koef_rf*y_pred_rf + koef_knn*y_pred_knn + (1-koef_xgb-koef_rf-koef_knn)*y_pred_logreg

In [183]:
print "Auc score %s" % str(roc_auc_score(Y_val, y_pred_comp)) 

Auc score 0.877883810827


### Make composition on all train set

In [184]:
res_logreg = np.array(res_logreg)
res_rf = np.array(res_rf)

koef_xgb = 0.73
koef_rf = 0.2

res_xgb_logreg_rf = koef_xgb*res_xgb + koef_rf*res_rf + (1-koef_xgb-koef_rf)*res_logreg

In [185]:
result = pd.DataFrame(X_test.index, columns=['customer_id'])
result['gender'] = res_xgb_logreg_rf

In [186]:
result.head()

Unnamed: 0,customer_id,gender
0,49101,0.412155
1,114348,0.703042
2,127946,0.75622
3,137367,0.647503
4,174467,0.685582


In [187]:
result.to_csv('compose_xgb0.73_rf0.2_logreg.csv', index=False)

## KNN

In [122]:
from sklearn.neighbors import KNeighborsClassifier

In [123]:
knn = KNeighborsClassifier(n_neighbors=50, weights='distance', p=1)

knn.fit(X_train, Y_train.gender)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=50, p=1,
           weights='distance')

In [124]:
pred_knn = knn.predict_proba(X_val)
y_pred_knn = map(lambda x: x[1], pred_knn)
print "Auc score %s" % str(roc_auc_score(Y_val, y_pred_knn)) 

Auc score 0.594748013368


## RF

In [125]:
from sklearn.ensemble import RandomForestClassifier

In [126]:
rf = RandomForestClassifier(n_estimators=500, min_samples_split=40, min_samples_leaf=10, n_jobs=-1, oob_score=True)
rf.fit(X_train, Y_train.gender)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=40,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [127]:
pred_rf = rf.predict_proba(X_val)
y_pred_rf = map(lambda x: x[1], pred_rf)
print "Auc score %s" % str(roc_auc_score(Y_val, y_pred_rf)) 

Auc score 0.85575888473


In [128]:
rf.fit(X_tr, Y_tr.gender)
pred_rf = rf.predict_proba(X_test)
res_rf = map(lambda x: x[1], pred_rf)

## Ideas

1. Данные по сумме транзакций монотонно преобразованы из исходных! можно догадаться (100, 1000, 10000)?
2. Даты откачены по времени
3. Снятие налички (паттерн 7-дневный) --> восстановить дни (1 января, к-рое пришлось на чт!!) - 153 день действительно 1 января - с форума
4. Внешние данные за этот год?  2015?

"Проверьте, пожалуйста анонимизацию данных. Вызывает сомнения владелец карты 70780820 - он как бы делает одну транзакцию в 6 минут... Или вот эти двое 45479973 и 90796706 - они раз в 20 минут покупают-продают. Их можно считать выбросами, но они суммарно отъедают от файла данных больше 200 000 транзакций. Либо это мошенники, либо это бесполые роботы.

Такие клиенты - источник идей, например, для участников Startup Challenge."

5. ARIMA !