In [105]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import xgboost
import time
import lasagne
import theano
import theano.tensor as T
from lasagne.layers import *

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import minmax_scale
from sklearn.cross_validation import train_test_split
from sklearn.mixture import GMM
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from scipy.sparse import csc_matrix

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 80) 
pd.set_option('display.max_rows', 100) 
%matplotlib inline

In [106]:
def normalize_reg(x):
    if type(x) == type(1.0):
        return u''
    if x == 98:
        return 
    x = x.upper()
    x = x.replace(u'РЕСПУБЛИКА', u'')
    x = x.replace(u'РЕСП', u'')
    x = x.replace(u'ОБЛАСТЬ', u'')
    x = x.replace(u'ОБЛ', u'')
    x = x.replace(u'КРАЙ', u'')
    x = x.replace(u'Г ', u'')
    x = x.replace(u' Г', u'')
    x = x.replace(u'Г.', u'')
    x = x.replace(u'АО ', u'')
    x = x.replace(u' АО', u'')
    x = x.replace(u'АO', '')
    x = x.replace(u'ЧУВАШСКАЯ', u'')
    x = x.replace(u'(', u'')
    x = x.replace(u')', u'')
    x = x.replace(u'/', u'')
    x = x.replace(u' ', u'')
    x = x.replace(u'.', u'')
    x = x.replace(u'-', u'')

    x = x.replace(u'ЧУВАШСКАЯ', u'')
    x = x.replace(u'ОРЁЛ', u'')
    x = x.replace(u'СЕВЕРНАЯ', u'СЕВ')
    if u'ЕВРЕЙ' in x:
        x = u'ЕВРЕЙСКАЯАО'
    if u'КАМЧ' in x:
        x = u'КАМЧАТКА'
    if u'ХАНТЫ' in x:
        x = u'ХМАО'
    if u'САХА' in x:
        x = u'ЯКУТИЯ'
    if u'АЛТАЙ' in x:
        x = u'АЛТАЙ'
    if u'МОСКОВСКИЙ' in x:
        x = u'МОСКОВСКАЯ'
    if u'МОСКВОС' in x:
        x = u'МОСКОВСКАЯ'
    if u'МОСКОВСКАЯ' in x:
        x = u'МОСКОВСКАЯ'
    if u'РОССИЯ' in x:
        x = u'МОСКВА'
    if u'ЧЕЛЯБ' in x:
        x = u'ЧЕЛЯБИНСК'
    return x

In [107]:
def preprocessing(train, test):
    
    Xtest = test.copy()
    X = train.drop(['open_account_flg'], axis=1).copy()
    y = train['open_account_flg']
    
    for col in ['credit_sum', 'score_shk']:
        X[col] = X[col].apply(lambda x: float(x.replace(',', '.')))
        Xtest[col] = Xtest[col].apply(lambda x: float(x.replace(',', '.')))
    
    Xtest['conc'] = test['gender'] + test['marital_status'] + \
                    test['job_position'] + test['education']
    X['conc'] = train['gender'] + train['marital_status'] + \
                train['job_position'] + train['education']
        
    encoder = LabelEncoder()
    for col in ['gender', 'marital_status', 'education', 'job_position']:
        X[col] = encoder.fit_transform(X[col])
        Xtest[col] = encoder.transform(Xtest[col])
    
    for col in ['living_region', 'conc', 'living_region2']:
        r1 = X[col].fillna('na')
        r2 = Xtest[col].fillna('na')
        encoder.fit(pd.concat([r1, r2]))
        X[col] = encoder.transform(r1)
        Xtest[col] = encoder.transform(r2)
    
    X.loc[X['monthly_income'].isnull(), 'monthly_income'] = X['monthly_income'].median()
    
    X = X.fillna(-1)
    Xtest = Xtest.fillna(-1)
    
    for col in ['credit_sum', 'credit_month', 'living_region', 'living_region2']:
        feature = pd.concat([X[col], Xtest[col]])
        feature_map = feature.groupby(feature).apply(len)
        X['frequency_' + col] = X[col].map(feature_map)
        Xtest['frequency_' + col] = Xtest[col].map(feature_map)
    
    X['credit_sum_by_month'] = X.credit_sum / X.credit_month
    Xtest['credit_sum_by_month'] = Xtest.credit_sum / Xtest.credit_month
        
    return X, Xtest, y

In [147]:
def create_quantile(X, col, n):
    X[col + '_quantile'] = pd.qcut(X[col], n)
    return pd.get_dummies(X, columns=[col + '_quantile'])
def create_linear(X):
    columns = ['marital_status', 'job_position', 'tariff_id', 
               'education', 'living_region', 'living_region2', 
               'age', 'credit_month', 'overdue_credit_count', 'credit_count']
    X_linear = pd.get_dummies(X.drop('conc', axis=1), columns=columns)
        
    X_linear = create_quantile(X_linear, 'credit_sum', 20)
    X_linear = create_quantile(X_linear, 'score_shk', 30)
    X_linear = create_quantile(X_linear, 'credit_sum_by_month', 50)
    X_linear = create_quantile(X_linear, 'frequency_credit_sum', 6)
    
    X_linear.credit_sum_by_month = np.log(X_linear.credit_sum_by_month)
    X_linear.frequency_credit_sum = np.log(X_linear.frequency_credit_sum)
    
    X_linear = X_linear.drop(['frequency_credit_month', 'frequency_living_region', 'frequency_living_region2'], axis=1)
    return minmax_scale(X_linear)

In [6]:
data_train = pd.read_csv("../data/credit_train.csv", encoding='cp1251', sep=';', index_col='client_id')
data_test = pd.read_csv("../data/credit_test.csv", encoding='cp1251', sep=';', index_col='client_id')

data_test['living_region2'] = data_test['living_region'].apply(normalize_reg)
data_train['living_region2'] = data_train['living_region'].apply(normalize_reg)

In [152]:
X, Xtest, y = preprocessing(data_train, data_test)
X_tmp = create_linear(pd.concat([X, Xtest]))
X_linear = X_tmp[:len(X)]
Xtest_linear = X_tmp[len(X):]

In [115]:
def iterate_minibatches(*arrays,**kwargs):
    
    batchsize=kwargs.get("batchsize",100)
    shuffle = kwargs.get("shuffle",True)
    
    if shuffle:
        indices = np.arange(len(arrays[0]))
        np.random.shuffle(indices)
    for start_idx in range(0, len(arrays[0]) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [arr[excerpt] for arr in arrays]

def proba_from_nn(Xtrain, Xtest, ytrain):
    
    ytrain = np.array(ytrain).astype('int32')
    input_X = T.matrix()
    target_y = T.vector(dtype='int32')
    input_shape = (None, Xtrain.shape[1])
    
    nn = InputLayer(shape =input_shape, input_var=input_X)
    nn = DenseLayer(nn, num_units=70, nonlinearity=lasagne.nonlinearities.sigmoid)
    nn = DenseLayer(nn, num_units=20, nonlinearity=lasagne.nonlinearities.sigmoid)
    nn = DenseLayer(nn, num_units=2, nonlinearity=lasagne.nonlinearities.softmax)
    
    y_predicted     = get_output(nn)
    all_weights     = get_all_params(nn, trainable=True)
    loss            = lasagne.objectives.categorical_crossentropy(y_predicted, target_y).mean()
    updates         = lasagne.updates.adamax(loss, all_weights)
    train_fun       = theano.function([input_X, target_y],loss, updates= updates)
    loss_fun        = theano.function([input_X, target_y], loss)
    pred            = theano.function([input_X], y_predicted)
    
    batch_size = 100
    for epoch in range(45):
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(Xtrain, ytrain, batchsize=batch_size):
            inputs, targets = batch
            train_err_batch = train_fun(inputs, targets)
            train_err += train_err_batch
            train_batches += 1
    return pred(Xtrain)[:, 1], pred(Xtest)[:, 1]

In [400]:
def get_mapping(X, Xtest, y, num=1):
    all_mapping = {}
    y = y.values
    for col in X.columns:
        mapping = {}
        values = np.unique(X[col])
        index = dict(zip(values, range(len(values))))
        val_sum = np.zeros(len(values))
        val_n = np.zeros(len(values))
        x = X[col].values
        for i in range(len(y)):
            ind = index[x[i]]
            val_sum[ind] += y[i]
            val_n[ind] += 1
        for val in values:
            ind = index[val]
            if val_n[ind] < num:
                mapping[val] = 1
            else:
                mapping[val] = val_sum[ind] / val_n[ind]
        values = np.unique(Xtest[col])
        for val in values:
            if not val in mapping:
                mapping[val] = 1
        tmp = sorted([[i, j] for j, i in mapping.items()])
        all_mapping[col] = dict(zip([j for i, j in tmp], range(len(mapping))))
    return all_mapping

def apply_mapping(mapping, X, col_for_mapping):
    Xnew = X.copy()
    for col in col_for_mapping:
        Xnew[col] = X[col].map(mapping[col])
    return Xnew

In [397]:
cv = KFold(len(X), n_folds=5, shuffle=True, random_state=10)
columns = ['marital_status', 'job_position', 'tariff_id', 'education',   'conc']

In [382]:
%%time
proba = []
proba_all = []
for i in range(5):
    proba_new = pd.Series(index=X.index)
    for train, val in cv:

        Xtrain = X.iloc[train]
        Xval = X.iloc[val]
        ytrain = y.iloc[train]
        model = xgboost.XGBClassifier(n_estimators=820, 
                                       max_depth=6, 
                                       colsample_bytree=0.7, 
                                      subsample=0.9,
                                       learning_rate=0.03, 
                                       reg_alpha=1,
                                       seed=i)

        mapping = get_mapping(Xtrain[columns], Xval, ytrain, num=300)
        XtrainM = apply_mapping(mapping, Xtrain, columns)
        XvalM = apply_mapping(mapping, Xval, columns)

        model.fit(XtrainM,  ytrain)
        proba_new.iloc[val] = model.predict_proba(XvalM)[:,1]
    proba_all.append(proba_new)
    print(roc_auc_score(y, proba_new))
proba.append(np.mean(proba_all, axis=0))

0.775446514103
0.775337769289
0.775270838052
0.775396983867
0.775719968874
CPU times: user 1h 7min 40s, sys: 2min 46s, total: 1h 10min 27s
Wall time: 13min 31s


In [383]:
%%time
proba_all = []
for i in range(5):
    proba_new = pd.Series(index=X.index)
    for train, val in cv:

        Xtrain = X.iloc[train]
        Xval = X.iloc[val]
        ytrain = y.iloc[train]
        model = xgboost.XGBClassifier(n_estimators=700, 
                                       max_depth=7, 
                                       colsample_bytree=0.7, 
                                      subsample=0.9,
                                       learning_rate=0.03, 
                                       reg_alpha=1,
                                       seed=i)

        mapping = get_mapping(Xtrain[columns], Xval, ytrain, num=300)
        XtrainM = apply_mapping(mapping, Xtrain, columns)
        XvalM = apply_mapping(mapping, Xval, columns)

        model.fit(XtrainM,  ytrain)
        proba_new.iloc[val] = model.predict_proba(XvalM)[:,1]
    proba_all.append(proba_new)
    print(roc_auc_score(y, proba_new))
proba.append(np.mean(proba_all, axis=0))

0.775368674236
0.775343930934
0.775441507929
0.775396816553
0.775426084544
CPU times: user 1h 8min 34s, sys: 2min 57s, total: 1h 11min 32s
Wall time: 13min 35s


In [384]:
%%time
proba_all = []
for i in range(5):
    proba_new = pd.Series(index=X.index)
    for train, val in cv:

        Xtrain = X.iloc[train]
        Xval = X.iloc[val]
        ytrain = y.iloc[train]
        model = xgboost.XGBClassifier(n_estimators=700, 
                                       max_depth=7, 
                                       colsample_bytree=0.7, 
                                      subsample=0.9,
                                       learning_rate=0.03, 
                                       reg_alpha=1,
                                       seed=i)

        #mapping = get_mapping(Xtrain[columns], Xval, ytrain, num=300)
        #XtrainM = apply_mapping(mapping, Xtrain, columns)
        #XvalM = apply_mapping(mapping, Xval, columns)

        model.fit(Xtrain,  ytrain)
        proba_new.iloc[val] = model.predict_proba(Xval)[:,1]
    proba_all.append(proba_new)
    print(roc_auc_score(y, proba_new))
proba.append(np.mean(proba_all, axis=0))

0.774181498017
0.774213559973
0.774391893655
0.774302265903
0.774462212674
CPU times: user 1h 8min 36s, sys: 2min 57s, total: 1h 11min 34s
Wall time: 13min 30s


In [385]:
%%time
proba_new = pd.Series(index=X.index)
nn_proba = []
for train, val in cv:
    nn_test = []
    nn_train = []
    for i in range(5):
        p_train, p_test = proba_from_nn(X_linear[train], X_linear[val], y.iloc[train])
        nn_train.append(p_train)
        nn_test.append(p_test)     
    nn_proba.append((np.mean(nn_train, axis=0), np.mean(nn_test, axis=0)))
    proba_new.iloc[val] = nn_proba[-1][1]
proba.append(proba_new)
print(roc_auc_score(y, proba_new))

0.77181444496
CPU times: user 1h 36min 40s, sys: 23min 13s, total: 1h 59min 53s
Wall time: 33min 20s


In [403]:
%%time
proba_all = []

for i in range(5):
    proba_new = pd.Series(index=X.index)
    model = xgboost.XGBClassifier(n_estimators=500, 
                           max_depth=7, 
                           colsample_bytree=0.3, 
                           learning_rate=0.03, 
                           reg_alpha=4,
                           seed=i)
    for j, (train, val) in enumerate(cv):

        Xtrain = X.iloc[train]
        Xval = X.iloc[val]
        ytrain = y.iloc[train]
        mapping = get_mapping(Xtrain[columns], Xval, ytrain, num=300)
        XtrainM = apply_mapping(mapping, Xtrain, columns)
        XvalM = apply_mapping(mapping, Xval, columns)
        XtrainM['y'] = nn_proba[j][0]
        XvalM['y'] = nn_proba[j][1]
        
        model.fit(XtrainM,  ytrain)
        proba_new.iloc[val] = model.predict_proba(XvalM)[:,1]
    proba_all.append(proba_new)
proba.append(np.mean(proba_all, axis=0))

CPU times: user 28min 9s, sys: 1min 19s, total: 29min 28s
Wall time: 8min 44s


In [388]:
for p in proba:
    print(roc_auc_score(y, p))

0.775944419116
0.776075341713
0.775007122488
0.77181444496
0.775429455416


In [159]:
def create_blending_coefficient(y, proba, max_iter=200, verbose=False):
    B = np.transpose(proba)
    n = len(proba)
    a = np.array([20]*n)
    roc_old = roc_auc_score(y, (B * a).mean(axis=1))
    for i in range(max_iter):
        a[i % n] += 1
        roc_new = roc_auc_score(y, (B * a).mean(axis=1))
        if roc_new < roc_old:
            a[i % n] += -2
            roc_new = roc_auc_score(y, (B * a).mean(axis=1))
            if roc_new <= roc_old:
                a[i % n] += 1
                roc_new = roc_old
        roc_old = roc_new
        if verbose:
            print(a, roc_old)
    a[a < 0] = 0
    print(roc_old, a)
    return a / a.sum()

In [391]:
a = create_blending_coefficient(y, proba, verbose=True, max_iter=500)

[21 20 20 20 20] 0.777839828941
[21 21 20 20 20] 0.777843922278
[21 21 21 20 20] 0.777845559708
[21 21 21 19 20] 0.777849339938
[21 21 21 19 19] 0.777851128955
[22 21 21 19 19] 0.777851385779
[22 22 21 19 19] 0.77785294777
[22 22 22 19 19] 0.777853004763
[22 22 22 19 19] 0.777853004763
[22 22 22 19 18] 0.777853585809
[21 22 22 19 18] 0.777854498409
[21 23 22 19 18] 0.777855567563
[21 23 21 19 18] 0.777855868373
[21 23 21 19 18] 0.777855868373
[21 23 21 19 17] 0.777856246514
[20 23 21 19 17] 0.777857768775
[20 24 21 19 17] 0.777858228504
[20 24 20 19 17] 0.777858259011
[20 24 20 20 17] 0.777858303234
[20 24 20 20 16] 0.777858937962
[19 24 20 20 16] 0.777859818873
[19 25 20 20 16] 0.777861087621
[19 25 21 20 16] 0.777861261911
[19 25 21 20 16] 0.777861261911
[19 25 21 20 16] 0.777861261911
[18 25 21 20 16] 0.77786192691
[18 26 21 20 16] 0.777863277009
[18 26 20 20 16] 0.777863341333
[18 26 20 20 16] 0.777863341333
[18 26 20 20 15] 0.777863594137
[17 26 20 20 15] 0.777864484034
[17 27 20 

In [373]:
%%time
proba_test = []
proba_all = []

mapping = get_mapping(X[columns], Xtest, y, num=300)
XM = apply_mapping(mapping, X, columns)
XtestM = apply_mapping(mapping, Xtest, columns)
    
for i in range(30):
    
    model = xgboost.XGBClassifier(n_estimators=820, 
                                       max_depth=6, 
                                       colsample_bytree=0.7, 
                                      subsample=0.9,
                                       learning_rate=0.03, 
                                       reg_alpha=1,
                                       seed=i)
    model.fit(XM, y)
    proba_all.append(model.predict_proba(XtestM)[:,1])
proba_test.append(np.mean(proba_all, axis=0))

CPU times: user 1h 41min 37s, sys: 3min 18s, total: 1h 44min 56s
Wall time: 1h 28min 11s


In [374]:
%%time
proba_all = []
    
for i in range(30):
    
    model = xgboost.XGBClassifier(n_estimators=700, 
                                       max_depth=7, 
                                       colsample_bytree=0.7, 
                                      subsample=0.9,
                                       learning_rate=0.03, 
                                       reg_alpha=1,
                                       seed=i)
    model.fit(XM, y)
    proba_all.append(model.predict_proba(XtestM)[:,1])
proba_test.append(np.mean(proba_all, axis=0))

CPU times: user 1h 41min 19s, sys: 3min 26s, total: 1h 44min 45s
Wall time: 19min 41s


In [375]:
%%time
proba_all = []
    
for i in range(30):
    
    model = xgboost.XGBClassifier(n_estimators=700, 
                                       max_depth=7, 
                                       colsample_bytree=0.7, 
                                      subsample=0.9,
                                       learning_rate=0.03, 
                                       reg_alpha=1,
                                       seed=i)
    model.fit(X, y)
    proba_all.append(model.predict_proba(Xtest)[:,1])
proba_test.append(np.mean(proba_all, axis=0))

CPU times: user 1h 41min 16s, sys: 3min 21s, total: 1h 44min 38s
Wall time: 20min 9s


In [376]:
%%time


nn_test = []
nn_train = []
for i in range(30):
    p_train, p_test = proba_from_nn(X_linear, Xtest_linear, y)
    nn_train.append(p_train)
    nn_test.append(p_test)     
    
nn_train = np.mean(nn_train, axis=0)
nn_test = np.mean(nn_test, axis=0)
proba_test.append(nn_test)

CPU times: user 6h 38min 2s, sys: 2h 17min 10s, total: 8h 55min 12s
Wall time: 3h 3min 26s


In [378]:
%%time
proba_all = []
XM['y'] = nn_train
XtestM['y'] = nn_test
    
for i in range(30):
    model = xgboost.XGBClassifier(n_estimators=350, 
                               max_depth=7, 
                               colsample_bytree=0.3, 
                               learning_rate=0.03, 
                               reg_alpha=4,
                               seed=i)
    model.fit(XM, y)
    proba_all.append(model.predict_proba(XtestM)[:,1])
proba_test.append(np.mean(proba_all, axis=0))

CPU times: user 27min 30s, sys: 1min 21s, total: 28min 52s
Wall time: 6min 21s


In [401]:
submission = pd.DataFrame(Xtest.index)
submission.columns = ['_ID_']
submission['_VAL_'] = a.dot(proba_test)
submission.to_csv('A.csv', index=False)