## GBDT + FFM

In [1]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import datetime
import multiprocessing
import numpy as np
import pandas as pd
from scipy import interp
from patsy import dmatrices
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import ParameterGrid
from sklearn.externals.joblib import Parallel, delayed
from sklearn.tree._tree import DTYPE
from sklearn.metrics import roc_curve, auc

In [2]:
# Load the training data from CSV and preview the first rows.
train_df = pd.read_csv('data/train.csv')
train_df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,39,blue-collar,married,secondary,no,1756,yes,no,cellular,3,apr,939,1,-1,0,unknown,1
1,2,51,entrepreneur,married,primary,no,1443,no,no,cellular,18,feb,172,10,-1,0,unknown,1
2,3,36,management,single,tertiary,no,436,no,no,cellular,13,apr,567,1,595,2,failure,1
3,4,63,retired,married,secondary,no,474,no,no,cellular,25,jan,423,1,-1,0,unknown,1
4,5,31,management,single,tertiary,no,354,no,no,cellular,30,apr,502,1,9,2,success,1


In [3]:
def getDayOfYear(month, day): # ex. month = "mar", day = 15
    """Return the 1-based day of the year for a month abbreviation and a day.

    The date is resolved in the year 1900, which is the year ``strptime``
    silently assumed when the format carried no year. Spelling it out keeps
    the results unchanged while avoiding the year-less parsing that newer
    Python versions deprecate. 1900 is not a leap year, so ``"feb", 29`` is
    rejected and ``"mar", 1`` maps to 60.

    Args:
        month: Abbreviated month name as understood by ``%b`` (e.g. "mar").
        day: Day of the month (anything ``str.format`` renders as a number).

    Returns:
        int: Day of the year, 1 through 365.

    Raises:
        ValueError: If the month/day combination cannot be parsed.
    """
    parsed = datetime.datetime.strptime("1900 {} {}".format(month, day), '%Y %b %d')
    return parsed.timetuple().tm_yday

# Derive the day of the year from the 'month' and 'day' columns, row by row.
train_df['dayofyear'] = np.vectorize(getDayOfYear)(train_df['month'], train_df['day'])
# NOTE: this is the day of the year modulo 7 (a position in a 7-day cycle),
# not a weekday looked up from a calendar.
train_df['dayofweek'] = train_df['dayofyear'] % 7

In [4]:
def getPDayOfWeek(pdays):
    """Reduce ``pdays`` modulo 7, returning the value -1 unchanged.

    Args:
        pdays: Numeric value from the ``pdays`` column.

    Returns:
        -1 when ``pdays`` equals -1, otherwise ``pdays % 7``.
    """
    if pdays == -1:
        return -1
    return pdays % 7

# Apply getPDayOfWeek element-wise to 'pdays' to build the 'pdayofweek' column.
train_df['pdayofweek'] = np.vectorize(getPDayOfWeek)(train_df['pdays'])

In [5]:
# Convert the categorical variables into dummy variables.
# dmatrices builds the target frame (y_) and the design-matrix frame (X_)
# from the formula below.
y_, X_ = dmatrices('y ~ age + job + marital + education + default + balance + housing + loan + contact + day + month + dayofyear + dayofweek + duration + campaign + pdays + pdayofweek + previous + poutcome', data=train_df, return_type='dataframe')
X = X_.values  # design matrix as a plain array
y = y_.y.values.astype(int)  # target column cast to integers

X_.head()

Unnamed: 0,Intercept,job[T.blue-collar],job[T.entrepreneur],job[T.housemaid],job[T.management],job[T.retired],job[T.self-employed],job[T.services],job[T.student],job[T.technician],...,age,balance,day,dayofyear,dayofweek,duration,campaign,pdays,pdayofweek,previous
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.0,1756.0,3.0,93.0,2.0,939.0,1.0,-1.0,-1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,51.0,1443.0,18.0,49.0,0.0,172.0,10.0,-1.0,-1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,36.0,436.0,13.0,103.0,5.0,567.0,1.0,595.0,0.0,2.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,63.0,474.0,25.0,25.0,4.0,423.0,1.0,-1.0,-1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,31.0,354.0,30.0,120.0,1.0,502.0,1.0,9.0,2.0,2.0


In [9]:
class GbdtFfmClassifier:
    """Two-stage classifier: GBDT leaf indices fed into a libffm model.

    A ``GradientBoostingClassifier`` is fitted first. Each sample is then
    described by the leaf it reaches in every tree (one FFM field per tree),
    and the external ``ffm-train`` / ``ffm-predict`` executables located under
    ``self.ffm_path`` are run on text files holding those leaf features.

    All intermediate files use fixed paths under ``tmp/``, so every instance
    reads and writes the same files.
    """

    def __init__(self, n_estimators=100, max_depth=3, iter=15, factor=4):
        """Configure both stages and make sure the ``tmp`` directory exists.

        Args:
            n_estimators: Forwarded to ``GradientBoostingClassifier``.
            max_depth: Forwarded to ``GradientBoostingClassifier``.
            iter: Value passed to ``ffm-train`` as ``-t``.
            factor: Value passed to ``ffm-train`` as ``-k``.
        """
        self.gbdt_model = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth)

        self.iter = iter
        self.factor = factor

        self.ffm_path = 'libffm'
        self.train_path = 'tmp/ffm_train'
        self.test_path = 'tmp/ffm_test'
        self.model_path = 'tmp/ffm_model'
        self.predict_path = 'tmp/ffm_prediction'

        if not os.path.exists('tmp'):
            os.makedirs('tmp')

    def fit(self, X, y):
        """Fit the GBDT on ``(X, y)``, then train the FFM on its leaf indices.

        Args:
            X: Feature matrix accepted by ``GradientBoostingClassifier.fit``.
            y: Labels, indexable by row position.

        Returns:
            self

        Raises:
            subprocess.CalledProcessError: If ``ffm-train`` exits non-zero.
        """
        self.gbdt_model.fit(X, y)
        X2 = self.getEachTreeDecision(X)
        self.makeFfmFile(self.train_path, X2, y)
        self._runFfm('ffm-train',
                     '-s', multiprocessing.cpu_count(),
                     '-k', self.factor,
                     '-t', self.iter,
                     self.train_path, self.model_path)
        return self

    def predict_proba(self, X):
        """Score ``X`` with the trained GBDT + FFM pipeline.

        Unlike scikit-learn's ``predict_proba`` the result is one-dimensional:
        one float per row, exactly as written by ``ffm-predict``.

        Args:
            X: Feature matrix with the same columns used in ``fit``.

        Returns:
            numpy.ndarray: 1-D float array with one prediction per row of X.

        Raises:
            subprocess.CalledProcessError: If ``ffm-predict`` exits non-zero.
        """
        X2 = self.getEachTreeDecision(X)
        # The libffm line format starts with a label, so a placeholder column
        # is written. The true labels are not available at prediction time,
        # and the previous code picked up an unrelated global ``y`` here.
        placeholder_labels = np.zeros(len(X2), dtype=int)
        self.makeFfmFile(self.test_path, X2, placeholder_labels)
        self._runFfm('ffm-predict', self.test_path, self.model_path, self.predict_path)
        return self._readPredictions(self.predict_path)

    def makeFfmFile(self, path, X2, y=None):
        """Write leaf indices to ``path`` in ``field:feature:value`` text form.

        Every row of ``X2`` becomes one line. Column ``j`` is emitted as
        ``"j:<leaf index>:1.0"``; when ``y`` is given, ``str(y[i])`` is put in
        front of row ``i``.

        Args:
            path: Destination file; overwritten if it exists.
            X2: 2-D sequence of leaf indices (rows are samples).
            y: Optional labels indexable by row position.
        """
        # NOTE(review): the raw leaf index is used as the feature id, so equal
        # leaf numbers in different trees share a feature id (in different
        # fields) - confirm this is intended.
        with open(path, 'w') as fw:
            for i, row in enumerate(X2):
                tokens = [] if y is None else [str(y[i])]
                tokens.extend("{}:{}:1.0".format(field, leaf)
                              for field, leaf in enumerate(row))
                fw.write(" ".join(tokens) + "\n")

    def getEachTreeDecision(self, X):
        """Return the leaf index each sample reaches in every boosted tree.

        Args:
            X: Feature matrix for the fitted GBDT.

        Returns:
            numpy.ndarray: Array of shape (n_samples, n_trees).
        """
        # Iterate over the trees that were actually fitted (first output
        # column) instead of assuming there are exactly n_estimators of them.
        trees = [estimator.tree_ for estimator in self.gbdt_model.estimators_[:, 0]]
        out = Parallel(n_jobs=4)(delayed(self.my_func)(tree, X) for tree in trees)
        return np.transpose(np.array(out))

    def my_func(self, tree, X):
        """Apply one low-level tree to ``X`` after casting it to ``DTYPE``."""
        return tree.apply(np.array(X, dtype=DTYPE))

    def _runFfm(self, tool, *args):
        """Run a libffm executable from ``self.ffm_path`` and fail loudly."""
        # Local import keeps this class self-contained within the notebook.
        import subprocess

        command = [os.path.join(self.ffm_path, tool)] + [str(arg) for arg in args]
        # check_call raises on a non-zero exit status, so a failed run cannot
        # silently leave a stale model or prediction file to be read later.
        subprocess.check_call(command)

    @staticmethod
    def _readPredictions(path):
        """Read one float per non-blank line of ``path`` into a 1-D array."""
        with open(path, 'r') as fr:
            return np.array([float(line) for line in fr if line.strip()])

In [7]:
def kFoldAuc(classifier, n_folds=6):
    """Estimate the AUC of ``classifier`` with stratified k-fold CV.

    Uses the notebook-level ``X`` and ``y``. For every fold the classifier is
    refitted, its ROC curve is interpolated onto a common grid of 100 false
    positive rates, and the curves are averaged; the AUC of that mean curve
    is printed and returned.

    Args:
        classifier: Object with ``fit(X, y)`` and a ``predict_proba(X)`` that
            returns one score per row (a 1-D array, as ``roc_curve`` expects).
        n_folds: Number of stratified folds.

    Returns:
        float: AUC of the fold-averaged ROC curve.
    """
    cv = StratifiedKFold(y, n_folds=n_folds)

    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = 0.0

    print("Calculating fold: ", end="")
    for i, (train, test) in enumerate(cv):
        print(i+1, end=" ")
        classifier.fit(X[train], y[train])
        # Scores are used as returned; no positive-class column is selected.
        probas_ = classifier.predict_proba(X[test])

        fpr, tpr, _ = roc_curve(y[test], probas_)
        # np.interp is the function the deprecated scipy.interp alias pointed to.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0  # pin the curve to the origin

    print("done!", end="\t")

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    print("AUC: {}".format(mean_auc))
    print("")
    return mean_auc

In [10]:
# Cross-validate GbdtFfmClassifier for every parameter combination in the grid.
for p in ParameterGrid({'n_estimators': [100, 500], 'max_depth': [7], 'iter': [15, 30], 'factor': [4,6,8]}):
    print('Params: {}'.format(p))
    # Arguments are positional, in the constructor's order:
    # n_estimators, max_depth, iter, factor.
    gbdt_ffm_model = GbdtFfmClassifier(p['n_estimators'], p['max_depth'], p['iter'], p['factor'])
    kFoldAuc(gbdt_ffm_model)

Params: {'n_estimators': 100, 'iter': 15, 'factor': 4, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9343307046520656

Params: {'n_estimators': 500, 'iter': 15, 'factor': 4, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9219383501047018

Params: {'n_estimators': 100, 'iter': 30, 'factor': 4, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9330672827837287

Params: {'n_estimators': 500, 'iter': 30, 'factor': 4, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9243283496591626

Params: {'n_estimators': 100, 'iter': 15, 'factor': 6, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9347698790042835

Params: {'n_estimators': 500, 'iter': 15, 'factor': 6, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9215628242093271

Params: {'n_estimators': 100, 'iter': 30, 'factor': 6, 'max_depth': 7}
Calculating fold: 1 2 3 4 5 6 done!	AUC: 0.9336846728151076

Params: {'n_estimators': 500, 'iter': 30, 'factor': 6, 'max_depth': 7}
Calcu