In [12]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

import pickle

import sys
# Put every sibling model directory on the import path (order preserved) so
# the `*_py` modules imported below can be resolved from this notebook.
sys.path.extend([
    '../CutDebt/',
    '../IDClassifier/',
    '../IfKnowDebtor/',
    '../Installment/',
    '../SetDueDay/',
    '../WillingToPay/',
])

from CutDebt_py import CutDebt
from IDClassifier_py import IDClassifier
from IfKnowDebtor_py import IfKnowDebtor
from Installment_py import Installment
from SetDueDay_py import SetDueDay
from WillingToPay_py import WillingToPay

In [14]:
# Train and persist one classifier ensemble per model name.
#
# For each entry of `model_list` this cell:
#   1. reads ../../data/<model>/cleaned_mock_up_data.csv (columns used:
#      `split_text` and `label`),
#   2. fits a word-level 1-3 gram TF-IDF vectorizer on `split_text`,
#   3. trains a calibrated LinearSVC, a LogisticRegression and a LightGBM
#      booster on the TF-IDF matrix,
#   4. pickles the vectorizer, the three classifiers and the object returned
#      by the matching callable from `func_list` into ../../savedModel/<model>/.


def _dump_pickle(obj, file_path):
    """Pickle `obj` to `file_path`, guaranteeing the file handle is closed.

    The context manager flushes and closes the file even if pickling raises,
    so no handle is leaked and no partially buffered file is left open.
    """
    with open(file_path, "wb") as handle:
        pickle.dump(obj, handle)


model_list = ['CutDebt','IDClassifier','IfKnowDebtor','Installment','SetDueDay','WillingToPay']
func_list = [CutDebt,IDClassifier,IfKnowDebtor,Installment,SetDueDay,WillingToPay]

# Shared LightGBM settings; only `num_class` differs between models.
# `bagging_fraction` / `feature_fraction` are not set (they were commented out
# with the values 0.8 / 0.6).
lgbm_param_template = {
    'learning_rate': 0.2,
    'num_iterations': 1000,
    'application': 'multiclassova',
    'num_class': 3,
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'multi_error',
    'data_random_seed': 2,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'early_stopping_round': 200,
}

path = '../../data/{}/'
save_path = '../../savedModel/{}/'

# zip keeps each model name paired with its wrapper callable without a
# hand-maintained index counter.
for each_model, func in zip(model_list, func_list):

    # load data
    data = pd.read_csv(path.format(each_model) + 'cleaned_mock_up_data.csv', encoding='utf8')

    # get tfidf
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                        strip_accents='unicode',
                                        max_features=100000,
                                        analyzer='word',
                                        sublinear_tf=True,
                                        token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(data.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(data.split_text)

    # linear svc, wrapped so the saved model can emit calibrated probabilities
    l_svc = LinearSVC()
    clf = CalibratedClassifierCV(l_svc)
    clf.fit(phrase, data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, data.label)

    # lightGBM: labels are re-encoded to consecutive integers first
    le = preprocessing.LabelEncoder()
    le.fit(data.label.values)
    onelabels = le.transform(data.label.values)

    # Build a fresh dict per model so the shared template is never mutated;
    # 'WillingToPay' is configured with 4 classes, every other model with 3.
    params = dict(lgbm_param_template,
                  num_class=4 if each_model == 'WillingToPay' else 3)

    lgbm_train = lgb.Dataset(phrase, onelabels)
    # NOTE(review): the validation set is built from the same matrix and labels
    # as the training set, so the reported multi_error and the early-stopping
    # criterion reflect training error, not held-out error.
    lgbm_val = lgb.Dataset(phrase, onelabels)
    lgbm_model = lgb.train(params,lgbm_train,valid_sets=lgbm_val, verbose_eval=5)

    print('finish training')

    # save model
    model_dir = save_path.format(each_model)
    # save tfidf
    _dump_pickle(phrase_vectorizer, model_dir + "tfidf.pickle")
    # save linear svc
    _dump_pickle(clf, model_dir + "LinearSVC.pickle")
    # save logistic
    _dump_pickle(log_r, model_dir + "Logistic.pickle")
    # save lightGBM
    _dump_pickle(lgbm_model, model_dir + "Lgbm.pickle")
    # save the project-level wrapper built from the fitted components
    result = func(svc=clf, logistic=log_r, lightgbm=lgbm_model, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt')
    _dump_pickle(result, model_dir + each_model + '.pickle')
    
    

fitting phrase
transform phrase
Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.219969
[10]	valid_0's multi_error: 0.201872
[15]	valid_0's multi_error: 0.185023




[20]	valid_0's multi_error: 0.165679
[25]	valid_0's multi_error: 0.157878
[30]	valid_0's multi_error: 0.145086
[35]	valid_0's multi_error: 0.137598
[40]	valid_0's multi_error: 0.132605
[45]	valid_0's multi_error: 0.127613
[50]	valid_0's multi_error: 0.125429
[55]	valid_0's multi_error: 0.122309
[60]	valid_0's multi_error: 0.120125
[65]	valid_0's multi_error: 0.117005
[70]	valid_0's multi_error: 0.114821
[75]	valid_0's multi_error: 0.113573
[80]	valid_0's multi_error: 0.1117
[85]	valid_0's multi_error: 0.11014
[90]	valid_0's multi_error: 0.108892
[95]	valid_0's multi_error: 0.107956
[100]	valid_0's multi_error: 0.10546
[105]	valid_0's multi_error: 0.104836
[110]	valid_0's multi_error: 0.104524
[115]	valid_0's multi_error: 0.1039
[120]	valid_0's multi_error: 0.102964
[125]	valid_0's multi_error: 0.103276
[130]	valid_0's multi_error: 0.102028
[135]	valid_0's multi_error: 0.102028
[140]	valid_0's multi_error: 0.10078
[145]	valid_0's multi_error: 0.100156
[150]	valid_0's multi_error: 0.1001

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


[185]	valid_0's multi_error: 0.0982839
[190]	valid_0's multi_error: 0.0982839
[195]	valid_0's multi_error: 0.0982839
[200]	valid_0's multi_error: 0.0982839
[205]	valid_0's multi_error: 0.0982839
[210]	valid_0's multi_error: 0.0982839
[215]	valid_0's multi_error: 0.0982839
[220]	valid_0's multi_error: 0.0982839
[225]	valid_0's multi_error: 0.0982839
[230]	valid_0's multi_error: 0.0982839
[235]	valid_0's multi_error: 0.0982839
[240]	valid_0's multi_error: 0.0982839
[245]	valid_0's multi_error: 0.0982839
[250]	valid_0's multi_error: 0.0982839
[255]	valid_0's multi_error: 0.0982839
[260]	valid_0's multi_error: 0.0982839
[265]	valid_0's multi_error: 0.0982839
[270]	valid_0's multi_error: 0.0982839
[275]	valid_0's multi_error: 0.0982839
[280]	valid_0's multi_error: 0.0982839
[285]	valid_0's multi_error: 0.0982839
[290]	valid_0's multi_error: 0.0982839
[295]	valid_0's multi_error: 0.0982839
[300]	valid_0's multi_error: 0.0982839
[305]	valid_0's multi_error: 0.0982839
[310]	valid_0's multi_err

Loading model cost 0.456 seconds.
Prefix dict has been built succesfully.


fitting phrase
transform phrase
Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.148432
[10]	valid_0's multi_error: 0.12892
[15]	valid_0's multi_error: 0.110801
[20]	valid_0's multi_error: 0.100348
[25]	valid_0's multi_error: 0.0885017
[30]	valid_0's multi_error: 0.0815331
[35]	valid_0's multi_error: 0.0808362
[40]	valid_0's multi_error: 0.0745645
[45]	valid_0's multi_error: 0.0738676
[50]	valid_0's multi_error: 0.0731707
[55]	valid_0's multi_error: 0.0724739
[60]	valid_0's multi_error: 0.0710801
[65]	valid_0's multi_error: 0.0675958
[70]	valid_0's multi_error: 0.0675958
[75]	valid_0's multi_error: 0.066899
[80]	valid_0's multi_error: 0.0655052
[85]	valid_0's multi_error: 0.0648084
[90]	valid_0's multi_error: 0.0634146
[95]	valid_0's multi_error: 0.0634146
[100]	valid_0's multi_error: 0.0634146
[105]	valid_0's multi_error: 0.0634146
[110]	valid_0's multi_error: 0.0620209
[115]	valid_0's multi_error: 0.061324
[120]	valid_0's multi_error: 0.0627



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.140132
[10]	valid_0's multi_error: 0.124342
[15]	valid_0's multi_error: 0.105044
[20]	valid_0's multi_error: 0.0997807
[25]	valid_0's multi_error: 0.0942982
[30]	valid_0's multi_error: 0.0916667
[35]	valid_0's multi_error: 0.0894737
[40]	valid_0's multi_error: 0.0848684
[45]	valid_0's multi_error: 0.0826754
[50]	valid_0's multi_error: 0.0807018
[55]	valid_0's multi_error: 0.0789474
[60]	valid_0's multi_error: 0.077193
[65]	valid_0's multi_error: 0.0767544
[70]	valid_0's multi_error: 0.0754386
[75]	valid_0's multi_error: 0.0741228
[80]	valid_0's multi_error: 0.0732456
[85]	valid_0's multi_error: 0.0719298
[90]	valid_0's multi_error: 0.0714912
[95]	valid_0's multi_error: 0.0701754
[100]	valid_0's multi_error: 0.0703947
[105]	valid_0's multi_error: 0.0701754
[110]	valid_0's multi_error: 0.0701754
[115]	valid_0's multi_error: 0.0692982
[120]	valid_0's multi_error: 0.0688596
[125]	valid_0's multi_err



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.201563
[10]	valid_0's multi_error: 0.175625
[15]	valid_0's multi_error: 0.16125
[20]	valid_0's multi_error: 0.14875
[25]	valid_0's multi_error: 0.142187
[30]	valid_0's multi_error: 0.138125
[35]	valid_0's multi_error: 0.132188
[40]	valid_0's multi_error: 0.127812
[45]	valid_0's multi_error: 0.126562
[50]	valid_0's multi_error: 0.123125
[55]	valid_0's multi_error: 0.119375
[60]	valid_0's multi_error: 0.115625
[65]	valid_0's multi_error: 0.113437
[70]	valid_0's multi_error: 0.112187
[75]	valid_0's multi_error: 0.109063
[80]	valid_0's multi_error: 0.108125
[85]	valid_0's multi_error: 0.107188
[90]	valid_0's multi_error: 0.107188
[95]	valid_0's multi_error: 0.10625
[100]	valid_0's multi_error: 0.104375
[105]	valid_0's multi_error: 0.104688
[110]	valid_0's multi_error: 0.10375
[115]	valid_0's multi_error: 0.101562
[120]	valid_0's multi_error: 0.10125
[125]	valid_0's multi_error: 0.100937
[130]	valid_



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.209657
[10]	valid_0's multi_error: 0.187227
[15]	valid_0's multi_error: 0.175701
[20]	valid_0's multi_error: 0.164174
[25]	valid_0's multi_error: 0.154517
[30]	valid_0's multi_error: 0.142368
[35]	valid_0's multi_error: 0.137383
[40]	valid_0's multi_error: 0.13271
[45]	valid_0's multi_error: 0.129283
[50]	valid_0's multi_error: 0.123676
[55]	valid_0's multi_error: 0.120561
[60]	valid_0's multi_error: 0.118692
[65]	valid_0's multi_error: 0.115888
[70]	valid_0's multi_error: 0.114019
[75]	valid_0's multi_error: 0.113084
[80]	valid_0's multi_error: 0.111526
[85]	valid_0's multi_error: 0.109969
[90]	valid_0's multi_error: 0.11028
[95]	valid_0's multi_error: 0.107788
[100]	valid_0's multi_error: 0.1081
[105]	valid_0's multi_error: 0.107477
[110]	valid_0's multi_error: 0.106854
[115]	valid_0's multi_error: 0.106542
[120]	valid_0's multi_error: 0.105607
[125]	valid_0's multi_error: 0.103738
[130]	valid



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.242467
[10]	valid_0's multi_error: 0.219929
[15]	valid_0's multi_error: 0.197628
[20]	valid_0's multi_error: 0.181495
[25]	valid_0's multi_error: 0.174852
[30]	valid_0's multi_error: 0.164176
[35]	valid_0's multi_error: 0.159905
[40]	valid_0's multi_error: 0.150652
[45]	valid_0's multi_error: 0.144958
[50]	valid_0's multi_error: 0.139027
[55]	valid_0's multi_error: 0.131673
[60]	valid_0's multi_error: 0.128826
[65]	valid_0's multi_error: 0.125741
[70]	valid_0's multi_error: 0.119336
[75]	valid_0's multi_error: 0.116726
[80]	valid_0's multi_error: 0.114116
[85]	valid_0's multi_error: 0.113405
[90]	valid_0's multi_error: 0.107948
[95]	valid_0's multi_error: 0.103677
[100]	valid_0's multi_error: 0.101779
[105]	valid_0's multi_error: 0.100593
[110]	valid_0's multi_error: 0.0986951
[115]	valid_0's multi_error: 0.0982206
[120]	valid_0's multi_error: 0.0951364
[125]	valid_0's multi_error: 0.0951364
[13