In [8]:
#encoding=utf-8
import pickle
import sys

import jieba
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# The project wrappers live in sibling directories; these must be on sys.path
# before the local imports below (and before unpickling the saved wrappers).
sys.path.append('../CutDebt/')
sys.path.append('../IDClassifier/')
sys.path.append('../IfKnowDebtor/')
sys.path.append('../Installment/')
sys.path.append('../SetDueDay/')
sys.path.append('../WillingToPay/')

from CutDebt_py import CutDebt
from IDClassifier_py import IDClassifier
from IfKnowDebtor_py import IfKnowDebtor
from Installment_py import Installment
from SetDueDay_py import SetDueDay
from WillingToPay_py import WillingToPay

In [23]:
model_list = ['CutDebt','IDClassifier','IfKnowDebtor','Installment','SetDueDay','WillingToPay']
func_list = [CutDebt,IDClassifier,IfKnowDebtor,Installment,SetDueDay,WillingToPay]


def _lgbm_params(num_class):
    """Return the LightGBM training parameters shared by every model.

    Only ``num_class`` differs between the models, so it is the sole argument.
    """
    return {
        'learning_rate': 0.2,
        'num_iterations': 1000,
        'application': 'multiclassova',
        'num_class': num_class,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'multi_error',
        'data_random_seed': 2,
        'nthread': 4,
        'lambda_l1': 1,
        'lambda_l2': 1,
        'early_stopping_round': 200,
    }


def _dump_pickle(obj, file_path):
    """Pickle ``obj`` to ``file_path``, closing the file handle when done."""
    with open(file_path, "wb") as fh:
        pickle.dump(obj, fh)


# For each task: fit a TF-IDF vectorizer, train three classifiers on the
# TF-IDF features (calibrated LinearSVC, logistic regression, LightGBM),
# pickle each artefact, then pickle the task wrapper built from all of them.
for each_model, func in zip(model_list, func_list):
    print(each_model)

    # load data
    data_dir = '../../data/{}/'.format(each_model)
    data = pd.read_csv(data_dir + 'cleaned_mock_up_data.csv', encoding='utf8')

    # get tfidf (word 1- to 3-grams over the `split_text` column)
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                        strip_accents='unicode',
                                        max_features=100000,
                                        analyzer='word',
                                        sublinear_tf=True,
                                        token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(data.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(data.split_text)

    # linear svc, wrapped so that it exposes calibrated probabilities
    l_svc = LinearSVC()
    clf = CalibratedClassifierCV(l_svc)
    clf.fit(phrase, data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, data.label)

    # lightGBM: multiclass objectives need integer labels in [0, num_class)
    le = preprocessing.LabelEncoder()
    le.fit(data.label.values)
    onelabels = le.transform(data.label.values)

    # WillingToPay is the only task trained with four classes; the rest use three.
    num_class = 4 if each_model == 'WillingToPay' else 3
    params = _lgbm_params(num_class)

    lgbm_train = lgb.Dataset(phrase, onelabels)
    # NOTE(review): the validation set is built from the same rows as the
    # training set, so the reported multi_error is training error and early
    # stopping is not measuring generalisation. Left as-is to keep the trained
    # models unchanged; consider a held-out split.
    lgbm_val = lgb.Dataset(phrase, onelabels)
    lgbm_model = lgb.train(params, lgbm_train, valid_sets=lgbm_val, verbose_eval=5)

    print('finish training')

    # save model
    save_dir = '../../savedModel/{}/'.format(each_model)
    # save tfidf
    _dump_pickle(phrase_vectorizer, save_dir + "tfidf.pickle")
    # save linear svc
    _dump_pickle(clf, save_dir + "LinearSVC.pickle")
    # save logistic
    _dump_pickle(log_r, save_dir + "Logistic.pickle")
    # save lightGBM
    _dump_pickle(lgbm_model, save_dir + "Lgbm.pickle")

    # save the task wrapper that bundles the vectorizer and the three models
    result = func(svc=clf, logistic=log_r, lightgbm=lgbm_model, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt')
    _dump_pickle(result, save_dir + each_model + '.pickle')
    
    

CutDebt
fitting phrase
transform phrase




Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.261769
[10]	valid_0's multi_error: 0.254187
[15]	valid_0's multi_error: 0.243315
[20]	valid_0's multi_error: 0.234852
[25]	valid_0's multi_error: 0.227505
[30]	valid_0's multi_error: 0.222274
[35]	valid_0's multi_error: 0.217338
[40]	valid_0's multi_error: 0.211637
[45]	valid_0's multi_error: 0.204761
[50]	valid_0's multi_error: 0.198883
[55]	valid_0's multi_error: 0.196121
[60]	valid_0's multi_error: 0.192183
[65]	valid_0's multi_error: 0.189186
[70]	valid_0's multi_error: 0.185425
[75]	valid_0's multi_error: 0.182427
[80]	valid_0's multi_error: 0.180488
[85]	valid_0's multi_error: 0.178607
[90]	valid_0's multi_error: 0.17749
[95]	valid_0's multi_error: 0.175375
[100]	valid_0's multi_error: 0.173905
[105]	valid_0's multi_error: 0.171848
[110]	valid_0's multi_error: 0.170379
[115]	valid_0's multi_error: 0.168146
[120]	valid_0's multi_error: 0.167029
[125]	valid_0's multi_error: 0.16509
[130]	val



[65]	valid_0's multi_error: 0.0780488
[70]	valid_0's multi_error: 0.0759582
[75]	valid_0's multi_error: 0.0745645
[80]	valid_0's multi_error: 0.0724739
[85]	valid_0's multi_error: 0.0703833
[90]	valid_0's multi_error: 0.0703833
[95]	valid_0's multi_error: 0.0675958
[100]	valid_0's multi_error: 0.0675958
[105]	valid_0's multi_error: 0.0689895
[110]	valid_0's multi_error: 0.066899
[115]	valid_0's multi_error: 0.066899
[120]	valid_0's multi_error: 0.0641115
[125]	valid_0's multi_error: 0.0620209
[130]	valid_0's multi_error: 0.0620209
[135]	valid_0's multi_error: 0.0620209
[140]	valid_0's multi_error: 0.0627178
[145]	valid_0's multi_error: 0.0627178
[150]	valid_0's multi_error: 0.0620209
[155]	valid_0's multi_error: 0.0620209
[160]	valid_0's multi_error: 0.0620209
[165]	valid_0's multi_error: 0.0620209
[170]	valid_0's multi_error: 0.0620209
[175]	valid_0's multi_error: 0.0620209
[180]	valid_0's multi_error: 0.0620209
[185]	valid_0's multi_error: 0.0620209
[190]	valid_0's multi_error: 0.062



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.24887
[10]	valid_0's multi_error: 0.234675
[15]	valid_0's multi_error: 0.224859
[20]	valid_0's multi_error: 0.21822
[25]	valid_0's multi_error: 0.210946
[30]	valid_0's multi_error: 0.20685
[35]	valid_0's multi_error: 0.201342
[40]	valid_0's multi_error: 0.197811
[45]	valid_0's multi_error: 0.194209
[50]	valid_0's multi_error: 0.19096
[55]	valid_0's multi_error: 0.18863
[60]	valid_0's multi_error: 0.186723
[65]	valid_0's multi_error: 0.183545
[70]	valid_0's multi_error: 0.181073
[75]	valid_0's multi_error: 0.178743
[80]	valid_0's multi_error: 0.177331
[85]	valid_0's multi_error: 0.175
[90]	valid_0's multi_error: 0.173658
[95]	valid_0's multi_error: 0.17154
[100]	valid_0's multi_error: 0.169633
[105]	valid_0's multi_error: 0.168715
[110]	valid_0's multi_error: 0.167232
[115]	valid_0's multi_error: 0.16596
[120]	valid_0's multi_error: 0.165113
[125]	valid_0's multi_error: 0.163489
[130]	valid_0's m



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.261769
[10]	valid_0's multi_error: 0.254187
[15]	valid_0's multi_error: 0.243315
[20]	valid_0's multi_error: 0.234852
[25]	valid_0's multi_error: 0.227505
[30]	valid_0's multi_error: 0.222274
[35]	valid_0's multi_error: 0.217338
[40]	valid_0's multi_error: 0.211637
[45]	valid_0's multi_error: 0.204761
[50]	valid_0's multi_error: 0.198883
[55]	valid_0's multi_error: 0.196121
[60]	valid_0's multi_error: 0.192183
[65]	valid_0's multi_error: 0.189186
[70]	valid_0's multi_error: 0.185425
[75]	valid_0's multi_error: 0.182427
[80]	valid_0's multi_error: 0.180488
[85]	valid_0's multi_error: 0.178607
[90]	valid_0's multi_error: 0.17749
[95]	valid_0's multi_error: 0.175375
[100]	valid_0's multi_error: 0.173905
[105]	valid_0's multi_error: 0.171848
[110]	valid_0's multi_error: 0.170379
[115]	valid_0's multi_error: 0.168146
[120]	valid_0's multi_error: 0.167029
[125]	valid_0's multi_error: 0.16509
[130]	val



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.26815
[10]	valid_0's multi_error: 0.252849
[15]	valid_0's multi_error: 0.240125
[20]	valid_0's multi_error: 0.231928
[25]	valid_0's multi_error: 0.223966
[30]	valid_0's multi_error: 0.218189
[35]	valid_0's multi_error: 0.212881
[40]	valid_0's multi_error: 0.206401
[45]	valid_0's multi_error: 0.198439
[50]	valid_0's multi_error: 0.193677
[55]	valid_0's multi_error: 0.188525
[60]	valid_0's multi_error: 0.185246
[65]	valid_0's multi_error: 0.182436
[70]	valid_0's multi_error: 0.180328
[75]	valid_0's multi_error: 0.179235
[80]	valid_0's multi_error: 0.177283
[85]	valid_0's multi_error: 0.174317
[90]	valid_0's multi_error: 0.171975
[95]	valid_0's multi_error: 0.170648
[100]	valid_0's multi_error: 0.168384
[105]	valid_0's multi_error: 0.167057
[110]	valid_0's multi_error: 0.164793
[115]	valid_0's multi_error: 0.1637
[120]	valid_0's multi_error: 0.162451
[125]	valid_0's multi_error: 0.161436
[130]	vali



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.362798
[10]	valid_0's multi_error: 0.345871
[15]	valid_0's multi_error: 0.334352
[20]	valid_0's multi_error: 0.321716
[25]	valid_0's multi_error: 0.310961
[30]	valid_0's multi_error: 0.298795
[35]	valid_0's multi_error: 0.29139
[40]	valid_0's multi_error: 0.283926
[45]	valid_0's multi_error: 0.277814
[50]	valid_0's multi_error: 0.271878
[55]	valid_0's multi_error: 0.265413
[60]	valid_0's multi_error: 0.261593
[65]	valid_0's multi_error: 0.256538
[70]	valid_0's multi_error: 0.251602
[75]	valid_0's multi_error: 0.248604
[80]	valid_0's multi_error: 0.244549
[85]	valid_0's multi_error: 0.242492
[90]	valid_0's multi_error: 0.239024
[95]	valid_0's multi_error: 0.236791
[100]	valid_0's multi_error: 0.23444
[105]	valid_0's multi_error: 0.232031
[110]	valid_0's multi_error: 0.229738
[115]	valid_0's multi_error: 0.226565
[120]	valid_0's multi_error: 0.224567
[125]	valid_0's multi_error: 0.222274
[130]	val

In [9]:
def _load_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling raises.
    """
    # NOTE: pickle.load can execute arbitrary code. Only load files written by
    # the training loop in this notebook, never files from an untrusted source.
    with open(file_path, 'rb') as fh:
        return pickle.load(fh)


# Reload the six task wrappers saved under ../../savedModel/<task>/<task>.pickle.
idc = _load_pickle("../../savedModel/IDClassifier/IDClassifier.pickle")
cutd = _load_pickle("../../savedModel/CutDebt/CutDebt.pickle")
ifk = _load_pickle("../../savedModel/IfKnowDebtor/IfKnowDebtor.pickle")
will = _load_pickle("../../savedModel/WillingToPay/WillingToPay.pickle")
inst = _load_pickle("../../savedModel/Installment/Installment.pickle")
setd = _load_pickle("../../savedModel/SetDueDay/SetDueDay.pickle")

In [24]:
# Smoke-test the reloaded IDClassifier wrapper on one short utterance; the
# value returned by classify() is displayed as the cell output.
sample_utterance = '我不是'
idc.classify(sample_utterance)

(1, [1, 0.99945433087829494])

In [29]:
# Segment a sample sentence with jieba (cut_all=False, i.e. not full mode) and
# display the tokens joined by single spaces. jieba.cut returns a generator,
# so seg_list is consumed by the join below.
seg_list = jieba.cut('我也没有啊', cut_all=False)
" ".join(seg_list)

'我 也 没有 啊'