In [2]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

import pickle

import sys
sys.path.append('../CutDebt/')
sys.path.append('../IDClassifier/')
sys.path.append('../IfKnowDebtor/')
sys.path.append('../Installment/')
sys.path.append('../SetDueDay/')
sys.path.append('../WillingToPay/')

from CutDebt_py import CutDebt
from IDClassifier_py import IDClassifier
from IfKnowDebtor_py import IfKnowDebtor
from Installment_py import Installment
from SetDueDay_py import SetDueDay
from WillingToPay_py import WillingToPay

In [2]:
def _save_pickle(obj, file_path):
    """Pickle ``obj`` to ``file_path`` and close the file deterministically.

    The ``with`` block guarantees the handle is flushed and closed even if
    ``pickle.dump`` raises, instead of relying on garbage collection.
    """
    with open(file_path, "wb") as handle:
        pickle.dump(obj, handle)


# Task names and the wrapper class that bundles the trained models for each
# task. The two lists are parallel: model_list[i] is wrapped by func_list[i].
model_list = ['CutDebt','IDClassifier','IfKnowDebtor','Installment','SetDueDay','WillingToPay']
func_list = [CutDebt,IDClassifier,IfKnowDebtor,Installment,SetDueDay,WillingToPay]

# Loop-invariant path templates; '{}' is filled with the task name.
path = '../../data/{}/'
save_path = '../../savedModel/{}/'

# zip keeps the task name and its wrapper class in lock-step without a
# hand-maintained index counter.
for each_model, func in zip(model_list, func_list):
    print(each_model)

    # load data
    data = pd.read_csv(path.format(each_model) + 'cleaned_mock_up_data.csv', encoding='utf8')

    # get tfidf: word-level 1- to 3-grams over the `split_text` column
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode',
                                    max_features=100000,
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(data.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(data.split_text)

    # linear svc, wrapped in a calibrator so it can produce probabilities
    l_svc = LinearSVC()
    clf = CalibratedClassifierCV(l_svc)
    clf.fit(phrase, data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, data.label)

    # lightGBM: encode the labels as consecutive integers 0..n_classes-1
    le = preprocessing.LabelEncoder()
    le.fit(data.label.values)
    onelabels = le.transform(data.label.values)

    # The parameter set is shared by all tasks; only the class count differs
    # (WillingToPay is trained with 4 classes, every other task with 3).
    # bagging_fraction / feature_fraction are deliberately left unset (they
    # were commented out in the earlier version with 0.8 / 0.6).
    params = {
        'learning_rate': 0.2,
        'num_iterations':1000,
        'application': 'multiclassova',
        'num_class': 4 if each_model == 'WillingToPay' else 3,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'multi_error',
        'data_random_seed': 2,
        'nthread': 4,
        'lambda_l1': 1,
        'lambda_l2': 1,
        'early_stopping_round':200
    }

    # NOTE(review): the validation set is built from exactly the same
    # features and labels as the training set, so the logged multi_error and
    # the early-stopping decision reflect training fit, not generalization.
    # Consider a held-out split if one is available.
    lgbm_train = lgb.Dataset(phrase, onelabels)
    lgbm_val = lgb.Dataset(phrase, onelabels)
    lgbm_model = lgb.train(params,lgbm_train,valid_sets=lgbm_val, verbose_eval=5)

    print('finish training')

    # save model: each artifact goes into the task's own directory
    model_dir = save_path.format(each_model)
    # save tfidf
    _save_pickle(phrase_vectorizer, model_dir + "tfidf.pickle")
    # save linear svc
    _save_pickle(clf, model_dir + "LinearSVC.pickle")
    # save logistic
    _save_pickle(log_r, model_dir + "Logistic.pickle")
    # save lightGBM
    _save_pickle(lgbm_model, model_dir + "Lgbm.pickle")

    # Bundle the three classifiers and the vectorizer into the task wrapper
    # and persist the wrapper as "<task>.pickle".
    result = func(svc=clf, logistic=log_r, lightgbm=lgbm_model, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt')
    _save_pickle(result, model_dir + each_model + '.pickle')
    
    

CutDebt
fitting phrase
transform phrase




Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.262544
[10]	valid_0's multi_error: 0.256448
[15]	valid_0's multi_error: 0.247011
[20]	valid_0's multi_error: 0.236694
[25]	valid_0's multi_error: 0.227902
[30]	valid_0's multi_error: 0.22286
[35]	valid_0's multi_error: 0.217233
[40]	valid_0's multi_error: 0.213072
[45]	valid_0's multi_error: 0.207327
[50]	valid_0's multi_error: 0.204279
[55]	valid_0's multi_error: 0.198593
[60]	valid_0's multi_error: 0.193552
[65]	valid_0's multi_error: 0.188394
[70]	valid_0's multi_error: 0.18517
[75]	valid_0's multi_error: 0.183001
[80]	valid_0's multi_error: 0.181712
[85]	valid_0's multi_error: 0.180129
[90]	valid_0's multi_error: 0.178898
[95]	valid_0's multi_error: 0.176612
[100]	valid_0's multi_error: 0.174502
[105]	valid_0's multi_error: 0.173095
[110]	valid_0's multi_error: 0.17163
[115]	valid_0's multi_error: 0.170516
[120]	valid_0's multi_error: 0.167409
[125]	valid_0's multi_error: 0.16694
[130]	valid

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


finish training


Loading model cost 1.847 seconds.
Prefix dict has been built succesfully.


IDClassifier
fitting phrase
transform phrase
Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.152613
[10]	valid_0's multi_error: 0.130314
[15]	valid_0's multi_error: 0.109408
[20]	valid_0's multi_error: 0.102439
[25]	valid_0's multi_error: 0.0864111
[30]	valid_0's multi_error: 0.0836237
[35]	valid_0's multi_error: 0.0794425
[40]	valid_0's multi_error: 0.0766551
[45]	valid_0's multi_error: 0.0738676
[50]	valid_0's multi_error: 0.071777
[55]	valid_0's multi_error: 0.0710801
[60]	valid_0's multi_error: 0.0689895




[65]	valid_0's multi_error: 0.0682927
[70]	valid_0's multi_error: 0.0675958
[75]	valid_0's multi_error: 0.066899
[80]	valid_0's multi_error: 0.066899
[85]	valid_0's multi_error: 0.0648084
[90]	valid_0's multi_error: 0.0634146
[95]	valid_0's multi_error: 0.0627178
[100]	valid_0's multi_error: 0.0627178
[105]	valid_0's multi_error: 0.0627178
[110]	valid_0's multi_error: 0.0627178
[115]	valid_0's multi_error: 0.0627178
[120]	valid_0's multi_error: 0.0627178
[125]	valid_0's multi_error: 0.0627178
[130]	valid_0's multi_error: 0.0620209
[135]	valid_0's multi_error: 0.0620209
[140]	valid_0's multi_error: 0.0620209
[145]	valid_0's multi_error: 0.0620209
[150]	valid_0's multi_error: 0.0620209
[155]	valid_0's multi_error: 0.0620209
[160]	valid_0's multi_error: 0.0620209
[165]	valid_0's multi_error: 0.0620209
[170]	valid_0's multi_error: 0.0620209
[175]	valid_0's multi_error: 0.0620209
[180]	valid_0's multi_error: 0.0620209
[185]	valid_0's multi_error: 0.0620209
[190]	valid_0's multi_error: 0.062



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.251483
[10]	valid_0's multi_error: 0.237924
[15]	valid_0's multi_error: 0.227472
[20]	valid_0's multi_error: 0.218927
[25]	valid_0's multi_error: 0.211794
[30]	valid_0's multi_error: 0.205791
[35]	valid_0's multi_error: 0.199788
[40]	valid_0's multi_error: 0.195904
[45]	valid_0's multi_error: 0.192867
[50]	valid_0's multi_error: 0.191879
[55]	valid_0's multi_error: 0.189548
[60]	valid_0's multi_error: 0.187218
[65]	valid_0's multi_error: 0.185169
[70]	valid_0's multi_error: 0.183263
[75]	valid_0's multi_error: 0.179802
[80]	valid_0's multi_error: 0.17846
[85]	valid_0's multi_error: 0.175706
[90]	valid_0's multi_error: 0.174576
[95]	valid_0's multi_error: 0.172387
[100]	valid_0's multi_error: 0.171257
[105]	valid_0's multi_error: 0.169703
[110]	valid_0's multi_error: 0.169068
[115]	valid_0's multi_error: 0.167302
[120]	valid_0's multi_error: 0.166879
[125]	valid_0's multi_error: 0.165819
[130]	va



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.262544
[10]	valid_0's multi_error: 0.256448
[15]	valid_0's multi_error: 0.247011
[20]	valid_0's multi_error: 0.236694
[25]	valid_0's multi_error: 0.227902
[30]	valid_0's multi_error: 0.22286
[35]	valid_0's multi_error: 0.217233
[40]	valid_0's multi_error: 0.213072
[45]	valid_0's multi_error: 0.207327
[50]	valid_0's multi_error: 0.204279
[55]	valid_0's multi_error: 0.198593
[60]	valid_0's multi_error: 0.193552
[65]	valid_0's multi_error: 0.188394
[70]	valid_0's multi_error: 0.18517
[75]	valid_0's multi_error: 0.183001
[80]	valid_0's multi_error: 0.181712
[85]	valid_0's multi_error: 0.180129
[90]	valid_0's multi_error: 0.178898
[95]	valid_0's multi_error: 0.176612
[100]	valid_0's multi_error: 0.174502
[105]	valid_0's multi_error: 0.173095
[110]	valid_0's multi_error: 0.17163
[115]	valid_0's multi_error: 0.170516
[120]	valid_0's multi_error: 0.167409
[125]	valid_0's multi_error: 0.16694
[130]	valid



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.267369
[10]	valid_0's multi_error: 0.253474
[15]	valid_0's multi_error: 0.242857
[20]	valid_0's multi_error: 0.232709
[25]	valid_0's multi_error: 0.22623
[30]	valid_0's multi_error: 0.217877
[35]	valid_0's multi_error: 0.210929
[40]	valid_0's multi_error: 0.203981
[45]	valid_0's multi_error: 0.20039
[50]	valid_0's multi_error: 0.195394
[55]	valid_0's multi_error: 0.191023
[60]	valid_0's multi_error: 0.187588
[65]	valid_0's multi_error: 0.185246
[70]	valid_0's multi_error: 0.182826
[75]	valid_0's multi_error: 0.181499
[80]	valid_0's multi_error: 0.17822
[85]	valid_0's multi_error: 0.175488
[90]	valid_0's multi_error: 0.173849
[95]	valid_0's multi_error: 0.172912
[100]	valid_0's multi_error: 0.17096
[105]	valid_0's multi_error: 0.168852
[110]	valid_0's multi_error: 0.166511
[115]	valid_0's multi_error: 0.164715
[120]	valid_0's multi_error: 0.163466
[125]	valid_0's multi_error: 0.161983
[130]	valid



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.361975
[10]	valid_0's multi_error: 0.343991
[15]	valid_0's multi_error: 0.33206
[20]	valid_0's multi_error: 0.320599
[25]	valid_0's multi_error: 0.308551
[30]	valid_0's multi_error: 0.299794
[35]	valid_0's multi_error: 0.289744
[40]	valid_0's multi_error: 0.282868
[45]	valid_0's multi_error: 0.277931
[50]	valid_0's multi_error: 0.272054
[55]	valid_0's multi_error: 0.266236
[60]	valid_0's multi_error: 0.260829
[65]	valid_0's multi_error: 0.257185
[70]	valid_0's multi_error: 0.254129
[75]	valid_0's multi_error: 0.248545
[80]	valid_0's multi_error: 0.245313
[85]	valid_0's multi_error: 0.242668
[90]	valid_0's multi_error: 0.241552
[95]	valid_0's multi_error: 0.238025
[100]	valid_0's multi_error: 0.235909
[105]	valid_0's multi_error: 0.234734
[110]	valid_0's multi_error: 0.231443
[115]	valid_0's multi_error: 0.229974
[120]	valid_0's multi_error: 0.228681
[125]	valid_0's multi_error: 0.226976
[130]	va

In [3]:
def _load_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails).

    WARNING: ``pickle.load`` can execute arbitrary code embedded in the file;
    only use this on model files produced by this project.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Reload the per-task wrapper objects written under ../../savedModel/<task>/.
idc = _load_pickle("../../savedModel/IDClassifier/IDClassifier.pickle")
cutd = _load_pickle("../../savedModel/CutDebt/CutDebt.pickle")
ifk = _load_pickle("../../savedModel/IfKnowDebtor/IfKnowDebtor.pickle")
will = _load_pickle("../../savedModel/WillingToPay/WillingToPay.pickle")
inst = _load_pickle("../../savedModel/Installment/Installment.pickle")
setd = _load_pickle("../../savedModel/SetDueDay/SetDueDay.pickle")

In [21]:
# Smoke test: classify one raw sentence with the Installment wrapper that was
# reloaded from ../../savedModel/Installment/Installment.pickle.
# The recorded output for this input is (0, [0, 0.9435...]).
# NOTE(review): presumably (label, [label, confidence]) - confirm against the
# classify() implementation in Installment_py.
inst.classify('我也没有啊')

(0, [0, 0.94353250383033882])

In [20]:
# NOTE(review): this import sits mid-notebook and the cell's execution count
# (20) precedes the cell shown before it (21); on a clean Restart & Run All it
# belongs with the other imports in the first cell.
import jieba

# Segment the same test sentence with jieba (cut_all=False, i.e. full mode
# disabled) and join the tokens with single spaces; the recorded result is
# '我 也 没有 啊'.
# NOTE(review): presumably this mirrors the space-separated format of the
# `split_text` training column - confirm against the data preparation code.
seg_list = jieba.cut('我也没有啊', cut_all=False)
" ".join(seg_list)

'我 也 没有 啊'