In [101]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB


import pickle
import sys,os
from all_model_py import CutDebt, IDClassifier, IfKnowDebtor, Installment, WillingToPay, ConfirmLoan


# load data

In [110]:
def sub_df(df, sets, target='label'):
    """Return the rows of ``df`` whose ``target`` value is one of ``sets``.

    Rows are grouped in the iteration order of ``sets`` (every row matching the
    first value, then every row matching the second, ...). Within a group the
    original row order and index labels of ``df`` are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    sets : iterable
        Values of the ``target`` column to keep.
    target : str, default 'label'
        Name of the column compared against each value in ``sets``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected rows. If ``sets`` is empty the result has
        zero rows but keeps the columns of ``df``.
    """
    # Collect every slice first and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies all accumulated rows on each pass and
    # relies on concatenating with an empty, column-less DataFrame.
    pieces = [df[df[target] == each] for each in sets]
    if not pieces:
        # pd.concat raises on an empty list; hand back an empty frame that
        # still has the expected schema.
        return df.iloc[0:0].copy()
    return pd.concat(pieces)

# Registry of classifier wrapper classes keyed by model name. The key is also
# used by the training loop as the data/savedModel sub-directory name and as a
# column name in ``other_matrix``. Insertion order is the training order.
model_list = dict(
    CutDebt=CutDebt,
    IDClassifier=IDClassifier,
    IfKnowDebtor=IfKnowDebtor,
    Installment=Installment,
    ConfirmLoan=ConfirmLoan,
    WillingToPay=WillingToPay,
)

# Name of the class-label column, and the per-model output directory template.
target = 'label'
save_path = '../../savedModel/{}/'

# Shared "others" utterances, plus the matrix that tells, per model, which of
# their labels are selected for training (rows where the model's column == 0).
others = pd.read_csv('../../data/others/cleaned_mock_up_data.csv')
other_matrix = pd.read_csv('../../data/others/strategy_mat.csv')
# Fixed seed so the shuffle and the seeded solvers produce the same models on
# every Restart & Run All.
SEED = 42

# Train and persist one ensemble (calibrated LinearSVC + logistic regression +
# multinomial naive Bayes on shared TF-IDF features) per entry in model_list.
for model in model_list:

    # Load the pre-trained "other" component first so a missing file fails
    # before any training time is spent. The file is closed right after reading.
    other_pickle = '../../savedModel/others/{}/{}_other.pickle'.format(model, model)
    with open(other_pickle, 'rb') as fh:
        other = pickle.load(fh)

    df = pd.read_csv('../../data/{}/cleaned_mock_up_data.csv'.format(model))
    # The largest label id in the model's own data is treated as the "other" class.
    other_label = df[target].max()
    # filter out other label
    df = df[df[target] != other_label]
    # get available other labels: rows of the strategy matrix where this
    # model's column is 0 name the labels of `others` to pull in
    other_set = set(other_matrix.loc[other_matrix[model] == 0, 'label'].values)
    # Relabel the borrowed rows as this model's "other" class and align the
    # text column name with the model's own data. Built without mutating the
    # frame returned by sub_df.
    ava_others = (
        sub_df(others, other_set)
        .assign(**{target: other_label})
        .rename(columns={'text': 'split_text'})
    )
    df = pd.concat([df, ava_others], sort=True)
    # Shuffle rows; seeded so the cross-validation folds used for calibration
    # are the same on every run.
    df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
    print('=====  {} ======='.format(model))
    print(df[target].value_counts())
    print('begin training!')

    # get tfidf: word 1- to 3-grams, sublinear tf, vocabulary capped at 100k
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                        strip_accents='unicode',
                                        max_features=100000,
                                        analyzer='word',
                                        sublinear_tf=True,
                                        token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # linear svc, wrapped so it can output calibrated class probabilities
    l_svc = LinearSVC(random_state=SEED)
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, df[target])

    # logistic
    log_r = LogisticRegression(random_state=SEED)
    log_r.fit(phrase, df[target])

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df[target])
    print('finish training')

    # Bundle the fitted pieces into the project's wrapper class for this model.
    # NOTE(review): jieba_path is presumably the user dictionary the wrapper
    # uses for word segmentation -- confirm against all_model_py.
    result = model_list[model](svc=lsvc, logistic=log_r, nb=naive_b,
                               tfidf=phrase_vectorizer, other=other,
                               jieba_path='../WordCut/userdict.txt')

    # Make sure the target directory exists, then write and close the pickle
    # so the data is flushed to disk before the next cell reads it back.
    save_dir = save_path.format(model)
    os.makedirs(save_dir, exist_ok=True)
    with open(save_dir + model + '.pickle', 'wb') as fh:
        pickle.dump(result, fh)
    
    


0    4105
2    2819
1    2144
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2590
0    1446
1    1296
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2590
0    1483
1    1177
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
0    4105
2    2819
1    2144
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2819
0     580
1     553
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
3    2191
0    2106
1    2095
2    2069
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training


In [103]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code -- only load files produced
    # by this project's own training cell.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Reload each persisted model wrapper from disk for interactive spot checks.
idc = _load_pickle("../../savedModel/IDClassifier/IDClassifier.pickle")
cutd = _load_pickle("../../savedModel/CutDebt/CutDebt.pickle")
ifk = _load_pickle("../../savedModel/IfKnowDebtor/IfKnowDebtor.pickle")
will = _load_pickle("../../savedModel/WillingToPay/WillingToPay.pickle")
inst = _load_pickle("../../savedModel/Installment/Installment.pickle")
conf = _load_pickle("../../savedModel/ConfirmLoan/ConfirmLoan.pickle")

In [104]:
# Set up jieba and register the project's user dictionary with it.
import jieba

jieba_path = '../WordCut/userdict.txt'
jieba.load_userdict(jieba_path)


In [105]:
# Segment a sample utterance with jieba (cut_all=False) and join the resulting
# tokens with single spaces; the bare name on the last line displays the result.
tokens = jieba.cut('我草', cut_all=False)
sentence = ' '.join(tokens)
sentence

'我草'

In [107]:
# Spot-check the reloaded IDClassifier model on one sample utterance.
idc.classify('我日你大爷')

{'label': 110, 'pred_prob': array([[0.04490071, 0.07217199, 0.88292729],
        [0.1048168 , 0.14537581, 0.74980739],
        [0.11577709, 0.21529938, 0.66892353]]), 'av_pred': array([0.0884982 , 0.14428239, 0.76721941])}

In [108]:
# Spot-check the reloaded IfKnowDebtor model on one sample utterance.
ifk.classify('我日')

{'label': 104, 'pred_prob': array([[0.09701567, 0.05465708, 0.84832725],
        [0.20109009, 0.09799387, 0.70091604],
        [0.22235734, 0.14651043, 0.63113222]]), 'av_pred': array([0.1734877 , 0.09972046, 0.72679184])}

In [109]:
# Spot-check the reloaded ConfirmLoan model on one sample utterance.
conf.classify('我只借过5000')

{'label': 1, 'pred_prob': array([[0.00619887, 0.85350181, 0.14029932],
        [0.06810588, 0.39192211, 0.53997201],
        [0.0915323 , 0.53349763, 0.37497007]]), 'av_pred': array([0.05527902, 0.59297385, 0.35174713]), 'time_extract': []}