In [4]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB


import pickle
import sys,os
from all_model_py import CutDebt, IDClassifier, IfKnowDebtor, Installment, WillingToPay, ConfirmLoan


# load data

In [5]:
def sub_df(df, sets, target='label'):
    """Return the rows of ``df`` whose ``target`` column equals any value in ``sets``.

    Rows are grouped in the iteration order of ``sets`` (all rows matching the
    first value, then all rows matching the second, ...), and each row keeps its
    original index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from; it is not modified.
    sets : iterable
        Values of ``df[target]`` to keep.
    target : str, default 'label'
        Name of the column that is compared against each value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected rows. When ``sets`` is empty, an empty
        frame that still carries the columns of ``df`` is returned.
    """
    # Collect every selection first and concatenate once: calling pd.concat
    # inside the loop re-copies the accumulated rows on each iteration, and
    # seeding it with an empty DataFrame() is deprecated in recent pandas.
    pieces = [df[df[target] == each] for each in sets]
    if not pieces:
        # pd.concat([]) raises; return an empty, independent frame with the
        # same columns so callers can still assign to / rename its columns.
        return df.iloc[0:0].copy()
    return pd.concat(pieces)

# Registry of the classes to build, in training order. Each key is reused below
# as the per-model data directory, the column looked up in `other_matrix`, and
# the output folder / file name of the saved pickle.
model_list = dict(
    CutDebt=CutDebt,
    IDClassifier=IDClassifier,
    IfKnowDebtor=IfKnowDebtor,
    Installment=Installment,
    ConfirmLoan=ConfirmLoan,
    WillingToPay=WillingToPay,
)

# Column that holds the class label in the loaded frames.
target = 'label'
# Template for the directory a trained model is written to ({} = model name).
save_path = '../../savedModel/{}/'

# Shared pool of "other" samples, plus the matrix whose per-model columns are
# used to pick which of those samples each model may use.
others = pd.read_csv('../../data/others/cleaned_mock_up_data.csv')
other_matrix = pd.read_csv('../../data/others/strategy_mat.csv')
# Fixed seed so the row shuffle and the LinearSVC solver give the same models
# on every Restart & Run All.
RANDOM_SEED = 42

# Train and save one classifier bundle per entry of `model_list`.
for model in model_list:
    df = pd.read_csv('../../data/{}/cleaned_mock_up_data.csv'.format(model))
    # Alternative input file: cleaned_mock_up_data_no_dub.csv (same directory).

    # The highest label value in the model's own data is treated as its
    # "other" class; those rows are dropped and rebuilt from the shared pool.
    other_label = df[target].max()
    df = df[df[target] != other_label]

    # Labels of the shared pool that are marked 0 for this model in the
    # strategy matrix are the ones available as "other" samples.
    other_set = set(other_matrix.loc[other_matrix[model] == 0, target].values)
    ava_others = sub_df(others, other_set, target=target)
    ava_others[target] = other_label
    ava_others = ava_others.rename(columns={'text': 'split_text'})

    # Merge, then shuffle the rows (seeded) and renumber the index.
    df = pd.concat([df, ava_others], sort=True)
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
    print('=====  {} ======='.format(model))
    print(df[target].value_counts())
    print('begin training!')

    # TF-IDF over word 1- to 3-grams; the token pattern keeps single-character
    # tokens, which the default pattern (2+ characters) would drop.
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                        strip_accents='unicode',
                                        max_features=100000,
                                        analyzer='word',
                                        sublinear_tf=True,
                                        token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # Linear SVC wrapped in a calibrator so it exposes predict_proba.
    l_svc = LinearSVC(random_state=RANDOM_SEED)
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, df[target])

    # Logistic regression
    log_r = LogisticRegression()
    log_r.fit(phrase, df[target])

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df[target])
    print('finish training')

    # Load the previously saved `other` object for this model.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles produced by this project.
    other_path = '../../savedModel/others/{}/{}_other.pickle'.format(model, model)
    with open(other_path, 'rb') as other_file:
        other = pickle.load(other_file)

    result = model_list[model](svc=lsvc, logistic=log_r, nb=naive_b,
                               tfidf=phrase_vectorizer, other=other,
                               jieba_path='../WordCut/userdict.txt')

    # Create the output directory if needed and close the file deterministically
    # so the pickle is fully flushed before it is read back.
    out_dir = save_path.format(model)
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + model + '.pickle', 'wb') as out_file:
        pickle.dump(result, out_file)
    
    


2    2819
0    1921
1    1603
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2590
0     964
1     810
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2590
0    1097
1     714
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2819
0    1921
1    1494
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
2    2819
0     556
1     526
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
3    2191
1    1527
2    1500
0    1373
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training


In [6]:
def _load_saved_model(name):
    """Unpickle ``../../savedModel/<name>/<name>.pickle`` and return the object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes.

    NOTE(review): pickle.load executes arbitrary code from the file; only load
    pickles produced by this project.
    """
    path = "../../savedModel/{0}/{0}.pickle".format(name)
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Reload every model written by the training cell.
idc = _load_saved_model('IDClassifier')
cutd = _load_saved_model('CutDebt')
ifk = _load_saved_model('IfKnowDebtor')
will = _load_saved_model('WillingToPay')
inst = _load_saved_model('Installment')
conf = _load_saved_model('ConfirmLoan')

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
import jieba
# Same user-dictionary path that is passed as `jieba_path` to the model classes
# in the training cell.
jieba_path='../WordCut/userdict.txt'
# Register the custom dictionary with jieba before segmenting any text.
jieba.load_userdict(jieba_path)


In [8]:
# Segment a sample utterance with jieba (cut_all=False, i.e. not full mode) and
# join the tokens with single spaces; the last line displays the result.
sample_text = '我草'
sentence = ' '.join(jieba.cut(sample_text, cut_all=False))
sentence

'我草'

In [9]:
# Smoke-test the reloaded IDClassifier bundle on one raw utterance. The recorded
# result is a dict with 'label', 'pred_prob' (3x3 array) and 'av_pred' (the
# column-wise mean of 'pred_prob').
# NOTE(review): presumably one 'pred_prob' row per underlying model — confirm.
idc.classify('我日你大爷')

{'label': 110, 'pred_prob': array([[0.04567076, 0.07822013, 0.87610911],
        [0.10065144, 0.13971564, 0.75963292],
        [0.09287949, 0.15289799, 0.75422252]]), 'av_pred': array([0.0797339 , 0.12361125, 0.79665485])}

In [10]:
# Smoke-test the reloaded IfKnowDebtor bundle on one raw utterance; the recorded
# result has the same 'label' / 'pred_prob' / 'av_pred' keys.
ifk.classify('我日')

{'label': 104, 'pred_prob': array([[0.08066333, 0.06340724, 0.85592943],
        [0.18250349, 0.09924799, 0.71824852],
        [0.18442428, 0.10575665, 0.70981908]]), 'av_pred': array([0.14919703, 0.08947063, 0.76133234])}

In [11]:
# Smoke-test the reloaded ConfirmLoan bundle on one raw utterance; besides
# 'label', 'pred_prob' and 'av_pred', the recorded result also carries a
# 'time_extract' list (empty for this input).
conf.classify('我只借过5000')

{'label': 1, 'pred_prob': array([[0.00473095, 0.87012647, 0.12514258],
        [0.06828095, 0.38884231, 0.54287673],
        [0.09181642, 0.51516007, 0.39302351]]), 'av_pred': array([0.05494278, 0.59137628, 0.35368094]), 'time_extract': []}