In [1]:
import sys,os
sys.path.append('../../../Lib/')
from load_cleaned_data import load_data

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.826 seconds.
Prefix dict has been built succesfully.


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
import pickle

import sys,os
models_path = '../../../classifier/models/ml_models/'
sys.path.append(models_path)
from ml import *
import pandas as pd



# Registry of main-flow models: name -> wrapper class (the classes are
# presumably provided by `from ml import *` above).
# The names are also used as lookup keys into the cleaned data sets and as
# components of the pickle paths, and the loop below iterates them in this
# insertion order.
model_list = {
    'IDClassifier': IDClassifier,
    'CutDebt': CutDebt,
    'WillingToPay': WillingToPay,
    'IfKnowDebtor': IfKnowDebtor,
    'Installment': Installment,
    'ConfirmLoan': ConfirmLoan,
}


def train_other_model(other_data, save_path, model):
    """Train, wrap and pickle the "other flow" classifier for one model.

    A word-level TF-IDF vectorizer is fitted on ``other_data.text`` and three
    classifiers (calibrated LinearSVC, LogisticRegression, MultinomialNB) are
    trained on the resulting matrix against ``other_data.label``. Everything
    is bundled into a ``ClassifierOther`` wrapper, which is pickled to disk
    and returned.

    Args:
        other_data: Object exposing ``.text`` (documents) and ``.label``
            (targets); the notebook passes a pandas DataFrame.
        save_path: Format string with two ``{}`` placeholders; both are
            filled with ``model`` to build the output file path.
        model: Model name used to fill ``save_path``.

    Returns:
        The fitted ``ClassifierOther`` wrapper that was written to disk.

    Raises:
        OSError: If the output file cannot be opened for writing (for
            example when the target directory does not exist).
    """
    # Word 1- to 3-grams. The custom token_pattern keeps single-character
    # tokens, which scikit-learn's default pattern would drop.
    phrase_vectorizer_other = TfidfVectorizer(
        ngram_range=(1, 3),
        strip_accents='unicode',
        max_features=100000,
        analyzer='word',
        sublinear_tf=True,
        token_pattern=r'\w{1,}',
    )

    print('fitting phrase')
    phrase_vectorizer_other.fit(other_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer_other.transform(other_data.text)

    # Linear SVC, wrapped in CalibratedClassifierCV so that it can produce
    # probability estimates (LinearSVC alone has no predict_proba).
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, other_data.label)

    # Logistic regression
    log_r = LogisticRegression()
    log_r.fit(phrase, other_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, other_data.label)

    print('finish training others')

    # Bundle the vectorizer and the three classifiers; the label set is taken
    # from the fitted calibrated SVC.
    other_model = ClassifierOther(
        svc=lsvc,
        logistic=log_r,
        nb=naive_b,
        tfidf=phrase_vectorizer_other,
        jieba_path='../WordCut/userdict.txt',
        possible_label=lsvc.classes_,
    )

    # Saving: the context manager guarantees the file is flushed and closed
    # even if pickling fails, instead of leaking the handle until GC.
    evl_path = save_path.format(model, model)
    print('saving to path: {}'.format(evl_path))
    with open(evl_path, "wb") as out_file:
        pickle.dump(other_model, out_file)
    return other_model
    
    
def train_main_model(df, save_path, model, other_model):
    """Train, wrap and pickle the "main flow" classifier for one model.

    A word-level TF-IDF vectorizer is fitted on ``df.split_text`` and three
    classifiers (calibrated LinearSVC, LogisticRegression, MultinomialNB) are
    trained on the resulting matrix against ``df.label``. They are bundled,
    together with ``other_model``, into the wrapper class registered under
    ``model`` in ``model_list``; the wrapper is pickled to disk and returned.

    Args:
        df: Object exposing ``.split_text`` (documents) and ``.label``
            (targets); the notebook passes a pandas DataFrame.
        save_path: Format string with two ``{}`` placeholders; both are
            filled with ``model`` to build the output file path.
        model: Key into ``model_list`` selecting the wrapper class; also used
            to fill ``save_path``.
        other_model: Already-trained "other flow" model handed to the wrapper
            as its ``other`` argument.

    Returns:
        The fitted wrapper instance that was written to disk.

    Raises:
        KeyError: If ``model`` is not a key of ``model_list``.
        OSError: If the output file cannot be opened for writing (for
            example when the target directory does not exist).
    """
    # Word 1- to 3-grams. The custom token_pattern keeps single-character
    # tokens, which scikit-learn's default pattern would drop.
    phrase_vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        strip_accents='unicode',
        max_features=100000,
        analyzer='word',
        sublinear_tf=True,
        token_pattern=r'\w{1,}',
    )

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # Linear SVC, wrapped in CalibratedClassifierCV so that it can produce
    # probability estimates (LinearSVC alone has no predict_proba).
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, df.label)

    # Logistic regression
    log_r = LogisticRegression()
    log_r.fit(phrase, df.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df.label)
    print('finish training')

    main_model = model_list[model](
        svc=lsvc,
        logistic=log_r,
        nb=naive_b,
        tfidf=phrase_vectorizer,
        other=other_model,
        jieba_path='../WordCut/userdict.txt',
    )

    # Announce the target first (same order as train_other_model) so the path
    # is visible even if the dump fails, then write through a context manager
    # so the file is always flushed and closed.
    evl_path = save_path.format(model, model)
    print('saving to path: {}'.format(evl_path))
    with open(evl_path, "wb") as out_file:
        pickle.dump(main_model, out_file)
    return main_model
    
    

In [3]:
# Load the cleaned main-flow and other-flow data sets; both are indexed by
# model name below.
clean_data_main, clean_data_other = load_data(load_fb=True)

# Output path templates: both placeholders are filled with the model name.
save_path_other = '../../../classifier/saved_model/{}/other_flow/{}.pkl'
save_path_main = '../../../classifier/saved_model/{}/main_flow/{}.pkl'

for each_model in model_list:
    # Step 1: train and pickle the other-flow model for this classifier.
    other_model = train_other_model(
        other_data=clean_data_other[each_model],
        save_path=save_path_other,
        model=each_model,
    )

    # Step 2: relabel every other-flow sample with one extra class id
    # (largest main-flow label + 1) and align its text column name with the
    # main-flow frame.
    main_samples = clean_data_main[each_model]
    other_label = int(max(set(main_samples.label)) + 1)
    ava_others = (
        clean_data_other[each_model]
        .rename(columns={'text': 'split_text'})
        .assign(label=other_label)
    )

    # Step 3: merge both sources and shuffle with a fixed seed so the row
    # order is reproducible.
    df_main = (
        pd.concat([main_samples, ava_others], sort=True)
        .sample(frac=1, random_state=6)
        .reset_index(drop=True)
    )

    print('=====  {} ======='.format(each_model))
    print(df_main.label.value_counts())

    # Step 4: train and pickle the main-flow model, which embeds other_model.
    clf = train_main_model(
        df=df_main,
        save_path=save_path_main,
        model=each_model,
        other_model=other_model,
    )
    print('\n')

100%|██████████| 6/6 [00:00<00:00, 45.52it/s]


CutDebt
finish cutting words
1    1434
0    1364
Name: label, dtype: int64
109    1376
106     997
104     907
103     552
108     355
102     266
107     133
110      33
Name: label, dtype: int64
IDClassifier
finish cutting words
1    533
0    339
Name: label, dtype: int64
109    1397
104     952
103     563
107     366
Name: label, dtype: int64
IfKnowDebtor
finish cutting words
0    894
1    519
Name: label, dtype: int64
109    1393
104     952
103     563
107     365
Name: label, dtype: int64
Installment
finish cutting words
1    1368
0    1364
Name: label, dtype: int64
109    1376
106     998
104     907
103     553
108     355
102     277
107     133
110      33
Name: label, dtype: int64
WillingToPay
finish cutting words
1    1947
0     669
Name: label, dtype: int64
109    1375
106     988
104     905
103     551
108     351
102     334
105     202
107     133
Name: label, dtype: int64
ConfirmLoan
finish cutting words
0    1157
1     609
Name: label, dtype: int64
109    1375
104  