In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB


import sys,os
models_path = '../../../classifier/models/ml_models/'
sys.path.append(models_path)
from ml import *



# Registry mapping each classifier's name to the class used to build its
# main-flow model wrapper (looked up by key in train_main_model).
model_list = {
    'IDClassifier': IDClassifier,
    'CutDebt': CutDebt,
    'WillingToPay': WillingToPay,
    'IfKnowDebtor': IfKnowDebtor,
    'Installment': Installment,
    'ConfirmLoan': ConfirmLoan,
}

../../../classifier/models/ml_models/../time_pattern/../../env/


In [7]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
jieba.load_userdict("../../code/WordCut/userdict.txt")

import gc
import pickle
# Directory that holds the cleaned "other" data and the strategy matrix.
path = '../../data/others/'

# Load the cleaned "other" data; all three files are read from `path`.
other_non109 = pd.read_csv(path + 'cleaned_mock_up_data_non109.csv')
other_109 = pd.read_csv(path + 'cleaned_mock_up_data_109.csv')
strategy_mat = pd.read_csv(path + 'strategy_mat.csv', encoding='utf8')

# Show the (rows, columns) shape of each frame, one per line.
print(other_non109.shape, other_109.shape, strategy_mat.shape, sep='\n')

(3950, 2)
(1408, 8)
(16, 9)


In [8]:
def get_other_data(df_non109,df_109,strategy_mat,classifier):
    """Assemble the "other" training frame for one classifier.

    Two row selections are stacked on top of each other:

    * rows of ``df_non109`` whose ``label`` is one of the ``label`` values of
      the ``strategy_mat`` rows where ``strategy_mat[classifier] == 0``;
    * rows of ``df_109`` where ``df_109[classifier] == 0``.

    Parameters
    ----------
    df_non109 : pandas.DataFrame
        Must contain a ``label`` column.
    df_109 : pandas.DataFrame
        Must contain a column named ``classifier``.
    strategy_mat : pandas.DataFrame
        Must contain a ``label`` column and a column named ``classifier``.
    classifier : str
        Column name used to select rows in ``strategy_mat`` and ``df_109``.

    Returns
    -------
    pandas.DataFrame
        The concatenation of both selections with a fresh 0..n-1 index and
        the union of their columns in sorted order (``sort=True``).
    """
    # Labels allowed for this classifier: strategy rows flagged 0.
    possible_label = strategy_mat.loc[strategy_mat[classifier] == 0, 'label'].unique()

    # Vectorized membership test; always yields a boolean mask, even when
    # df_non109 has no rows.
    train_data_non109 = df_non109[df_non109['label'].isin(possible_label)]
    train_data_109 = df_109[df_109[classifier] == 0]

    return pd.concat([train_data_non109, train_data_109], ignore_index=True, sort=True)

def train_other_model(other_data,save_path,model):
    """Train the "other" classifier ensemble, pickle it, and return it.

    A TF-IDF vectorizer (word 1- to 3-grams, at most 100000 features,
    sublinear tf) is fitted on ``other_data.text``. Three classifiers are then
    fitted on the resulting matrix against ``other_data.label``: a calibrated
    LinearSVC, a LogisticRegression and a MultinomialNB. All four objects are
    wrapped in a ``ClassifierOther`` instance, which is pickled to disk.

    Parameters
    ----------
    other_data : object exposing ``text`` and ``label`` attributes
        Training texts and their labels (a pandas DataFrame in this notebook).
    save_path : str
        Format string with two ``{}`` placeholders; both are filled with
        ``model`` to build the pickle destination.
    model : str
        Model name substituted into ``save_path``.

    Returns
    -------
    ClassifierOther
        The wrapper that was written to disk.
    """
    phrase_vectorizer_other = TfidfVectorizer(ngram_range=(1,3),
                                strip_accents='unicode', 
                                max_features=100000, 
                                analyzer='word',
                                sublinear_tf=True,
                                token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer_other.fit(other_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer_other.transform(other_data.text)

    # linear svc, wrapped so that it can output calibrated probabilities
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc) 
    lsvc.fit(phrase, other_data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, other_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, other_data.label)

    print('finish training others')

    # other wrapper; the label set is taken from the fitted calibrated SVC
    other_model = ClassifierOther(svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer_other, jieba_path='../WordCut/userdict.txt',possible_label=lsvc.classes_)

    # Saving: use a context manager so the file is flushed and closed even if
    # pickling raises.
    evl_path = save_path.format(model,model)
    print('saving to path: {}'.format(evl_path))
    with open(evl_path, "wb") as pickle_file:
        pickle.dump(other_model, pickle_file)
    return other_model
    
    
def train_main_model(df,save_path,model,other_model):
    """Train the main-flow classifier ensemble, pickle it, and return it.

    A TF-IDF vectorizer (word 1- to 3-grams, at most 100000 features,
    sublinear tf) is fitted on ``df.split_text``. Three classifiers are then
    fitted on the resulting matrix against ``df.label``: a calibrated
    LinearSVC, a LogisticRegression and a MultinomialNB. They are wrapped,
    together with ``other_model``, in the class registered under ``model`` in
    the module-level ``model_list``, and the wrapper is pickled to disk.

    Parameters
    ----------
    df : object exposing ``split_text`` and ``label`` attributes
        Training texts and their labels (a pandas DataFrame in this notebook).
    save_path : str
        Format string with two ``{}`` placeholders; both are filled with
        ``model`` to build the pickle destination.
    model : str
        Key into ``model_list``; also substituted into ``save_path``.
    other_model : object
        Passed through to the wrapper as its ``other`` argument.

    Returns
    -------
    object
        The ``model_list[model]`` instance that was written to disk.

    Raises
    ------
    KeyError
        If ``model`` is not a key of ``model_list``.
    """
    # get tfidf
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # linear svc, wrapped so that it can output calibrated probabilities
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc) 
    lsvc.fit(phrase, df.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, df.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df.label)
    print('finish training')

    main_model = model_list[model](svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, other=other_model,  jieba_path='../WordCut/userdict.txt')
    evl_path = save_path.format(model,model)
    # Context manager guarantees the pickle file is flushed and closed, even
    # if pickling raises.
    with open(evl_path, "wb") as pickle_file:
        pickle.dump(main_model, pickle_file)
    print('saving to path: {}'.format(evl_path))
    return main_model
    
    

# Train Other Model + Main Model

In [10]:
# Pickle destinations; both `{}` slots are filled with the model name.
save_path_other = '../../../classifier/saved_model/{}/other_flow/{}.pkl'
save_path_main = '../../../classifier/saved_model/{}/main_flow/{}.pkl'

for each_model in model_list:
    # "Other" data for this classifier, shuffled with a fixed seed.
    other_data = get_other_data(
        other_non109, other_109, strategy_mat, each_model
    ).sample(frac=1, random_state=19)

    other_model = train_other_model(other_data, save_path_other, each_model)

    # train main: the "other" rows join the main data under one extra class
    # whose id is one past the largest existing label.
    df_main = pd.read_csv('../../data/{}/cleaned_mock_up_data.csv'.format(each_model))
    other_label = int(max(set(df_main.label)) + 1)
    ava_others = (
        other_data
        .rename(columns={'text': 'split_text'})
        .assign(label=other_label)
    )
    df_main = (
        pd.concat([df_main, ava_others], sort=True)
        .sample(frac=1, random_state=6)
        .reset_index(drop=True)
    )

    print('=====  {} ======='.format(each_model))
    print(df_main.label.value_counts())
    clf = train_main_model(df_main, save_path_main, each_model, other_model)
    print('\n')

fitting phrase
transform phrase
finish training others
saving to path: ../../../classifier/saved_model/IDClassifier/other_flow/IDClassifier.pkl
2    3486
1    1234
0    1133
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
saving to path: ../../../classifier/saved_model/IDClassifier/main_flow/IDClassifier.pkl


fitting phrase
transform phrase
finish training others
saving to path: ../../../classifier/saved_model/CutDebt/other_flow/CutDebt.pkl
2    4698
0    3623
1    2327
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai
saving to path: ../../../classifier/saved_model/CutDebt/main_flow/CutDebt.pkl


fitting phrase
transform phrase
finish training others
saving to path: ../../../classifier/saved_model/WillingToPay/other_flow/WillingToPay.pkl
3    4576
1    2620
0    1778
2    1038
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai
savi