In [1]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

from textblob import TextBlob
from textblob.translate import NotTranslated
from multiprocessing import Pool
from itertools import repeat
from tqdm import tqdm
import jieba
jieba.load_userdict("../WordCut/userdict.txt")

import gc
# from googleapiclient.discovery import build
import sys,os
# Names of the classifiers handled by this notebook. Each name is used further
# down as a data sub-directory ('../../data/<name>/') and as a column of the
# strategy matrix.
# NOTE: `model_list` is rebound to a dict (name -> class) in the cell that
# imports the sklearn/ml classes, so later loops iterate over that dict's keys.
model_list = ['CutDebt','IDClassifier','IfKnowDebtor','Installment','WillingToPay','ConfirmLoan',]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.918 seconds.
Prefix dict has been built succesfully.


In [2]:
# Make the project-local `time_pattern` module importable via a relative path
# (relative to the notebook's working directory).
sys.path.append('../../../classifier/models/time_pattern/')
from  time_pattern import TimePattern
# Shared TimePattern instance; cut_words() calls t.remove_time() on every text.
t = TimePattern()



def cut_words(text):
    """Strip time expressions from ``text`` and segment it with jieba.

    Steps, in order:
      1. every space character is replaced by ``'TIMESERIES '``;
      2. the result is passed through ``t.remove_time`` (the module-level
         ``TimePattern`` instance);
      3. the remaining text is segmented with ``jieba.cut`` in non-full mode
         (``cut_all=False``).

    Returns the segments joined by single spaces.

    NOTE(review): step 1 rewrites *every* plain space in the sentence; it is
    unclear from this notebook whether the pattern was meant to be a different
    character -- confirm with the author.
    """
    marked = re.sub(r' ', 'TIMESERIES ', text)
    without_time = t.remove_time(marked)
    segments = jieba.cut(without_time, cut_all=False)
    return " ".join(segments)

def clean(text):
    """Replace punctuation characters in ``text`` with single spaces.

    Every ASCII punctuation character (``string.punctuation``) and a fixed set
    of extra symbols (curly quotes, the full-width comma, etc.) is replaced by
    one space each. All other characters are left untouched, so runs of spaces
    are *not* collapsed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (same length as the input).
    """
    # string.punctuation must be escaped before being placed inside [...]:
    # interpolated raw, its ``\]`` is parsed as an escaped ``]``, which means the
    # backslash itself never becomes a member of the class and stays in the text.
    pattern = f'([{re.escape(string.punctuation)}“”¨«»®´·º ½¾¿¡§£₤‘’，])'
    # (The former ``' '.join(text.split(' '))`` round-trip was an identity
    # operation and has been dropped.)
    return re.sub(pattern, ' ', text)

def clean_label(label):
    """Coerce a raw label value to a built-in ``int`` (via ``int()``)."""
    label_as_int = int(label)
    return label_as_int


def load_others(classifier,
                label_list,
                other_fe = ['text','label'],
                other_path = '../../data/others/labels/{}/mock_up_data_new.csv'):
    """Collect the "other" samples that are usable for one classifier.

    For every label in ``label_list`` the CSV at ``other_path.format(label)``
    is read, the rows whose ``classifier`` column equals 0 are kept, and only
    the ``other_fe`` columns are retained. All kept rows are stacked into one
    frame with a fresh 0..n-1 index.

    Args:
        classifier: column name to filter on, eg, CutDebt
        label_list: labels used to fill ``other_path``,
            eg, [102, 103, 104, 106, 107, 108, 109, 110]
        other_fe: columns to keep from each CSV.
        other_path: path template with one ``{}`` placeholder for the label.

    Returns:
        A DataFrame with the ``other_fe`` columns. When ``label_list`` is
        empty, an empty DataFrame that still carries those columns.
    """
    # Work on a private copy so the shared default list is never exposed.
    columns = list(other_fe)

    # Gather the per-label slices first and concatenate once: growing a frame
    # with pd.concat inside the loop is quadratic, and seeding it with an empty
    # pd.DataFrame() is deprecated behaviour in recent pandas versions.
    frames = []
    for label in label_list:
        df_load = pd.read_csv(other_path.format(label))
        frames.append(df_load.loc[df_load[classifier] == 0, columns])

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

Time Zone is set from ENV: Asia/Shanghai


# Combine Data

In [3]:
# Columns kept from every per-model training CSV.
features = ['label','split_text']
path = '../../data/{}/'

# model name -> DataFrame restricted to `features`
ori_data_main = {}
for each_model in tqdm(model_list):
    csv_file = path.format(each_model) + 'mock_up_data_clean_new.csv'
    loaded = pd.read_csv(csv_file, encoding='utf8')
    ori_data_main[each_model] = loaded[features]

# Cross-feed the label-0 rows between CutDebt and Installment: each of the two
# datasets additionally receives the other one's label-0 rows. Both slices are
# taken before either dataset is extended.
is_cut_zero = ori_data_main['CutDebt'].label == 0
is_ins_zero = ori_data_main['Installment'].label == 0
cut_0 = ori_data_main['CutDebt'].loc[is_cut_zero].copy()
ins_0 = ori_data_main['Installment'].loc[is_ins_zero].copy()

ori_data_main['CutDebt'] = pd.concat(
    [ori_data_main['CutDebt'], ins_0],
    ignore_index=True,
)
ori_data_main['Installment'] = pd.concat(
    [ori_data_main['Installment'], cut_0],
    ignore_index=True,
)


100%|██████████| 6/6 [00:00<00:00, 50.52it/s]


# Load Other Data

In [4]:
# Load the "others" samples for each classifier. The strategy matrix has one
# column per classifier; the labels whose entry in that column is 0 are the
# ones handed to load_others().
strategy_mat = pd.read_csv('../../data/others/strategy_mat_v1.csv')

# model name -> DataFrame of "other" samples
ori_data_other = {}
for each_model in model_list:
    usable = strategy_mat[each_model] == 0
    available_labels = list(strategy_mat.loc[usable, 'label'].unique())
    ori_data_other[each_model] = load_others(each_model, available_labels)

# Clean Data

In [5]:
# Seed for the per-model shuffles below. Without it every run produces a
# different row order, so the "other" models (trained directly on these
# frames) are not reproducible. 6 matches the random_state already used for the
# final shuffle of the combined training frame later in this notebook.
SHUFFLE_SEED = 6

# Text column of the main frames / of the "others" frames.
col = 'split_text'
col_other = 'text'

# model name -> cleaned, shuffled DataFrame
clean_data_main = {}
clean_data_other = {}
for each_model in model_list:
    print(each_model)

    # Drop incomplete rows. The explicit .copy() gives independent frames so
    # the column assignments below never write into a derived slice
    # (avoids pandas' chained-assignment / SettingWithCopy warnings).
    main_df = ori_data_main[each_model].dropna().copy()
    other_df = ori_data_other[each_model].dropna().copy()

    # cut words (time-pattern removal + jieba segmentation)
    main_df[col] = main_df[col].apply(cut_words)
    other_df[col_other] = other_df[col_other].apply(cut_words)
    print('finish cutting words')

    # replace punctuation with spaces
    main_df[col] = main_df[col].apply(clean)
    other_df[col_other] = other_df[col_other].apply(clean)

    # labels as plain ints
    main_df['label'] = main_df['label'].apply(clean_label)
    other_df['label'] = other_df['label'].apply(clean_label)

    # shuffle data (seeded, see SHUFFLE_SEED) and renumber the rows
    clean_data_main[each_model] = (
        main_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
    )
    clean_data_other[each_model] = (
        other_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
    )
    

CutDebt
finish cutting words
IDClassifier
finish cutting words
IfKnowDebtor
finish cutting words
Installment
finish cutting words
WillingToPay
finish cutting words
ConfirmLoan
finish cutting words


# Train Model

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
import pickle

import sys,os
models_path = '../../../classifier/models/ml_models/'
sys.path.append(models_path)
from ml import *



# Rebinds `model_list`: in the first cell it is a list of model names, from
# here on it is a dict mapping model name -> wrapper class. The classes are
# presumably provided by the `from ml import *` above (not visible here).
# train_main_model() looks classes up in this dict, and the training loop
# iterates over its keys, so models are trained in the insertion order below.
model_list = {
                'IDClassifier':IDClassifier, 
                  'CutDebt':CutDebt, 
                  'WillingToPay':WillingToPay,
                  'IfKnowDebtor':IfKnowDebtor,
                  'Installment':Installment,
                  'ConfirmLoan':ConfirmLoan}

In [7]:

def train_other_model(other_data,save_path,model):
    """Train the "other" classifier bundle for one model and pickle it.

    A word-level TF-IDF vectorizer (1- to 3-grams, at most 100000 features,
    sublinear tf, tokens of one or more word characters) is fitted on
    ``other_data.text``. Three classifiers are then fitted on the resulting
    matrix against ``other_data.label``: a LinearSVC wrapped in
    CalibratedClassifierCV, a LogisticRegression and a MultinomialNB. They are
    bundled, together with the vectorizer, into a ``ClassifierOther`` object.

    Args:
        other_data: DataFrame with ``text`` and ``label`` columns.
        save_path: path template with two ``{}`` placeholders; both are filled
            with ``model``.
        model: model name, used only to build the output path.

    Returns:
        The ``ClassifierOther`` instance that was written to disk.
    """
    phrase_vectorizer_other = TfidfVectorizer(ngram_range=(1,3),
                                strip_accents='unicode', 
                                max_features=100000, 
                                analyzer='word',
                                sublinear_tf=True,
                                token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer_other.fit(other_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer_other.transform(other_data.text)

    # linear svc, wrapped for calibration
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc) 
    lsvc.fit(phrase, other_data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, other_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, other_data.label)

    print('finish training others')

    # other wrapper; possible_label is the set of classes seen by the SVC
    other_model = ClassifierOther(svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer_other, jieba_path='../WordCut/userdict.txt',possible_label=lsvc.classes_)

    # Saving: use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    evl_path = save_path.format(model,model)
    print('saving to path: {}'.format(evl_path))
    with open(evl_path, "wb") as model_file:
        pickle.dump(other_model, model_file)
    return other_model
    
    
def train_main_model(df,save_path,model,other_model):
    """Train the main classifier bundle for one model and pickle it.

    A word-level TF-IDF vectorizer (1- to 3-grams, at most 100000 features,
    sublinear tf, tokens of one or more word characters) is fitted on
    ``df.split_text``. Three classifiers are then fitted on the resulting
    matrix against ``df.label``: a LinearSVC wrapped in CalibratedClassifierCV,
    a LogisticRegression and a MultinomialNB. They are bundled, together with
    the vectorizer and ``other_model``, into the wrapper class registered for
    ``model`` in the module-level ``model_list`` dict.

    Args:
        df: DataFrame with ``split_text`` and ``label`` columns.
        save_path: path template with two ``{}`` placeholders; both are filled
            with ``model``.
        model: model name; key into ``model_list`` and part of the output path.
        other_model: the already trained "other" model passed to the wrapper.

    Returns:
        The wrapper instance that was written to disk.
    """
    # get tfidf
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # linear svc, wrapped for calibration
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc) 
    lsvc.fit(phrase, df.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, df.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df.label)
    print('finish training')

    main_model = model_list[model](svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, other=other_model,  jieba_path='../WordCut/userdict.txt')

    # Announce the target first (as train_other_model does) so the path is
    # visible even if writing fails, then write through a context manager so
    # the file handle is always closed.
    evl_path = save_path.format(model,model)
    print('saving to path: {}'.format(evl_path))
    with open(evl_path, "wb") as model_file:
        pickle.dump(main_model, model_file)
    return main_model
    
    

# Train Other Model + Main Model

In [8]:
# each_model = 'IDClassifier' 
# Output templates; both placeholders are filled with the model name.
save_path_other = '../../../classifier/saved_model/{}/other_flow/{}.pkl'
save_path_main = '../../../classifier/saved_model/{}/main_flow/{}.pkl'

for each_model in model_list:
    # Step 1: train and save the "other" model on the others-only data.
    other_model = train_other_model(
        clean_data_other[each_model],
        save_path_other,
        each_model,
    )

    # Step 2: build the main training frame. All "others" rows are appended
    # under one extra class whose id is (largest main label + 1).
    df_main = clean_data_main[each_model].copy()
    other_label = int(df_main['label'].max() + 1)
    ava_others = (
        clean_data_other[each_model]
        .rename(columns={'text':'split_text'})
        .assign(label=other_label)
    )
    df_main = pd.concat([df_main, ava_others], sort=True)
    df_main = df_main.sample(frac=1, random_state=6).reset_index(drop=True)

    print('=====  {} ======='.format(each_model))
    print(df_main.label.value_counts())

    # Step 3: train and save the main model, which embeds the "other" model.
    clf = train_main_model(df_main, save_path_main, each_model, other_model)
    print('\n')

fitting phrase
transform phrase
finish training others
saving to path: ../../../classifier/saved_model/IDClassifier/other_flow/IDClassifier.pkl
2    3277
1     530
0     337
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
saving to path: ../../../classifier/saved_model/IDClassifier/main_flow/IDClassifier.pkl


fitting phrase
transform phrase
finish training others
saving to path: ../../../classifier/saved_model/CutDebt/other_flow/CutDebt.pkl
2    4619
1    1432
0    1364
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai
saving to path: ../../../classifier/saved_model/CutDebt/main_flow/CutDebt.pkl


fitting phrase
transform phrase
finish training others
saving to path: ../../../classifier/saved_model/WillingToPay/other_flow/WillingToPay.pkl
2    4839
1    1945
0     663
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai
saving to path