In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
jieba.load_userdict("../../code/WordCut/userdict.txt")

import gc
import pickle
import sys

tpattern_path = '../../../classifier/models/time_pattern/'
sys.path.append(tpattern_path)
from time_pattern import TimePattern
env_path = '../../../classifier/env/'
sys.path.append(env_path)
from env import ENV
log_path = '../../../classifier/lib/'
sys.path.append(log_path)
from log import Logger
matrix_path = '../../../Lib/'
sys.path.append(matrix_path)
from model_matrix import eval_mat
from sklearn.model_selection import KFold

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.900 seconds.
Prefix dict has been built succesfully.


In [2]:
# Directory containing the CSV inputs read below.
path = '../../data/others/'

# Read the three input tables; every file sits directly under `path`.
other_non109 = pd.read_csv(path + 'cleaned_mock_up_data_non109.csv')
other_109 = pd.read_csv(path + 'cleaned_mock_up_data_109.csv')
strategy_mat = pd.read_csv(path + 'strategy_mat.csv', encoding='utf8')

# Print (rows, columns) of each table as a quick sanity check.
for loaded_frame in (other_non109, other_109, strategy_mat):
    print(loaded_frame.shape)

(3950, 2)
(1408, 8)
(16, 9)


In [3]:
class ClassifierOther:
    """Averaging ensemble that assigns a sentence to one of the "other" labels.

    Three fitted models (``svc``, ``logistic``, ``nb``) are fed the output of
    one fitted ``tfidf`` vectorizer. Their ``predict_proba`` rows are averaged
    and the arg-max column is translated to a label via ``label_mapping``.
    """

    def __init__(self, **model):
        """
        suggested parameters:
        svc, logistic, nb, jieba_path, tfidf, possible_label
        """
        # The logger is created lazily on the first call to classify().
        self.log = None
        self._load_model(**model)
        self._load_attributes(**model)

    def _load_model(self, **model):
        """Store the fitted estimators/vectorizer and optionally load a jieba user dict."""
        self.svc = model.get('svc')
        self.logistic = model.get('logistic')
        self.nb = model.get('nb')
        self.tfidf = model.get('tfidf')
        # load jieba
        jieba_path = model.get('jieba_path')
        if jieba_path is not None:
            jieba.load_userdict(jieba_path)

    def _load_attributes(self, **model):
        """Build ``label_mapping``: the sorted, de-duplicated list of possible labels.

        ``possible_label`` is used when given; otherwise the ``classes_`` of
        the ``svc`` model is used as a fallback.

        Raises:
            TypeError: if no labels can be determined from either source.
        """
        possible_label = model.get('possible_label')
        if possible_label is None:
            # Fall back to the labels the fitted svc model reports, if any.
            possible_label = getattr(self.svc, 'classes_', None)
        if possible_label is None:
            raise TypeError(
                "ClassifierOther needs 'possible_label' (or an 'svc' model "
                "exposing 'classes_') to map predictions to labels"
            )
        # NOTE(review): classify() indexes this list with the arg-max column of
        # predict_proba, so it presumably has to match the models' classes_
        # order - confirm when passing a custom possible_label.
        self.label_mapping = sorted(set(possible_label))

    def classify(self, sentence):
        """
        input: sentence
        output: result(dictionary) with keys
            'label'     - entry of label_mapping at the arg-max of 'av_pred'
            'pred_prob' - stacked predict_proba rows in the order svc, logistic, nb
            'av_pred'   - column-wise mean of 'pred_prob'
        """
        if self.log is None:
            self.log = Logger(self.__class__.__name__,level=ENV.MODEL_LOG_LEVEL.value).logger
        # Segment with jieba and re-join with spaces before vectorizing.
        sentence = ' '.join(jieba.cut(sentence, cut_all = False))
        matrix = self.tfidf.transform([sentence])
        self.log.debug('In transfered tfidf, the number of words in vocalbulary is: {}'.format(len(matrix.data)))

        # One probability row per model.
        result = np.vstack((self.svc.predict_proba(matrix),
                            self.logistic.predict_proba(matrix),
                            self.nb.predict_proba(matrix)))

        av_pred = np.mean(result, axis = 0)
        label = self.label_mapping[np.argmax(av_pred)]

        dictionary = {'label': label, 'pred_prob': result, 'av_pred': av_pred}
        self.log.debug('Possible labels are: {}'.format(self.label_mapping))
        self.log.debug('Other- Final Pred label is: {}'.format(dictionary['label']))
        self.log.debug('Other- svc,logistic,nb result:\n {}'.format(dictionary['pred_prob']))
        self.log.debug('Other- ave result:\n {}'.format(dictionary['av_pred']))
        return dictionary
    
    
class BaseClassifier:
    """Shared plumbing for the main classifiers: model storage and time bucketing."""

    def __init__(self, **model):
        """
        suggested parameters:
        svc, logistic, nb, jieba_path, tfidf, other, re_time
        """
        self._load_model(**model)
        # The logger is created lazily (see _ensure_log).
        self.log = None

    def warm_up(self):
        """Run one throw-away prediction on the ``other`` model, if one is configured."""
        if self.other is not None:
            self.other.classify('')

    def _load_model(self,**model):
        """Store the fitted estimators/vectorizer and optionally load a jieba user dict."""
        self.svc = model.get('svc')
        self.logistic = model.get('logistic')
        self.nb = model.get('nb')
        self.tfidf = model.get('tfidf')
        self.other = model.get('other')
        # Time extractor used by _ext_time; it must expose process(sentence).
        # It used to be read without ever being assigned.
        self.re_time = model.get('re_time')
        # load jieba
        jieba_path = model.get('jieba_path')
        if jieba_path is not None:
            jieba.load_userdict(jieba_path)

    def _ensure_log(self):
        """Return the logger, creating it on first use."""
        if self.log is None:
            self.log = Logger(self.__class__.__name__,level=ENV.MODEL_LOG_LEVEL.value).logger
        return self.log

    def _ext_time(self,sentence, lower_bounder=36, upper_bounder=24*15):
        """
        Extract times from ``sentence`` and bucket the result.

        time label 0: nothing was extracted
        time label 2: more than one time was extracted
        time label 10: exactly one time, delta < lower_bounder
        time label 11: exactly one time, lower_bounder <= delta < upper_bounder
        time label 12: exactly one time, any other delta

        The delta is the 'gapH' entry of the single extracted item and is
        logged in hours.

        return: {'label': time_label, 'time_extract': <extractor output>}
        raises: AttributeError if no time extractor (re_time) is configured
        """
        log = self._ensure_log()
        re_time = getattr(self, 're_time', None)
        if re_time is None:
            raise AttributeError(
                "{} has no time extractor; pass re_time=... at construction "
                "or assign .re_time before calling _ext_time".format(self.__class__.__name__)
            )

        time_extract = re_time.process(sentence)
        if len(time_extract) == 0:
            time_label = 0
            log.debug('No time was extracted!')
        elif len(time_extract) > 1:
            time_label = 2
            log.debug('More than 2 times were extracted!')
        else:
            delta = time_extract[0]['gapH']
            log.debug('Just one time was extracted! And the time delta is {} hours'.format(delta))
            if delta < lower_bounder:
                time_label = 10
                log.debug('The delta is less than lower bounder {} hours'.format(lower_bounder))
            elif lower_bounder <= delta < upper_bounder:
                time_label = 11
                log.debug('The delta is greater than lower bounder {} hours but less than upper bounder {} hours'.format(lower_bounder,upper_bounder))
            else:
                time_label = 12
                log.debug('The delta is greater than upper bounder {} hours'.format(upper_bounder))

        return {'label':time_label,'time_extract':time_extract}
        
    


class IDClassifier(BaseClassifier):
    """Main classifier that delegates to the ``other`` model for the catch-all class."""

    # Arg-max index that triggers delegation to the ``other`` model.
    # Kept at 2 (the previously hard-coded value); override on a subclass or
    # instance if the catch-all class sits at a different index.
    OTHER_LABEL = 2

    def classify(self, sentence,lower_bounder=None,upper_bounder=None):
        """
        ML model wrapper. No time regular expression involved!
        input: sentence - type string
               lower_bounder, upper_bounder - accepted but not used here
        return: dictionary with keys
            'label'          - arg-max index of 'av_pred', or the label returned
                               by the ``other`` model when that index is OTHER_LABEL
            'pred_prob'      - stacked predict_proba rows in the order svc, logistic, nb
            'av_pred'        - column-wise mean of 'pred_prob'
            'other_response' - the ``other`` model's result dict, or None
        """
        if self.log is None:
            self.log = Logger(self.__class__.__name__,level=ENV.MODEL_LOG_LEVEL.value).logger
        # Segment with jieba and re-join with spaces before vectorizing.
        sentence = ' '.join(jieba.cut(sentence, cut_all = False))
        matrix = self.tfidf.transform([sentence])
        self.log.debug('In transfered tfidf, the number of words in vocalbulary is: {}'.format(len(matrix.data)))
        # One probability row per model.
        result = np.vstack((self.svc.predict_proba(matrix),
                            self.logistic.predict_proba(matrix),
                            self.nb.predict_proba(matrix)))

        av_pred = np.mean(result, axis = 0)
        label = np.argmax(av_pred)
        response = None
        if label == self.OTHER_LABEL:
            if self.other is not None:
                # The space-joined text (not the raw input) is what gets forwarded.
                response = self.other.classify(sentence)
                label = response['label']
            else:
                # No fallback model configured: keep the catch-all index as the label.
                self.log.debug('No other model configured; keeping label {}'.format(label))

        dictionary = {'label': label, 'pred_prob': result, 'av_pred': av_pred,'other_response':response}
        self.log.debug('Final Pred label is: {}'.format(dictionary['label']))
        self.log.debug('svc,logistic,nb result:\n {}'.format(dictionary['pred_prob']))
        self.log.debug('ave result:\n {}'.format(dictionary['av_pred']))
        return dictionary

In [4]:
def get_other_data(df_non109,df_109,strategy_mat,classifier):
    """Assemble the training rows for the "other" model of one classifier.

    Parameters:
        df_non109: frame with a 'label' column; rows are kept when their label
            is one of the labels selected from ``strategy_mat``.
        df_109: frame with a column named after ``classifier``; rows are kept
            when that column equals 0.
        strategy_mat: frame with a 'label' column and a column named after
            ``classifier``; labels of the rows where that column equals 0 are
            the ones selected.
        classifier: column name used in ``strategy_mat`` and ``df_109``.

    Returns:
        The two selections concatenated (non109 rows first), with a fresh
        0..n-1 index and columns sorted by name.
    """
    wanted = strategy_mat.loc[strategy_mat[classifier] == 0, 'label']
    possible_label = sorted(set(wanted.values))
    # isin() always yields a boolean mask, even for an empty frame, whereas
    # apply() on an empty column does not return a boolean Series.
    train_data_non109 = df_non109[df_non109['label'].isin(possible_label)]
    train_data_109 = df_109[df_109[classifier] == 0]
    return pd.concat([train_data_non109, train_data_109], ignore_index=True, sort=True)

def train_other_model(other_data, save_path, model):
    """Fit a TF-IDF vectorizer plus three classifiers and wrap them in a ClassifierOther.

    Parameters:
        other_data: frame read through its ``text`` (vectorizer input) and
            ``label`` (training target) attributes.
        save_path: accepted but not used; nothing is written to disk.
        model: accepted but not used.

    Returns:
        A ClassifierOther holding the calibrated LinearSVC, the logistic
        regression, the multinomial naive Bayes model and the fitted vectorizer.
    """
    tfidf_other = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        strip_accents='unicode',
        sublinear_tf=True,
        max_features=100000,
    )

    print('fitting phrase')
    tfidf_other.fit(other_data.text)

    print('transform phrase')
    features = tfidf_other.transform(other_data.text)

    # LinearSVC is wrapped in CalibratedClassifierCV; ClassifierOther calls
    # predict_proba on all three estimators.
    calibrated_svc = CalibratedClassifierCV(LinearSVC())
    logistic = LogisticRegression()
    bayes = MultinomialNB()
    for estimator in (calibrated_svc, logistic, bayes):
        estimator.fit(features, other_data.label)

    print('finish training others')

    # Bundle everything; the label list is taken from the calibrated SVC.
    wrapped = ClassifierOther(
        svc=calibrated_svc,
        logistic=logistic,
        nb=bayes,
        tfidf=tfidf_other,
        jieba_path='../WordCut/userdict.txt',
        possible_label=calibrated_svc.classes_,
    )

    print('disable saving to path')
    return wrapped
    
    
def train_main_model(df,save_path,model,other_model):
    """Fit the TF-IDF vectorizer and the three main models, then wrap them.

    Parameters:
        df: frame read through its ``split_text`` (vectorizer input) and
            ``label`` (training target) attributes.
        save_path: only reported in the final message; nothing is written to disk.
        model: key into the module-level ``model_list`` dict, which must be
            defined before this function is called.
        other_model: passed to the wrapper as its ``other`` model.

    Returns:
        (main_model, lsvc, log_r, naive_b, phrase_vectorizer) - the wrapper
        followed by the individual fitted parts.
    """
    # get tfidf
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode',
                                    max_features=100000,
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # linear svc, wrapped in CalibratedClassifierCV so it exposes predict_proba
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    # Show the class balance of the training split.
    print(df.label.value_counts())
    lsvc.fit(phrase, df.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, df.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df.label)
    print('finish training')

    main_model = model_list[model](svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, other=other_model,  jieba_path='../WordCut/userdict.txt')

    # The placeholder used to be printed literally because .format() was missing.
    print('disable saving to path: {}'.format(save_path))
    return main_model,lsvc,log_r,naive_b,phrase_vectorizer

# Evaluation

In [5]:
# 10 contiguous folds (no shuffling inside KFold; rows are shuffled below).
kf = KFold(n_splits=10, shuffle=False, random_state=None)

# Registry consulted by train_main_model, and the classifier being evaluated.
model_list = {'IDClassifier': IDClassifier}
each_model = 'IDClassifier'
save_path_other = None
save_path_main = None

# Collect the "other" rows, shuffle them with a fixed seed, and train the other model.
other_data = get_other_data(other_non109, other_109, strategy_mat, each_model).sample(frac=1, random_state=19)
other_model = train_other_model(other_data, save_path_other, each_model)

# Main data set; the catch-all label is one past the largest label in the file.
df_main = pd.read_csv('../../data/{}/cleaned_mock_up_data.csv'.format(each_model))
other_label = int(max(set(df_main.label)) + 1)
ava_others = other_data.rename(columns={'text': 'split_text'}).assign(label=other_label)

# Use the relabelled column when the file provides one.
if 'new_label' in df_main.columns:
    df_main['label'] = df_main['new_label']
    print('using new_label')
else:
    print('no new_label')

# Append the "other" rows, shuffle with a fixed seed, renumber, then drop exact duplicates.
df_main = (
    pd.concat([df_main, ava_others], sort=True)
    .sample(frac=1, random_state=6)
    .reset_index(drop=True)
    .drop_duplicates()
)
# Collapse every label at or above the catch-all label into the catch-all label.
df_main.loc[df_main.label >= other_label, 'label'] = other_label

# Positional train/validation indices per fold (used with .iloc later).
ss = kf.split(df_main)
fold_train_index, fold_val_index = (list(part) for part in zip(*ss))



fitting phrase
transform phrase
finish training others
disable saving to path
using new_label


In [8]:
# Hold out 20% of the rows (fixed seed) and train the main model on the rest.
train, evl = train_test_split(df_main, test_size=0.2, random_state=43)
print('=====  {} ======='.format(each_model))
print(df_main.label.value_counts())
clf, lsvc, log_r, naive_b, tfidf = train_main_model(train, save_path_main, each_model, other_model)
print('\n')

# Classify every held-out sentence; predictions of 2 or more are folded into 2.
result = np.array([clf.classify(text)['label'] for text in evl.split_text.values])
result = np.where(result >= 2, 2, result)
evaluation1 = eval_mat(evl.label.values, result)
evaluation1

2    3753
1     558
0     366
Name: label, dtype: int64
fitting phrase
transform phrase
2    2994
1     444
0     303
Name: label, dtype: int64
finish training
disable saving to path: {}




Unnamed: 0,pred_0,pred_1,pred_2,recall
actual_0,38.0,1.0,24.0,0.603175
actual_1,0.0,66.0,48.0,0.578947
actual_2,4.0,5.0,750.0,0.988142
precision,0.904762,0.916667,0.912409,0.912393


In [6]:
# Pick one of the precomputed folds (positional indices, hence .iloc).
fold = 9
train, evl = (
    df_main.iloc[positions]
    for positions in (fold_train_index[fold], fold_val_index[fold])
)

print('=====  {} ======='.format(each_model))
print(df_main.label.value_counts())
clf, lsvc, log_r, naive_b, tfidf = train_main_model(train, save_path_main, each_model, other_model)
print('\n')

2    3753
1     558
0     366
Name: label, dtype: int64
fitting phrase
transform phrase
2    3389
1     495
0     326
Name: label, dtype: int64
finish training
disable saving to path: {}




In [73]:
# Classify every evaluation sentence; predictions of 2 or more are folded into 2.
result = np.array([clf.classify(text)['label'] for text in evl.split_text.values])
result = np.where(result >= 2, 2, result)
evaluation1 = eval_mat(evl.label.values, result)
evaluation1

Unnamed: 0,pred_0,pred_1,pred_2,recall
actual_0,19.0,0.0,23.0,0.452381
actual_1,1.0,39.0,22.0,0.629032
actual_2,1.0,0.0,362.0,0.997245
precision,0.904762,1.0,0.889435,0.899358


In [74]:
# Evaluation rows whose true label is 1 but whose prediction is neither 1 nor 2.
evl[(evl.label.values == 1) & (result != 2) & (result != evl.label.values)]

Unnamed: 0,ConfirmLoan,CutDebt,IDClassifier,IfKnowDebtor,Installment,WillingToPay,classifier,label,new_label,split_text
4515,,,,,,,IDClassifier,1,1.0,是 你 妹


In [14]:
# Classify every evaluation sentence; predictions of 2 or more are folded into 2.
result = np.array([clf.classify(text)['label'] for text in evl.split_text.values])
result = np.where(result >= 2, 2, result)
evaluation1 = eval_mat(evl.label.values, result)
evaluation1

Unnamed: 0,pred_0,pred_1,pred_2,recall
actual_0,71.0,1.0,32.0,0.682692
actual_1,0.0,116.0,57.0,0.67052
actual_2,5.0,3.0,675.0,0.988287
precision,0.934211,0.966667,0.883508,0.897917


In [45]:
# Evaluate the calibrated LinearSVC on its own (no ensemble, no other-model delegation).
matr = tfidf.transform(evl['split_text'].values)
result_lsvc = lsvc.predict(matr)
evaluation1svc = eval_mat(evl['label'].values, result_lsvc)
evaluation1svc

Unnamed: 0,pred_0,pred_1,pred_2,recall
actual_0,106.0,1.0,15.0,0.868852
actual_1,1.0,113.0,15.0,0.875969
actual_2,23.0,6.0,306.0,0.913433
precision,0.815385,0.941667,0.910714,0.895904


In [62]:
# Evaluate the logistic regression on its own (no ensemble, no other-model delegation).
matr = tfidf.transform(evl['split_text'].values)
result_logr = log_r.predict(matr)
evaluation1logr = eval_mat(evl['label'].values, result_logr)
evaluation1logr

Unnamed: 0,pred_0,pred_1,pred_2,recall
actual_0,64.0,1.0,51.0,0.551724
actual_1,4.0,109.0,61.0,0.626437
actual_2,14.0,6.0,650.0,0.970149
precision,0.780488,0.939655,0.853018,0.857292


In [63]:
# Evaluate the naive Bayes model on its own (no ensemble, no other-model delegation).
matr = tfidf.transform(evl['split_text'].values)
result_nb = naive_b.predict(matr)
evaluation1nb = eval_mat(evl['label'].values, result_nb)
evaluation1nb

Unnamed: 0,pred_0,pred_1,pred_2,recall
actual_0,30.0,1.0,85.0,0.258621
actual_1,0.0,76.0,98.0,0.436782
actual_2,3.0,2.0,665.0,0.992537
precision,0.909091,0.962025,0.784198,0.803125


In [66]:
# Number of rows in df_main whose label is missing.
int(df_main['label'].isnull().sum())

0

In [63]:
# Preview only the first rows instead of dumping the whole frame into the output.
df_main.head(10)

Unnamed: 0,ConfirmLoan,CutDebt,IDClassifier,IfKnowDebtor,Installment,WillingToPay,classifier,label,new_label,split_text
0,,,,,,,,2,,你 讲 我 听不懂
1,,,,,,,IDClassifier,1,109.0,都 说 了 多少 次 了
2,,,,,,,IDClassifier,0,0.0,好 的 是 我 你 有 什么 事
3,1.0,1.0,0.0,0.0,1.0,1.0,,2,,要钱 没有 要命 一条
4,,,,,,,,2,,你 怎么 知道 我 号码
5,,,,,,,IDClassifier,1,1.0,别人 的 手机
6,,,,,,,,2,,你 那边 有点 吵 啊
7,,,,,,,,2,,我 没有 听清池
8,,,,,,,IDClassifier,0,0.0,啊 对 的
9,,,,,,,IDClassifier,1,109.0,不关 你 事
