In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
import sys,os






In [2]:
import jieba
import numpy as np
import sys,os
# tpattern_path = '../../../classifier/models/time_pattern/'
tpattern_path = '../../../classifier/models/time_extractor/'
sys.path.append(tpattern_path)
from time_pattern import TimePattern


env_path = '../../../classifier/env/'
sys.path.append(env_path)
from env import ENV
log_path = '../../../classifier/lib/'
sys.path.append(log_path)
from log import Logger





class BaseClassifier:
    """Base class for the dialogue-node classifiers defined in this notebook.

    Holds three probabilistic models (``svc``, ``logistic``, ``nb``) that share
    one fitted ``tfidf`` vectorizer, averages their class probabilities
    (optionally weighted by ``weights``) and hands sentences predicted as
    ``OTHER_LABEL`` over to a secondary ``other`` classifier.
    """

    # Class index that triggers the secondary "other" classifier.
    # NOTE(review): the training cells assign the "other" class the label
    # max(main labels) + 1, which is 2 for the binary tasks trained here.
    OTHER_LABEL = 2

    def __init__(self, **model):
        """
        suggested parameters:
        svc, logistic, nb, tfidf, other, weights, jieba_path
        """
        self._load_model(**model)
        # The logger is created on first use, see _ensure_logger().
        self.log = None
        self.re_time = TimePattern(logAppendName=self.__class__.__name__)

    def warm_up(self):
        """Run one dummy classification through the secondary classifier, if any."""
        if self.other is not None:
            self.other.classify('')

    def _load_model(self, **model):
        """Store the model components passed as keyword arguments.

        Missing components are stored as ``None``. When ``jieba_path`` is
        given, it is loaded as a jieba user dictionary.
        """
        self.svc = model.get('svc')
        self.logistic = model.get('logistic')
        self.nb = model.get('nb')
        self.tfidf = model.get('tfidf')
        self.other = model.get('other')
        self.weights = model.get('weights')
        # load jieba
        jieba_path = model.get('jieba_path')
        if jieba_path is not None:
            jieba.load_userdict(jieba_path)

    def _ensure_logger(self):
        """Create the logger on first use; does nothing once ``self.log`` is set."""
        if self.log is None:
            self.log = Logger(self.__class__.__name__, level=ENV.MODEL_LOG_LEVEL.value).logger

    def _ext_time(self, sentence, lower_bounder='明天下午5点', upper_bounder='1个月'):
        """Extract time expressions from ``sentence`` and bucket them.

        ``lower_bounder`` / ``upper_bounder`` are time expressions (defaults:
        "tomorrow 5 pm" and "1 month") that are converted to hour gaps with
        the same extractor and used as thresholds.

        time label 0: nothing was extracted
        time label 2: two or more time expressions were extracted
        time label 10: one extracted, gap is below the lower bound
        time label 11: one extracted, gap is in [lower bound, upper bound)
        time label 12: one extracted, gap is at or above the upper bound

        Returns a dict with keys ``label``, ``time_extract``,
        ``lower_bounder_hour`` and ``upper_bounder_hour``.

        Indexing ``[0]`` on the extractor result raises if a bound cannot be
        parsed into at least one time expression.
        """
        self._ensure_logger()
        lower_bounder_hour = self.re_time.process(lower_bounder)[0]['gapH']
        upper_bounder_hour = self.re_time.process(upper_bounder)[0]['gapH']
        time_extract = self.re_time.process(sentence)
        time_label = 0
        if len(time_extract) == 0:
            time_label = 0
            self.log.debug('No time was extracted!')
        elif len(time_extract) > 1:
            time_label = 2
            self.log.debug('More than 2 times were extracted!')
        else:
            delta = time_extract[0]['gapH']
            self.log.debug('Just one time was extracted! And the time delta is {} hours'.format(delta))
            if delta < lower_bounder_hour:
                time_label = 10
                self.log.debug('The delta is less than lower bounder {} hours'.format(lower_bounder_hour))
            elif lower_bounder_hour <= delta < upper_bounder_hour:
                time_label = 11
                self.log.debug('The delta is greater than lower bounder {} hours but less than upper bounder {} hours'.format(lower_bounder_hour,upper_bounder_hour))
            else:
                time_label = 12
                # Report the bound in hours, like the other messages (the raw
                # expression string used to be formatted in here).
                self.log.debug('The delta is greater than upper bounder {} hours'.format(upper_bounder_hour))

        return {'label': time_label,
                'time_extract': time_extract,
                'lower_bounder_hour': lower_bounder_hour,
                'upper_bounder_hour': upper_bounder_hour}

    def _average_predictions(self, result):
        """Combine the per-model probability rows of ``result`` into one row.

        Uses a weighted average when ``self.weights`` has one weight per row;
        otherwise (no weights, or a length mismatch) falls back to the plain
        mean so that a result is always produced.
        """
        weights = self.weights
        if weights is not None and len(weights) == result.shape[0]:
            av_pred = result[0] * weights[0]
            for r_idx in range(1, result.shape[0]):
                av_pred += result[r_idx] * weights[r_idx]
            return av_pred / sum(weights)
        if weights is not None and self.log is not None:
            self.log.debug('Got {} weights for {} models, falling back to the plain mean'.format(
                len(weights), result.shape[0]))
        return np.mean(result, axis=0)

    def preds_ml(self, sentence, removeTime=True):
        """Run the ensemble on ``sentence``.

        The sentence optionally has its time expressions removed, is segmented
        with jieba, vectorized with ``self.tfidf`` and scored by the three
        models. If the winning class index equals ``OTHER_LABEL``, the
        segmented sentence is passed to ``self.other`` and its label is used.

        Returns ``(label, result, av_pred, response)``: the final label, the
        stacked per-model probabilities, the combined probabilities, and the
        secondary classifier's result dict (``None`` if it was not consulted).
        """
        self._ensure_logger()
        if removeTime:
            sentence = self.re_time.remove_time(sentence)
        sentence = ' '.join(jieba.cut(sentence, cut_all=False))
        matrix = self.tfidf.transform([sentence])
        self.log.debug('In transfered tfidf, the number of words in vocalbulary is: {}'.format(len(matrix.data)))
        result = np.vstack((self.svc.predict_proba(matrix),
                            self.logistic.predict_proba(matrix),
                            self.nb.predict_proba(matrix)))
        av_pred = self._average_predictions(result)
        label = np.argmax(av_pred)
        response = None
        if label == self.OTHER_LABEL:
            response = self.other.classify(sentence)
            label = response['label']
        return label, result, av_pred, response
        
    


class IDClassifier(BaseClassifier):
    """Classifier whose result is reported under the key 'ifDebtorAnswersing'."""

    def __init__(self, **model):
        super().__init__(**model)
        self.label_meaning = 'ifDebtorAnswersing'
        self.label_meaning_map = {0: 'y', 1: 'n'}

    def classify(self, sentence, lower_bounder=None, upper_bounder=None, debug=False):
        """
        Pure ML prediction; no time extraction is applied to the label.

        sentence: input string.
        lower_bounder / upper_bounder: accepted but not used by this method.
        debug: when true, the raw probability arrays and the full secondary
               classifier result are returned instead of plain floats.
        return: result dictionary ('label', 'av_pred', 'other_response',
                'ml_label', plus 'pred_prob' in debug mode, plus the
                label-meaning entry: 'y', 'n' or 'null').
        """
        predicted, per_model_probs, combined_probs, other_result = self.preds_ml(sentence)

        if debug:
            payload = {'label': predicted,
                       'pred_prob': per_model_probs,
                       'av_pred': combined_probs,
                       'other_response': other_result,
                       'ml_label': predicted}
        else:
            # Reduce both probability vectors to their top score as plain floats.
            other_score = None if other_result is None else float(max(other_result['av_pred']))
            payload = {'label': predicted,
                       'av_pred': float(max(combined_probs)),
                       'other_response': other_score,
                       'ml_label': predicted}

        self.log.debug('Final Pred label is: {}'.format(predicted))
        payload[self.label_meaning] = self.label_meaning_map.get(predicted, 'null')
        return payload
    
    
    

class IfKnowDebtor(BaseClassifier):
    """Classifier whose result is reported under the key 'ifKnowDebtor'."""

    def __init__(self, **model):
        super().__init__(**model)
        self.label_meaning = 'ifKnowDebtor'
        self.label_meaning_map = {0: 'y', 1: 'n'}

    def classify(self, sentence, lower_bounder=None, upper_bounder=None, debug=False):
        """
        Pure ML prediction; no time extraction is applied to the label.

        sentence: input string.
        lower_bounder / upper_bounder: accepted but not used by this method.
        debug: when true, return the raw probability arrays and the full
               secondary classifier result instead of plain floats.
        return: result dictionary including the label-meaning entry
                ('y', 'n' or 'null' for labels outside the map).
        """
        ml_label, model_probs, avg_probs, other_out = self.preds_ml(sentence)
        final_label = ml_label

        if not debug:
            # Keep only the top score of each probability vector.
            if other_out is not None:
                other_out = float(max(other_out['av_pred']))
            out = {'label': final_label,
                   'av_pred': float(max(avg_probs)),
                   'other_response': other_out,
                   'ml_label': ml_label}
        else:
            out = {'label': final_label,
                   'pred_prob': model_probs,
                   'av_pred': avg_probs,
                   'other_response': other_out,
                   'ml_label': ml_label}

        self.log.debug('Final Pred label is: {}'.format(final_label))
        meaning = self.label_meaning_map.get(final_label, 'null')
        out.update({self.label_meaning: meaning})
        return out
    
    
    

    
class ConfirmLoan(BaseClassifier):
    """Classifier whose result is reported under the key 'ifAdmitLoan'."""

    def __init__(self, **model):
        super().__init__(**model)
        self.label_meaning = 'ifAdmitLoan'
        self.label_meaning_map = {0: 'y', 1: 'n'}

    def classify(self,
                 sentence,
                 lower_bounder='明天下午5点',
                 upper_bounder='1个月', debug=False):
        """
        Pure ML prediction for the sentence.

        The bounds are accepted for signature compatibility with the
        time-aware classifiers but are not used: this method performs no time
        extraction and the label comes straight from preds_ml.

        debug: when true, return the raw probability arrays and the full
               secondary classifier result instead of plain floats.
        return: result dictionary including the label-meaning entry
                ('y', 'n' or 'null' for labels outside the map).
        """
        ml_label, stacked, averaged, secondary = self.preds_ml(sentence)

        summary = {'label': ml_label}
        if debug:
            summary['pred_prob'] = stacked
            summary['av_pred'] = averaged
            summary['other_response'] = secondary
        else:
            # Collapse the secondary result first, then the main average,
            # each to its highest probability as a plain float.
            secondary_top = float(max(secondary['av_pred'])) if secondary is not None else None
            summary['av_pred'] = float(max(averaged))
            summary['other_response'] = secondary_top
        summary['ml_label'] = ml_label

        self.log.debug('Final Pred label is: {}'.format(ml_label))
        summary[self.label_meaning] = self.label_meaning_map.get(ml_label, 'null')
        return summary
    
    
    

    
class WillingToPay(BaseClassifier):
    """Time-aware classifier whose result is reported under 'ifWillingToPay'."""

    def __init__(self, **model):
        super().__init__(**model)
        self.label_meaning = 'ifWillingToPay'
        self.label_meaning_map = {0: 'y', 1: 'n', 10: 'confirmAgain'}

    def classify(self,
                 sentence,
                 lower_bounder='明天下午5点',
                 upper_bounder='1个月', debug=False):
        """
        Combine the ML prediction with the time expressions found in the sentence.

        The ML ensemble is always run; the extracted times then decide whether
        its label is kept:

        two or more times extracted:
            label 10 if the smallest gap is <= the lower bound, otherwise 1.
            'av_pred' is reported as 1.0 and 'other_response' as 0.0; the
            debug flag has no effect in this case.
        exactly one time extracted:
            gap below the lower bound          --> keep the ML label
            gap in [lower bound, upper bound)  --> label 1
            gap at or above the upper bound    --> label 1 and 'add_sentiment': 1
        no time extracted:
            keep the ML label.

        lower_bounder / upper_bounder: time expressions used as thresholds
            (defaults: "tomorrow 5 pm" and "1 month").
        debug: when true, return the raw probability arrays and the full
            secondary classifier result instead of plain floats.
        return: result dictionary with 'label', 'av_pred', 'other_response',
            'ml_label', 'timeExtract' and the label-meaning entry.
        """
        if self.log is None:
            self.log = Logger(self.__class__.__name__, level=ENV.MODEL_LOG_LEVEL.value).logger
        dictionary = {}

        # Time extraction (regular-expression based extractor).
        time_result = self._ext_time(sentence, lower_bounder, upper_bounder)
        time_label = time_result['label']
        time_extract = time_result['time_extract']
        lower_bounder_hour = time_result['lower_bounder_hour']

        ml_label, result, av_pred, response = self.preds_ml(sentence)
        label = ml_label

        if time_label == 2:
            # Several times mentioned: decide from the nearest one only.
            min_time = min(each['gapH'] for each in time_extract)
            if min_time <= lower_bounder_hour:
                self.log.debug('There are more than 1 time extracted. And the min {} hours is shorter than lower bounder! The output label is set to 10!'.format(min_time))
                label = 10
            else:
                label = 1

            dictionary.update({'label': label, 'av_pred': 1.0,
                               'other_response': 0.0, 'timeExtract': time_extract, 'ml_label': ml_label})
            dictionary.update({self.label_meaning: self.label_meaning_map.get(label, 'null')})
            return dictionary

        # Zero or one time extracted: a single far-away time overrides the ML label.
        if time_label == 11:
            label = 1
        elif time_label == 12:
            label = 1
            dictionary.update({'add_sentiment': 1})

        if debug:
            dictionary.update({'label': label, 'pred_prob': result,
                               'av_pred': av_pred, 'other_response': response, 'ml_label': ml_label})
        else:
            if response is not None:
                response = float(max(response['av_pred']))
            av_pred_value = float(max(av_pred))
            dictionary.update({'label': label, 'av_pred': av_pred_value,
                               'other_response': response, 'ml_label': ml_label})
        dictionary.update({'timeExtract': time_extract})
        self.log.debug('Final Pred label is: {}'.format(label))
        dictionary.update({self.label_meaning: self.label_meaning_map.get(label, 'null')})
        return dictionary
    
    
    

class CutDebt(BaseClassifier):
    """Time-aware classifier whose result is reported under 'ifAcceptCutDebt'."""

    def __init__(self, **model):
        super().__init__(**model)
        self.label_meaning = 'ifAcceptCutDebt'
        # NOTE(review): classify() can return label 10, which is not in this
        # map, so the meaning is reported as 'null' (WillingToPay maps 10 to
        # 'confirmAgain'). Confirm with the consumers whether that is intended.
        self.label_meaning_map = {0: 'y', 1: 'n'}

    def classify(self,
                 sentence,
                 lower_bounder='明天下午5点',
                 upper_bounder='1个月', debug=False):
        """
        Combine the ML prediction with the time expressions found in the sentence.

        The ML ensemble is always run; the extracted times then decide whether
        its label is kept:

        two or more times extracted:
            label 10 if the smallest gap is <= the lower bound, otherwise 1.
            'av_pred' is reported as 1.0 and 'other_response' as 0.0; the
            debug flag has no effect in this case.
        exactly one time extracted:
            gap below the lower bound          --> keep the ML label
            gap in [lower bound, upper bound)  --> label 1
            gap at or above the upper bound    --> label 1 and 'add_sentiment': 1
        no time extracted:
            keep the ML label.

        lower_bounder / upper_bounder: time expressions used as thresholds
            (defaults: "tomorrow 5 pm" and "1 month").
        debug: when true, return the raw probability arrays and the full
            secondary classifier result instead of plain floats.
        return: result dictionary with 'label', 'av_pred', 'other_response',
            'ml_label', 'timeExtract' and the label-meaning entry.
        """
        if self.log is None:
            self.log = Logger(self.__class__.__name__, level=ENV.MODEL_LOG_LEVEL.value).logger
        dictionary = {}

        # Time extraction (regular-expression based extractor).
        time_result = self._ext_time(sentence, lower_bounder, upper_bounder)
        time_label = time_result['label']
        time_extract = time_result['time_extract']
        lower_bounder_hour = time_result['lower_bounder_hour']

        ml_label, result, av_pred, response = self.preds_ml(sentence)
        label = ml_label

        if time_label == 2:
            # Several times mentioned: decide from the nearest one only.
            min_time = min(each['gapH'] for each in time_extract)
            if min_time <= lower_bounder_hour:
                self.log.debug('There are more than 1 time extracted. And the min {} hours is shorter than lower bounder! The output label is set to 10!'.format(min_time))
                label = 10
            else:
                label = 1

            dictionary = {'label': label, 'av_pred': 1.0,
                          'other_response': 0.0, 'timeExtract': time_extract, 'ml_label': ml_label}
            dictionary.update({self.label_meaning: self.label_meaning_map.get(label, 'null')})
            return dictionary

        # Zero or one time extracted: a single far-away time overrides the ML label.
        if time_label == 11:
            label = 1
        elif time_label == 12:
            dictionary.update({'add_sentiment': 1})
            label = 1

        if debug:
            dictionary.update({'label': label, 'pred_prob': result,
                               'av_pred': av_pred, 'other_response': response, 'ml_label': ml_label})
        else:
            if response is not None:
                response = float(max(response['av_pred']))
            av_pred_value = float(max(av_pred))
            dictionary.update({'label': label, 'av_pred': av_pred_value,
                               'other_response': response, 'ml_label': ml_label})
        dictionary.update({'timeExtract': time_extract})
        self.log.debug('Final Pred label is: {}'.format(label))
        dictionary.update({self.label_meaning: self.label_meaning_map.get(label, 'null')})
        return dictionary
    
    
    
class Installment(BaseClassifier):
    """Time-aware classifier whose result is reported under 'ifAcceptInstallment'."""

    def __init__(self, **model):
        super().__init__(**model)
        self.label_meaning = 'ifAcceptInstallment'
        # NOTE(review): classify() can return label 10, which is not in this
        # map, so its meaning is reported as 'null'. Confirm that is intended.
        self.label_meaning_map = {0: 'y', 1: 'n'}

    def classify(self,
                 sentence,
                 lower_bounder='明天下午5点',
                 upper_bounder='1个月', debug=False):
        """
        Merge the ML prediction with the time expressions found in the sentence.

        two or more times extracted: label 10 when the smallest gap is <= the
            lower bound, else 1; 'av_pred' is 1.0 and 'other_response' is 0.0
            (debug is ignored here).
        one time extracted: ML label if the gap is below the lower bound;
            label 1 if it lies in [lower bound, upper bound); label 1 plus
            'add_sentiment': 1 if it is at or above the upper bound.
        no time extracted: ML label.

        The ML ensemble is run in every case and reported as 'ml_label'.
        """
        if self.log is None:
            self.log = Logger(self.__class__.__name__, level=ENV.MODEL_LOG_LEVEL.value).logger

        extraction = self._ext_time(sentence, lower_bounder, upper_bounder)
        time_label = extraction['label']
        time_extract = extraction['time_extract']
        lower_bounder_hour = extraction['lower_bounder_hour']

        ml_label, result, av_pred, response = self.preds_ml(sentence)

        if time_label == 2:
            # Several times mentioned: only the nearest one matters.
            nearest_gap = min(item['gapH'] for item in time_extract)
            if nearest_gap <= lower_bounder_hour:
                self.log.debug('There are more than 1 time extracted. And the min {} hours is shorter than lower bounder! The output label is set to 10!'.format(nearest_gap))
                label = 10
            else:
                label = 1
            outcome = {'label': label,
                       'av_pred': 1.0,
                       'other_response': 0.0,
                       'timeExtract': time_extract,
                       'ml_label': ml_label}
            outcome[self.label_meaning] = self.label_meaning_map.get(label, 'null')
            return outcome

        outcome = {}
        if time_label in (11, 12):
            # A single time beyond the lower bound overrides the ML label.
            label = 1
            if time_label == 12:
                outcome['add_sentiment'] = 1
        else:
            label = ml_label

        if debug:
            outcome.update({'label': label, 'pred_prob': result,
                            'av_pred': av_pred, 'other_response': response, 'ml_label': ml_label})
        else:
            other_score = float(max(response['av_pred'])) if response is not None else None
            outcome.update({'label': label, 'av_pred': float(max(av_pred)),
                            'other_response': other_score, 'ml_label': ml_label})
        outcome['timeExtract'] = time_extract
        self.log.debug('Final Pred label is: {}'.format(label))
        outcome[self.label_meaning] = self.label_meaning_map.get(label, 'null')
        return outcome



class ClassifierOther:
    """Secondary ensemble used for sentences the main model labels as "other".

    Holds three probabilistic models (``svc``, ``logistic``, ``nb``) sharing
    one fitted ``tfidf`` vectorizer and maps the winning class index back to
    the original label via ``label_mapping``.
    """

    def __init__(self, **model):
        """
        suggested parameters:
        svc, logistic, nb, tfidf, weights, jieba_path, possible_label
        """
        self.log = None
        self._load_model(**model)
        self._load_attributes(**model)

    def _load_model(self, **model):
        """Store the model components; load the jieba user dictionary if given."""
        self.svc = model.get('svc')
        self.logistic = model.get('logistic')
        self.nb = model.get('nb')
        self.tfidf = model.get('tfidf')
        self.weights = model.get('weights')
        # load jieba
        jieba_path = model.get('jieba_path')
        if jieba_path is not None:
            jieba.load_userdict(jieba_path)

    def _load_attributes(self, **model):
        """Build the sorted list of distinct labels from ``possible_label``.

        The position of a label in this list is used as its class index, so
        ``possible_label`` is required (a missing value raises TypeError).
        """
        self.label_mapping = sorted(set(model.get('possible_label')))

    def _average_predictions(self, result):
        """Combine the per-model probability rows of ``result`` into one row.

        Uses a weighted average when ``self.weights`` has one weight per row;
        otherwise (no weights, or a length mismatch) uses the plain mean.
        """
        weights = self.weights
        if weights is not None and len(weights) == result.shape[0]:
            av_pred = result[0] * weights[0]
            for r_idx in range(1, result.shape[0]):
                av_pred += result[r_idx] * weights[r_idx]
            return av_pred / sum(weights)
        if weights is not None and self.log is not None:
            self.log.debug('Got {} weights for {} models, falling back to the plain mean'.format(
                len(weights), result.shape[0]))
        return np.mean(result, axis=0)

    def classify(self, sentence):
        """
        input: sentence
        output: result(dictionary) with
            'label'     - entry of label_mapping for the most probable class
            'pred_prob' - stacked per-model probabilities (one row per model)
            'av_pred'   - combined probabilities (weighted when weights are set)
        """
        if self.log is None:
            self.log = Logger(self.__class__.__name__, level=ENV.MODEL_LOG_LEVEL.value).logger
        sentence = ' '.join(jieba.cut(sentence, cut_all=False))
        matrix = self.tfidf.transform([sentence])
        self.log.debug('In transfered tfidf, the number of words in vocalbulary is: {}'.format(len(matrix.data)))

        result = np.vstack((self.svc.predict_proba(matrix),
                            self.logistic.predict_proba(matrix),
                            self.nb.predict_proba(matrix)))

        # The weighted average used to be overwritten by an unconditional
        # plain mean right after it was computed, so weights had no effect.
        av_pred = self._average_predictions(result)
        label = self.label_mapping[np.argmax(av_pred)]

        dictionary = {'label': label, 'pred_prob': result, 'av_pred': av_pred}
        self.log.debug('Possible labels are: {}'.format(self.label_mapping))
        self.log.debug('Other- Final Pred label is: {}'.format(dictionary['label']))
        self.log.debug('Other- svc,logistic,nb result:\n {}'.format(dictionary['pred_prob']))
        self.log.debug('Other- ave result:\n {}'.format(dictionary['av_pred']))
        return dictionary

In [3]:
# Registry of the main classifier classes, keyed by class name;
# train_main_model looks the requested model up here.
model_list = {
    classifier.__name__: classifier
    for classifier in (
        IDClassifier,
        CutDebt,
        WillingToPay,
        IfKnowDebtor,
        Installment,
        ConfirmLoan,
    )
}

In [4]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
jieba.load_userdict("../../code/WordCut/userdict.txt")
import gc


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.830 seconds.
Prefix dict has been built succesfully.


In [52]:


def train_other_model(other_data, save_path, model, jieba_path='../WordCut/userdict.txt'):
    """Train the secondary ("other") ensemble and wrap it in a ClassifierOther.

    Fits a word-level TF-IDF vectorizer (1- to 3-grams) on ``other_data.text``
    and trains a calibrated LinearSVC, a LogisticRegression and a
    MultinomialNB on ``other_data.label``.

    Parameters
    ----------
    other_data : object with ``text`` and ``label`` columns/attributes.
        # assumes ``text`` is already word-segmented and space-joined — TODO confirm
    save_path : unused; nothing is written to disk by this function.
    model : unused; kept so existing callers keep working.
    jieba_path : path of the jieba user dictionary handed to ClassifierOther.
        Defaults to the path that was previously hard-coded.

    Returns
    -------
    ClassifierOther wrapping the fitted vectorizer and the three models, with
    ``possible_label`` taken from the calibrated SVC's ``classes_``.
    """
    phrase_vectorizer_other = TfidfVectorizer(ngram_range=(1,3),
                                strip_accents='unicode',
                                max_features=100000,
                                analyzer='word',
                                sublinear_tf=True,
                                token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer_other.fit(other_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer_other.transform(other_data.text)

    # linear svc, wrapped so that it exposes predict_proba
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, other_data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, other_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, other_data.label)

    print('finish training others')

    # other wrapper
    other_model = ClassifierOther(svc=lsvc, logistic=log_r, nb=naive_b,
                                  tfidf=phrase_vectorizer_other,
                                  jieba_path=jieba_path,
                                  possible_label=lsvc.classes_)

    return other_model
    
    
def train_main_model(df, save_path, model, other_model,
                     jieba_path='../WordCut/userdict.txt', weights=(5, 1, 1)):
    """Train the main ensemble and wrap it in the classifier class named ``model``.

    Fits a word-level TF-IDF vectorizer (1- to 3-grams) on ``df.split_text``
    and trains a calibrated LinearSVC, a LogisticRegression and a
    MultinomialNB on ``df.label``.

    Parameters
    ----------
    df : object with ``split_text`` and ``label`` columns/attributes.
    save_path : unused; nothing is written to disk by this function.
    model : key into ``model_list`` selecting the wrapper class
        (an unknown key raises KeyError).
    other_model : secondary classifier stored on the wrapper as ``other``.
    jieba_path : path of the jieba user dictionary handed to the wrapper.
        Defaults to the path that was previously hard-coded.
    weights : per-model weights in the order (svc, logistic, nb), or None for
        a plain mean. Defaults to the previously hard-coded 5/1/1.

    Returns
    -------
    An instance of ``model_list[model]`` holding the fitted components.
    """
    # get tfidf
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode',
                                    max_features=100000,
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # linear svc, wrapped so that it exposes predict_proba
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, df.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, df.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df.label)
    print('finish training')

    # Hand the wrapper its own list so the default tuple is never shared.
    model_weights = None if weights is None else list(weights)
    main_model = model_list[model](svc=lsvc, logistic=log_r, nb=naive_b,
                                   tfidf=phrase_vectorizer, other=other_model,
                                   jieba_path=jieba_path, weights=model_weights)

    return main_model
    
    

# Load Data

In [53]:
# Make the project-local Lib directory importable and load the cleaned data.
import sys,os
sys.path.append('../../../Lib/')
from load_cleaned_data import load_data
# Both results are indexed by model name in the cells below
# (e.g. clean_data_main['IDClassifier']).
clean_data_main,clean_data_other = load_data()

100%|██████████| 6/6 [00:00<00:00, 46.72it/s]


CutDebt
finish cutting words
1    1434
0    1364
Name: label, dtype: int64
109    1376
106     997
104     907
103     552
108     355
102     266
107     133
110      33
Name: label, dtype: int64
IDClassifier
finish cutting words
1    533
0    339
Name: label, dtype: int64
109    1397
104     952
103     563
107     366
Name: label, dtype: int64
IfKnowDebtor
finish cutting words
0    894
1    519
Name: label, dtype: int64
109    1393
104     952
103     563
107     365
Name: label, dtype: int64
Installment
finish cutting words
1    1368
0    1364
Name: label, dtype: int64
109    1376
106     998
104     907
103     553
108     355
102     277
107     133
110      33
Name: label, dtype: int64
WillingToPay
finish cutting words
1    1947
0     669
Name: label, dtype: int64
109    1375
106     988
104     905
103     551
108     351
102     334
105     202
107     133
Name: label, dtype: int64
ConfirmLoan
finish cutting words
0    1157
1     609
Name: label, dtype: int64
109    1375
104  

# Train Other Model + Main Model

# Train IDClassifier

In [54]:
# Train the IDClassifier pipeline: the secondary "other" model first, then the
# main model on the main data plus the "other" samples.
# The save paths are passed to the training functions as empty strings.
save_path_other = ''
save_path_main = ''

each_model = 'IDClassifier'
other_model = train_other_model(clean_data_other[each_model],save_path_other,each_model)
    
df_main = clean_data_main[each_model].copy()
# The extra "other" class gets the next integer after the largest main label.
other_label = int(max(set(df_main.label)) + 1)
# Reuse the "other" samples: rename their text column to match the main data
# and overwrite their labels with the single "other" label.
ava_others = clean_data_other[each_model].rename({'text':'split_text'},axis=1).copy()
ava_others['label'] = other_label
df_main = pd.concat([df_main,ava_others],sort=True)
# Shuffle all rows with a fixed seed and renumber the index.
df_main = df_main.sample(frac=1,random_state=6).reset_index(drop=True)
print('=====  {} ======='.format(each_model))
print(df_main.label.value_counts())
clf = train_main_model(df_main,save_path_main,each_model,other_model)
print('\n')

fitting phrase
transform phrase
finish training others
2    3278
1     533
0     339
Name: label, dtype: int64
fitting phrase
transform phrase
finish training




In [55]:
# Display the combined, shuffled training frame built in the previous cell.
df_main

Unnamed: 0,label,split_text
0,2,你们 通过 什么 渠道 知道 我 的 电话 是 谁 卖 给 你们 的 ？
1,2,我 不想 说
2,2,打 豆豆 了
3,2,我 在 开会 等 下 再说 吧
4,2,韩国 赢 了
5,0,我 在
6,2,光脚 的 不怕 穿鞋 的
7,1,他 因为 欠钱 躲起来 了
8,2,你 说 什么 我 在 地铁 里
9,0,唉 是 的 是 的


In [59]:
# Print every vocabulary entry of the fitted TF-IDF vectorizer that contains
# the full-width question mark; nothing is printed when there is none.
for each in clf.tfidf.vocabulary_:
    if '？' in each:
        print(each)

In [8]:
# Smoke test: classify a short affirmative reply ("yes") with the trained model.
clf.classify('是的')

here2


{'label': 0,
 'av_pred': 0.8820170025103654,
 'other_response': None,
 'ml_label': 0,
 'ifDebtorAnswersing': 'y'}

In [8]:
# NOTE(review): same call as the previous cell, but the recorded outputs
# differ (different 'av_pred', extra printed lines), so at least one of them
# comes from an earlier code/model version. Rerun or delete one of the cells.
clf.classify('是的')

[[9.85350024e-01 1.46383107e-02 1.16649597e-05]
 [8.76255068e-01 1.05124307e-01 1.86206254e-02]
 [3.79243623e-01 4.77753862e-02 5.72980991e-01]]


{'label': 0,
 'av_pred': 0.7469495714705526,
 'other_response': None,
 'ml_label': 0,
 'ifDebtorAnswersing': 'y'}

# Train Confirm Loan

In [9]:
# Train the ConfirmLoan pipeline: the secondary "other" model first, then the
# main model on the main data plus the "other" samples.
# The save paths are passed to the training functions as empty strings.
save_path_other = ''
save_path_main = ''

each_model = 'ConfirmLoan'
other_model = train_other_model(clean_data_other[each_model],save_path_other,each_model)
    
df_main = clean_data_main[each_model].copy()
# The extra "other" class gets the next integer after the largest main label.
other_label = int(max(set(df_main.label)) + 1)
# Reuse the "other" samples: rename their text column to match the main data
# and overwrite their labels with the single "other" label.
ava_others = clean_data_other[each_model].rename({'text':'split_text'},axis=1).copy()
ava_others['label'] = other_label
df_main = pd.concat([df_main,ava_others],sort=True)
# Shuffle all rows with a fixed seed and renumber the index.
df_main = df_main.sample(frac=1,random_state=6).reset_index(drop=True)
print('=====  {} ======='.format(each_model))
print(df_main.label.value_counts())
clf = train_main_model(df_main,save_path_main,each_model,other_model)
print('\n')

fitting phrase
transform phrase
finish training others
saving to path: 
2    3300
0    1157
1     609
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
saving to path: 




In [10]:
# Smoke test: classify "no money" with the ConfirmLoan model.
# NOTE(review): the recorded NameError mentions `time_extract`, which the
# ConfirmLoan.classify defined in this notebook never references, so the
# output is stale. Rerun this cell.
clf.classify('没钱')

here2


NameError: name 'time_extract' is not defined

# Train WillingToPay

In [10]:
# Train the WillingToPay pipeline: the secondary "other" model first, then the
# main model on the main data plus the "other" samples.
# The save paths are passed to the training functions as empty strings.
save_path_other = ''
save_path_main = ''

each_model = 'WillingToPay'
other_model = train_other_model(clean_data_other[each_model],save_path_other,each_model)
    
df_main = clean_data_main[each_model].copy()
# The extra "other" class gets the next integer after the largest main label.
other_label = int(max(set(df_main.label)) + 1)
# Reuse the "other" samples: rename their text column to match the main data
# and overwrite their labels with the single "other" label.
ava_others = clean_data_other[each_model].rename({'text':'split_text'},axis=1).copy()
ava_others['label'] = other_label
df_main = pd.concat([df_main,ava_others],sort=True)
# Shuffle all rows with a fixed seed and renumber the index.
df_main = df_main.sample(frac=1,random_state=6).reset_index(drop=True)
print('=====  {} ======='.format(each_model))
print(df_main.label.value_counts())
clf = train_main_model(df_main,save_path_main,each_model,other_model)
print('\n')

fitting phrase
transform phrase
finish training others
saving to path: 
2    4839
1    1947
0     669
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
saving to path: 




In [11]:
# Sentence with two time expressions ("in 58 minutes" and "next year"):
# exercises the multi-time branch, which returns label 10 when the nearest
# time is within the lower bound ("tomorrow 5 pm").
clf.classify('过58分钟可以明年不行',lower_bounder='明天下午5点')

2018-09-10 19:51:40,491 - INFO - CLASS:EvlTimeExpEngineWillingToPay- METHOD:_set_timeZone -LINE:748 - MSG:Time Zone is set from ENV: Asia/Shanghai. Classifier: WillingToPay


{'label': 10,
 'av_pred': 1.0,
 'other_response': 0.0,
 'timeExtract': [{'pattern': '过58分钟',
   'time': datetime.datetime(2018, 9, 11, 4, 49, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>),
   'gapS': 3439.473664,
   'gapH': 0.9554093511111111,
   'gapD': 0.039808722962962964,
   'exp': '?y-?m-?d-?H:+58M:00S'},
  {'pattern': '明年',
   'time': datetime.datetime(2019, 9, 11, 3, 51, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>),
   'gapS': 31535959.473581,
   'gapH': 8759.98874266139,
   'gapD': 364.99953094422455,
   'exp': '^1y-?m-?d-?H:?M:00S'}],
 'ml_label': 1,
 'ifWillingToPay': 'confirmAgain'}

# Train Installment

In [12]:
# Train the Installment pipeline: the secondary "other" model first, then the
# main model on the main data plus the "other" samples.
# The save paths are passed to the training functions as empty strings.
save_path_other = ''
save_path_main = ''

each_model = 'Installment'
other_model = train_other_model(clean_data_other[each_model],save_path_other,each_model)
    
df_main = clean_data_main[each_model].copy()
# The extra "other" class gets the next integer after the largest main label.
other_label = int(max(set(df_main.label)) + 1)
# Reuse the "other" samples: rename their text column to match the main data
# and overwrite their labels with the single "other" label.
ava_others = clean_data_other[each_model].rename({'text':'split_text'},axis=1).copy()
ava_others['label'] = other_label
df_main = pd.concat([df_main,ava_others],sort=True)
# Shuffle all rows with a fixed seed and renumber the index.
df_main = df_main.sample(frac=1,random_state=6).reset_index(drop=True)
print('=====  {} ======='.format(each_model))
print(df_main.label.value_counts())
clf = train_main_model(df_main,save_path_main,each_model,other_model)
print('\n')

fitting phrase
transform phrase
finish training others
saving to path: 
2    4632
1    1368
0    1364
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
saving to path: 




In [13]:
# Same two-time sentence as in the WillingToPay test, now with the Installment model.
# NOTE(review): the result has label 10 but 'ifAcceptInstallment' is 'null',
# because Installment.label_meaning_map has no entry for 10.
clf.classify('过58分钟可以明年不行',lower_bounder='明天下午5点')

2018-09-10 19:51:45,681 - INFO - CLASS:EvlTimeExpEngineInstallment- METHOD:_set_timeZone -LINE:748 - MSG:Time Zone is set from ENV: Asia/Shanghai. Classifier: Installment


{'label': 10,
 'av_pred': 1.0,
 'other_response': 0.0,
 'timeExtract': [{'pattern': '过58分钟',
   'time': datetime.datetime(2018, 9, 11, 4, 49, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>),
   'gapS': 3434.315476,
   'gapH': 0.9539765211111112,
   'gapD': 0.03974902171296297,
   'exp': '?y-?m-?d-?H:+58M:00S'},
  {'pattern': '明年',
   'time': datetime.datetime(2019, 9, 11, 3, 51, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>),
   'gapS': 31535954.315328,
   'gapH': 8759.987309813332,
   'gapD': 364.99947124222217,
   'exp': '^1y-?m-?d-?H:?M:00S'}],
 'ml_label': 1,
 'ifAcceptInstallment': 'null'}

In [45]:
import re

# Experiment: a token pattern that, unlike the r'\w{1,}' used by the
# vectorizers above, also keeps the full-width punctuation ， 。 ？ as tokens.
token_pattern = r'\w{1,}|，|。|？'
re.compile(token_pattern).findall('我 是 他 朋友 ？')

['我', '是', '他', '朋友', '？']