In [1]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB


import pickle
import sys,os
from all_model_py import CutDebt, IDClassifier, IfKnowDebtor, Installment, WillingToPay, ConfirmLoan
sys.path.append('../../../Lib/')
from SUPPORT import balance_category

# load data

In [2]:
def get_other_data(df_non109,df_109,strategy_mat,classifier):
    """Collect the rows that should be treated as "other" for one classifier.

    Parameters
    ----------
    df_non109 : pandas.DataFrame
        Samples with a ``label`` column; a row is kept when its label is one
        of the labels flagged ``0`` for ``classifier`` in ``strategy_mat``.
    df_109 : pandas.DataFrame
        Samples carrying one column per classifier; a row is kept when its
        ``classifier`` column equals ``0``.
    strategy_mat : pandas.DataFrame
        Table with a ``label`` column and one column per classifier.
    classifier : str
        Column name shared by ``strategy_mat`` and ``df_109``.

    Returns
    -------
    pandas.DataFrame
        The kept rows of ``df_non109`` followed by the kept rows of
        ``df_109``, with a fresh 0..n-1 index and columns sorted by name.
    """
    # Labels whose strategy flag for this classifier is 0.
    possible_label = strategy_mat.loc[strategy_mat[classifier] == 0, 'label'].unique()
    # Vectorised membership test: always produces a boolean mask and avoids a
    # Python-level scan of the label list for every row.
    train_data_non109 = df_non109[df_non109['label'].isin(possible_label)]
    train_data_109 = df_109[df_109[classifier] == 0]
    data = pd.concat([train_data_non109, train_data_109], ignore_index=True, sort=True)
    return data

# Registry of classifier name -> wrapper class imported from all_model_py.
# The name is also used as the data / savedModel sub-directory and as the
# column looked up in the strategy matrix.
model_list = {'CutDebt':CutDebt,
              'IDClassifier':IDClassifier,
              'IfKnowDebtor':IfKnowDebtor,
              'Installment':Installment,
              'ConfirmLoan':ConfirmLoan,
              'WillingToPay':WillingToPay}

# Shared inputs used to build the extra "other" class for every classifier.
others_non109 = pd.read_csv('../../data/others/cleaned_mock_up_data_non109.csv')
others_109 = pd.read_csv('../../data/others/cleaned_mock_up_data_109.csv')
other_matrix = pd.read_csv('../../data/others/strategy_mat.csv')
target = 'label'
save_path = '../../savedModel/{}/'

for model, model_cls in model_list.items():

    df = pd.read_csv('../../data/{}/cleaned_mock_up_data.csv'.format(model))
    # The "other" class gets the next integer after the largest existing label.
    other_label = int(df.label.max() + 1)
    print('{} :other label is {}'.format(model,other_label))

    # Get the available "other" samples for this classifier, keep only the
    # text/label columns, align the text column name with the main data and
    # relabel every row as `other_label`. `.assign` builds a new frame instead
    # of writing into a slice of the helper's result.
    ava_others = (
        get_other_data(others_non109, others_109, other_matrix, model)[['text', 'label']]
        .rename(columns={'text': 'split_text'})
        .assign(label=other_label)
    )
    df = pd.concat([df, ava_others], sort=True)

    # Deterministic shuffle (fixed seed) with a clean index.
    df = df.sample(frac=1,random_state=6).reset_index(drop=True)
    print('=====  {} ======='.format(model))
    print(df.label.value_counts())

#     #################################### enable balancing (disabled)
#     print('enable balancing')
#     df = balance_category(df,target='label')
#     df = df.sample(frac=1).reset_index(drop=True)
#     print(df.label.value_counts())
#     print('!!!!!!!!!!!!!!!!!!')


    print('begin training!')

    # TF-IDF over word 1- to 3-grams; the text is split on \w+ tokens.
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode',
                                    max_features=100000,
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(df.split_text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(df.split_text)

    # Linear SVC wrapped in a calibrator so it can return probabilities.
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, df.label)

    # Logistic regression
    log_r = LogisticRegression()
    log_r.fit(phrase, df.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, df.label)
    print('finish training')

    # Load the previously saved "other" sub-model for this classifier.
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    other_path = '../../savedModel/others/{}/{}_other.pickle'.format(model,model)
    with open(other_path, 'rb') as other_file:
        other = pickle.load(other_file)

    # Bundle the three fitted estimators, the vectorizer and the "other"
    # sub-model into the project wrapper and persist it. The context manager
    # guarantees the file is flushed and closed before any later cell reads it.
    result = model_cls(svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, other=other,  jieba_path='../WordCut/userdict.txt')
    with open(save_path.format(model) + model + '.pickle', "wb") as out_file:
        pickle.dump(result, out_file)
    
    


CutDebt :other label is 2
2    4698
0    3623
1    2327
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


finish training


Loading model cost 0.751 seconds.
Prefix dict has been built succesfully.


Time Zone is set from ENV: Asia/Shanghai
IDClassifier :other label is 2
2    3486
1    1234
0    1133
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
IfKnowDebtor :other label is 2
2    3468
0    1363
1    1184
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
Installment :other label is 2
2    4703
0    3623
1    2379
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai
ConfirmLoan :other label is 2
2    3345
0     713
1     548
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai
WillingToPay :other label is 3
3    4576
1    2620
0    1778
2    1038
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
finish training
Time Zone is set from ENV: Asia/Shanghai


In [3]:
def _load_saved_model(name):
    """Unpickle ``../../savedModel/<name>/<name>.pickle`` and return the object.

    The file is opened in a ``with`` block so the handle is always closed.
    NOTE: pickle can execute arbitrary code on load; only use trusted files.
    """
    with open("../../savedModel/{0}/{0}.pickle".format(name), 'rb') as model_file:
        return pickle.load(model_file)


# Reload every persisted classifier. The optional warm_up() calls are kept
# disabled, exactly as in the original cell.
idc = _load_saved_model('IDClassifier')
# idc.warm_up()
cutd = _load_saved_model('CutDebt')
# cutd.warm_up()
ifk = _load_saved_model('IfKnowDebtor')
# ifk.warm_up()
will = _load_saved_model('WillingToPay')
# will.warm_up()
inst = _load_saved_model('Installment')
# inst.warm_up()
conf = _load_saved_model('ConfirmLoan')
# conf.warm_up()

In [4]:
import jieba
# Register the project's custom dictionary with jieba before segmenting text.
# This is the same path that is passed as `jieba_path` to the model wrappers
# in the training cell.
jieba_path='../WordCut/userdict.txt'
jieba.load_userdict(jieba_path)


In [5]:
# Segment a sample utterance with jieba (cut_all=False) and join the tokens
# with single spaces; the bare last expression displays the result.
tokens = jieba.cut('我草', cut_all = False)
sentence = ' '.join(tokens)
sentence

'我草'

In [6]:
# Smoke-test the reloaded IDClassifier on an off-topic utterance. In the
# recorded run the "other" sub-classifier was consulted and the final label
# returned was 109.
idc.classify('我日你大爷')

2018-07-17 19:25:31,555 - DEBUG - CLASS:IDClassifier- METHOD:classify -LINE:91 - MSG:In transfered tfidf, the number of words in vocalbulary is: 7
2018-07-17 19:25:31,564 - DEBUG - CLASS:IDClassifier_other- METHOD:classify -LINE:54 - MSG:In transfered tfidf, the number of words in vocalbulary is: 7
2018-07-17 19:25:31,569 - DEBUG - CLASS:IDClassifier_other- METHOD:classify -LINE:68 - MSG:Possible labels are: [101, 103, 104, 107, 109]
2018-07-17 19:25:31,570 - DEBUG - CLASS:IDClassifier_other- METHOD:classify -LINE:69 - MSG:Other- Final Pred label is: 109
2018-07-17 19:25:31,572 - DEBUG - CLASS:IDClassifier_other- METHOD:classify -LINE:70 - MSG:Other- svc,logistic,nb result:
 [[0.01229204 0.00878715 0.03084999 0.02068768 0.92738315]
 [0.04567845 0.07070934 0.11218043 0.08641978 0.68501199]
 [0.02678163 0.0659326  0.07371182 0.04837054 0.78520341]]
2018-07-17 19:25:31,575 - DEBUG - CLASS:IDClassifier_other- METHOD:classify -LINE:71 - MSG:Other- ave result:
 [0.0282507  0.04847636 0.07224

{'label': 109, 'pred_prob': array([[0.02966849, 0.06300469, 0.90732683],
        [0.08315464, 0.12056887, 0.79627649],
        [0.07769235, 0.16328557, 0.75902208]]), 'av_pred': array([0.06350516, 0.11561971, 0.82087513])}

In [7]:
# Smoke-test the reloaded IfKnowDebtor model on a short off-topic utterance.
# In the recorded run it also resolved to label 109 via the "other" sub-classifier.
ifk.classify('我日')

2018-07-17 19:25:35,739 - DEBUG - CLASS:IfKnowDebtor- METHOD:classify -LINE:133 - MSG:In transfered tfidf, the number of words in vocalbulary is: 2
2018-07-17 19:25:35,748 - DEBUG - CLASS:IfKnowDebtor_other- METHOD:classify -LINE:54 - MSG:In transfered tfidf, the number of words in vocalbulary is: 2
2018-07-17 19:25:35,757 - DEBUG - CLASS:IfKnowDebtor_other- METHOD:classify -LINE:68 - MSG:Possible labels are: [101, 103, 104, 107, 109]
2018-07-17 19:25:35,763 - DEBUG - CLASS:IfKnowDebtor_other- METHOD:classify -LINE:69 - MSG:Other- Final Pred label is: 109
2018-07-17 19:25:35,766 - DEBUG - CLASS:IfKnowDebtor_other- METHOD:classify -LINE:70 - MSG:Other- svc,logistic,nb result:
 [[0.0129794  0.00927091 0.17834439 0.02584743 0.77355787]
 [0.04054026 0.06677474 0.2481428  0.06135361 0.58318859]
 [0.03837694 0.11028452 0.16092267 0.07497266 0.61544321]]
2018-07-17 19:25:35,770 - DEBUG - CLASS:IfKnowDebtor_other- METHOD:classify -LINE:71 - MSG:Other- ave result:
 [0.0306322  0.06211006 0.1958

{'label': 109, 'pred_prob': array([[0.0878981 , 0.03684717, 0.87525473],
        [0.18791168, 0.06799357, 0.74409475],
        [0.17459152, 0.09840418, 0.72700431]]), 'av_pred': array([0.15013377, 0.06774831, 0.78211793])}

In [8]:
# Smoke-test ConfirmLoan. The returned dict carries 'label', 'pred_prob',
# 'av_pred' and 'time_extract' (empty in the recorded run, as no time was found).
conf.classify('我只借过5000')

2018-07-17 19:25:36,727 - DEBUG - CLASS:ConfirmLoan- METHOD:_ext_time -LINE:55 - MSG:No time was extracted!
2018-07-17 19:25:36,737 - DEBUG - CLASS:ConfirmLoan- METHOD:classify -LINE:183 - MSG:In transfered tfidf, the number of words in vocalbulary is: 7
2018-07-17 19:25:36,741 - DEBUG - CLASS:ConfirmLoan- METHOD:classify -LINE:205 - MSG:Final Pred label is: 1
2018-07-17 19:25:36,745 - DEBUG - CLASS:ConfirmLoan- METHOD:classify -LINE:206 - MSG:svc,logistic,nb result:
 [[0.00418647 0.85258438 0.14322914]
 [0.0697489  0.39089623 0.53935487]
 [0.09625575 0.56770718 0.33603707]]
2018-07-17 19:25:36,749 - DEBUG - CLASS:ConfirmLoan- METHOD:classify -LINE:207 - MSG:ave result:
 [0.05673038 0.60372926 0.33954036]


{'label': 1, 'pred_prob': array([[0.00418647, 0.85258438, 0.14322914],
        [0.0697489 , 0.39089623, 0.53935487],
        [0.09625575, 0.56770718, 0.33603707]]), 'av_pred': array([0.05673038, 0.60372926, 0.33954036]), 'time_extract': []}

In [10]:
# Smoke-test WillingToPay with two time expressions in one utterance. In the
# recorded run both were extracted and the label was forced to 10 because the
# smallest gap was below the lower bound.
# NOTE(review): the recorded datetimes carry an America/New_York tzinfo while
# the training log says the time zone was set to Asia/Shanghai; confirm which
# zone the time extraction is supposed to use.
will.classify('我下个星期三还不行，今天下午可以')

2018-07-17 19:25:54,601 - DEBUG - CLASS:WillingToPay- METHOD:_ext_time -LINE:58 - MSG:More than 2 times were extracted!
2018-07-17 19:25:54,603 - DEBUG - CLASS:WillingToPay- METHOD:classify -LINE:248 - MSG:There are more than 1 time extracted. And the min -0.43183358 is shorter than lower bounder! The output label is set to 10!


{'label': 10,
 'pred_prob': 1.0,
 'av_pred': 1.0,
 'time_extract': [{'pattern': '下个星期三',
   'time': datetime.datetime(2018, 7, 25, 19, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
   'gapS': 689645.399112,
   'gapH': 191.56816642},
  {'pattern': '今天下午',
   'time': datetime.datetime(2018, 7, 17, 19, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
   'gapS': -1554.600888,
   'gapH': -0.43183358}]}

In [18]:
import jieba
import numpy as np
import sys,os
# Locations of sibling resources, relative to the notebook's working directory.
tpattern_path = '../TimePattern/'
others_pth = '../Others/'
ENV_PATH = '../../../ENV/'
LOG_PATH = '../../../Lib/'

# Make the time_pattern module importable. The guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if tpattern_path not in sys.path:
    sys.path.append(tpattern_path)
from  time_pattern import TimePattern

# Build the extractor from the pattern mapping file next to the module.
t = TimePattern(pattern_path=tpattern_path+'mapping.csv')

In [21]:
# Probe the extractor with a compound expression.
# NOTE(review): in the recorded output this is split into two independent
# matches ('下周' and '周三') instead of a single date; confirm whether that is
# the intended behaviour.
t.process('下周周三')

[{'pattern': '下周',
  'time': datetime.datetime(2018, 7, 24, 16, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
  'gapS': 595674.080177,
  'gapH': 165.4650222713889},
 {'pattern': '周三',
  'time': datetime.datetime(2018, 7, 18, 16, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
  'gapS': 77274.080177,
  'gapH': 21.46502227138889}]

In [20]:
# Baseline probe: a single weekday expression yields one match in the recorded output.
t.process('周三')

[{'pattern': '周三',
  'time': datetime.datetime(2018, 7, 18, 16, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
  'gapS': 77279.756818,
  'gapH': 21.46659911611111}]