In [2]:
#encoding=utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

import lightgbm as lgb
import pickle
import sys,os
sys.path.append('../../MLModel/code/OneClickTraining/')
from all_model_py import CutDebt, IDClassifier, IfKnowDebtor, Installment, WillingToPay, ConfirmLoan

sys.path.append('../../Lib/')
from model_matrix import eval_mat
from SUPPORT import balance_category

In [3]:
def sub_df(df,sets,target='label'):
    """Return the rows of ``df`` whose ``target`` value is one of ``sets``.

    Rows come back grouped in the iteration order of ``sets`` (every row equal
    to the first value, then every row equal to the second, ...). Within each
    group the rows keep their original relative order and index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; must contain the column named by ``target``.
    sets : iterable
        Values of ``target`` to keep. A value that occurs in no row simply
        contributes no rows.
    target : str, default 'label'
        Name of the column compared against each value.

    Returns
    -------
    pandas.DataFrame
        The concatenated matching slices, or an empty ``pd.DataFrame()`` when
        ``sets`` itself is empty.
    """
    # Build every slice first and concatenate once. Concatenating inside the
    # loop re-copies all rows gathered so far on each iteration, and seeding
    # the result with an empty DataFrame relies on pandas ignoring empty
    # frames when it determines the result dtypes.
    parts = [df[df[target]==each] for each in sets]
    if not parts:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(parts)

In [4]:
# Name of the class-id column; used when relabelling the "others" rows below.
target = 'label'
# Format template with one placeholder. It is not used in the cells shown here;
# presumably the placeholder takes a model name -- TODO confirm.
save_path = '../../MLModel/savedModel/{}/'

# Table with a `label` column and one column per model name; the training cell
# selects the labels whose entry in the chosen model's column is 0.
other_matrix = pd.read_csv('../../MLModel/data/others/strategy_mat.csv')
# Shared pool of extra samples (with `label` and `text` columns) that the
# training cell filters by label and appends to a model's own data.
others = pd.read_csv('../../MLModel/data/others/cleaned_mock_up_data.csv')

# CutDebt

In [11]:

# Lookup from model name to the class imported from all_model_py.
# Insertion order matches the original literal.
model_list = dict(
    CutDebt=CutDebt,
    IDClassifier=IDClassifier,
    IfKnowDebtor=IfKnowDebtor,
    Installment=Installment,
    ConfirmLoan=ConfirmLoan,
    WillingToPay=WillingToPay,
)

In [15]:
# Name of the dataset to train on: it selects the CSV to load and the column of
# `other_matrix` consulted below.
model = 'ConfirmLoan'
df = pd.read_csv('../../MLModel/data/{}/cleaned_mock_up_data.csv'.format(model))

# The extra "other" class takes the next integer after the largest label
# already present in the data.
other_label = int(max(set(df.label)) + 1)

# Get the available "other" labels: those whose entry in this model's column of
# `other_matrix` is 0 (presumably labels that do not overlap with this model's
# own classes -- TODO confirm). All of their rows are collapsed into the single
# `other_label` class.
other_set = set(other_matrix[other_matrix[model]==0].label.values)
ava_others = sub_df(others,other_set)
ava_others[target] = other_label
# Move the text into 'split_text', the column that is vectorised below.
ava_others = ava_others.rename({'text':'split_text'},axis=1)
df = pd.concat([df,ava_others],sort=True)
# balance_category is a project helper from SUPPORT; presumably it evens out
# the class sizes -- see its definition.
df = balance_category(df,target='label')
# Shuffle the rows with a fixed seed so the cell is repeatable.
df = df.sample(frac=1,random_state=21).reset_index(drop=True)

print('=====  {} ======='.format(model))
print(df.label.value_counts())
print('begin training!')
train,val = train_test_split(df,test_size=0.3,train_size=0.7,random_state=19)

# TF-IDF over word 1- to 3-grams, fitted on the training split only so the
# validation rows do not influence the vocabulary or the IDF weights.
phrase_vectorizer = TfidfVectorizer(
                                ngram_range=(1,3),
                                strip_accents='unicode',
                                max_features=100000,
                                analyzer='word',
                                sublinear_tf=True,
                                use_idf=True,
                                norm='l2',
                                token_pattern=r'\w{1,}')

print('fitting phrase')
phrase_vectorizer.fit(train.split_text)

print('transform phrase')
phrase_train = phrase_vectorizer.transform(train.split_text)
phrase_val = phrase_vectorizer.transform(val.split_text)

# Candidate classifiers. The two estimators that draw random numbers are seeded
# so that re-running the cell reproduces the same tables, in line with the
# seeded shuffle and split above.
l_svc = LinearSVC(C=1, random_state=19)
lsvc = CalibratedClassifierCV(l_svc)
log_r = LogisticRegression()
naive_b = MultinomialNB()
svm = SVC(kernel='linear')
rf = RandomForestClassifier(random_state=19)

classifiers = [
    ('Linear SVC', lsvc),
    ('logistic', log_r),
    ('Naive Bayes', naive_b),
    ('SVM', svm),
    ('Random Forest', rf),
]

# Fit each candidate on the training features and print the eval_mat result for
# the validation split. The header is printed first so that progress, and any
# warnings raised during fitting, show up under the right model.
for title, clf in classifiers:
    print('======== {} ======='.format(title))
    clf.fit(phrase_train, train.label)
    val_pred = clf.predict(phrase_val)
    evl = eval_mat(val.label.values, val_pred)
    print(evl)





2    4280
0    4278
1    3836
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
                pred_0      pred_1       pred_2    recall
actual_0   1258.000000     0.00000     5.000000  0.996041
actual_1      0.000000  1129.00000     8.000000  0.992964
actual_2     38.000000    39.00000  1242.000000  0.941622
precision     0.970679     0.96661     0.989641  0.975800
                pred_0      pred_1       pred_2    recall
actual_0   1241.000000     4.00000    18.000000  0.982581
actual_1      0.000000  1117.00000    20.000000  0.982410
actual_2     72.000000    55.00000  1192.000000  0.903715
precision     0.945164     0.94983     0.969106  0.954558
                pred_0       pred_1       pred_2    recall
actual_0   1240.000000    17.000000     6.000000  0.981789
actual_1      0.000000  1124.000000    13.000000  0.988566
actual_2    105.000000   103.000000  1111.000000  0.842305
precision     0.921933     0.903537     0.983186  0.934391
                pred_

In [None]:
======== Linear SVC =======
               pred_0      pred_1      pred_2    recall
actual_0   192.000000   10.000000    11.00000  0.901408
actual_1     6.000000  116.000000    26.00000  0.783784
actual_2    49.000000   22.000000  1231.00000  0.945469
precision    0.777328    0.783784     0.97082  0.925436
======== logistic =======
               pred_0     pred_1      pred_2    recall
actual_0   141.000000   8.000000    64.00000  0.661972
actual_1     6.000000  69.000000    73.00000  0.466216
actual_2    16.000000   8.000000  1278.00000  0.981567
precision    0.865031   0.811765     0.90318  0.894768
======== Naive Bayes =======
           pred_0  pred_1       pred_2    recall
actual_0     78.0    2.00   133.000000  0.366197
actual_1      0.0   47.00   101.000000  0.317568
actual_2      0.0    1.00  1301.000000  0.999232
precision     1.0    0.94     0.847557  0.857486
======== SVM =======
               pred_0      pred_1       pred_2    recall
actual_0   185.000000    9.000000    19.000000  0.868545
actual_1     5.000000  107.000000    36.000000  0.722973
actual_2    41.000000   15.000000  1246.000000  0.956989
precision    0.800866    0.816794     0.957725  0.924835
======== Random Forest =======
           pred_0     pred_1       pred_2    recall
actual_0   162.00   6.000000    45.000000  0.760563
actual_1     9.00  93.000000    46.000000  0.628378
actual_2    29.00  12.000000  1261.000000  0.968510
precision    0.81   0.837838     0.932692  0.911606

In [None]:
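# Disabled LightGBM experiment, kept for reference only. If it is re-enabled,
# note that it rebinds `model` (the dataset-name string set in the training
# cell) to the trained booster, and that num_class is hard-coded to 3.
# params={'task':'train','objective':'multiclass','num_class':3,}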

# train_set = lgb.Dataset(phrase_train,train.label.values)
# model = lgb.train(params=params,train_set=train_set)
# val_pred = model.predict(phrase_val)
# val_pred = np.argmax(val_pred,axis=1)
# evl = eval_mat(val.label.values, val_pred)
# print('======== lightgbm =======')
# print(evl)

In [21]:
# Scratch check that int() parses a digit string; nothing else in the notebook uses it.
int('6006')

6006