In [8]:
#encoding=utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

import lightgbm as lgb
import pickle
import sys,os
from all_model_py import CutDebt, IDClassifier, IfKnowDebtor, Installment, WillingToPay, ConfirmLoan

sys.path.append('../../../Lib/')
from model_matrix import eval_mat

In [9]:
def sub_df(df, sets, target='label'):
    """Return the rows of ``df`` whose ``target`` value is one of ``sets``.

    Rows are grouped in the iteration order of ``sets`` (all rows matching the
    first value, then all rows matching the second, ...). Within a group the
    original row order and index labels of ``df`` are kept.

    Args:
        df: DataFrame to filter.
        sets: Iterable of values to keep in column ``target``.
        target: Name of the column to compare against. Defaults to ``'label'``.

    Returns:
        A new DataFrame holding the selected rows. When ``sets`` is empty the
        result is an empty frame that still carries the columns of ``df``.
    """
    # Collect every slice first and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies all accumulated rows on each pass.
    parts = [df[df[target] == each] for each in sets]
    if not parts:
        # pd.concat raises on an empty list; hand back an empty same-schema copy.
        return df.iloc[0:0].copy()
    return pd.concat(parts)

In [10]:
# Name of the label column, and the template for the saved-model directory
# (the '{}' placeholder is presumably filled with the model name -- confirm
# at the point of use).
target = 'label'
save_path = '../../savedModel/{}/'

# Shared "others" samples, plus the strategy matrix that is indexed by model
# name to decide which of those labels a given model may reuse.
others = pd.read_csv('../../data/others/cleaned_mock_up_data.csv')
other_matrix = pd.read_csv('../../data/others/strategy_mat.csv')

# CutDebt

In [13]:
model = 'CutDebt'
# One seed for the split and for every stochastic estimator below, so that
# re-running this cell reproduces the same evaluation tables.
SEED = 19

df = pd.read_csv('../../data/{}/cleaned_mock_up_data.csv'.format(model))
# The largest label id is treated as the "other" class: its original rows are
# dropped here and replaced below by samples drawn from the shared pool.
other_label = max(set(df.label))
df = df[df.label != other_label]

# Available "other" labels: those whose entry in this model's column of the
# strategy matrix is 0.
other_set = set(other_matrix[other_matrix[model] == 0].label.values)
ava_others = sub_df(others, other_set)
ava_others[target] = other_label
# The shared pool stores its text under 'text'; the model data uses 'split_text'.
ava_others = ava_others.rename({'text': 'split_text'}, axis=1)
df = pd.concat([df, ava_others], sort=True)

print('=====  {} ======='.format(model))
print(df.label.value_counts())
print('begin training!')
train, val = train_test_split(df, test_size=0.3, train_size=0.7, random_state=SEED)

# Word-level 1- to 3-gram TF-IDF features. The vocabulary and IDF weights are
# fitted on the training split only, then applied to both splits.
phrase_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    strip_accents='unicode',
    max_features=100000,
    analyzer='word',
    sublinear_tf=True,
    use_idf=True,
    norm='l2',
    token_pattern=r'\w{1,}',
)

print('fitting phrase')
phrase_vectorizer.fit(train.split_text)

print('transform phrase')
phrase_train = phrase_vectorizer.transform(train.split_text)
phrase_val = phrase_vectorizer.transform(val.split_text)

# Candidate classifiers. The names are kept at cell level so that other cells
# can still reach the fitted estimators.
l_svc = LinearSVC(C=1, random_state=SEED)
lsvc = CalibratedClassifierCV(l_svc)
log_r = LogisticRegression()
naive_b = MultinomialNB()
svm = SVC(kernel='linear')
# Seeded: an unseeded forest yields a different table on every run.
rf = RandomForestClassifier(random_state=SEED)

# Same protocol for every model: fit on the training features, predict the
# validation split, then print the matrix returned by eval_mat (presumably a
# confusion matrix with precision/recall margins -- see the printed output).
for clf_name, clf in (
    ('Linear SVC', lsvc),
    ('logistic', log_r),
    ('Naive Bayes', naive_b),
    ('SVM', svm),
    ('Random Forest', rf),
):
    clf.fit(phrase_train, train.label)
    val_pred = clf.predict(phrase_val)
    evl = eval_mat(val.label.values, val_pred)
    print('======== {} ======='.format(clf_name))
    print(evl)





0    4105
2    2819
1    2144
Name: label, dtype: int64
begin training!
fitting phrase
transform phrase
                pred_0      pred_1      pred_2    recall
actual_0   1087.000000   52.000000   99.000000  0.878029
actual_1     78.000000  500.000000   75.000000  0.765697
actual_2     95.000000   94.000000  641.000000  0.772289
precision     0.862698    0.773994    0.786503  0.818817
                pred_0      pred_1      pred_2    recall
actual_0   1072.000000   41.000000  125.000000  0.865913
actual_1    105.000000  458.000000   90.000000  0.701378
actual_2    104.000000   72.000000  654.000000  0.787952
precision     0.836846    0.802102    0.752589  0.802646
                pred_0      pred_1      pred_2    recall
actual_0   1147.000000   35.000000   56.000000  0.926494
actual_1    135.000000  454.000000   64.000000  0.695253
actual_2    176.000000   59.000000  595.000000  0.716867
precision     0.786694    0.828467    0.832168  0.807056
                pred_0      pred_1      p

In [None]:
======== Linear SVC =======
                pred_0      pred_1      pred_2    recall
actual_0   1092.000000   63.000000  104.000000  0.867355
actual_1     57.000000  496.000000   79.000000  0.784810
actual_2     88.000000   94.000000  648.000000  0.780723
precision     0.882781    0.759571    0.779783  0.821757
======== logistic =======
                pred_0      pred_1      pred_2    recall
actual_0   1093.000000   52.000000  114.000000  0.868149
actual_1     85.000000  457.000000   90.000000  0.723101
actual_2     95.000000   74.000000  661.000000  0.796386
precision     0.858602    0.783877    0.764162  0.812569
======== Naive Bayes =======
                pred_0      pred_1   pred_2    recall
actual_0   1163.000000   37.000000   59.000  0.923749
actual_1    124.000000  441.000000   67.000  0.697785
actual_2    180.000000   56.000000  594.000  0.715663
precision     0.792774    0.825843    0.825  0.807791
======== SVM =======
                pred_0     pred_1      pred_2    recall
actual_0   1093.000000   60.00000  106.000000  0.868149
actual_1     62.000000  492.00000   78.000000  0.778481
actual_2     93.000000   88.00000  649.000000  0.781928
precision     0.875801    0.76875    0.779112  0.821022
======== Random Forest =======
                pred_0      pred_1      pred_2    recall
actual_0   1148.000000   40.000000   71.000000  0.911835
actual_1    108.000000  435.000000   89.000000  0.688291
actual_2    192.000000   87.000000  551.000000  0.663855
precision     0.792818    0.774021    0.774965  0.784270

In [None]:
# params={'task':'train','objective':'multiclass','num_class':3,}

# train_set = lgb.Dataset(phrase_train,train.label.values)
# model = lgb.train(params=params,train_set=train_set)
# val_pred = model.predict(phrase_val)
# val_pred = np.argmax(val_pred,axis=1)
# evl = eval_mat(val.label.values, val_pred)
# print('======== lightgbm =======')
# print(evl)