In [1]:
#encoding=utf-8
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
# read files
path = '../../data/WillingToPay/'
data = pd.read_csv(path + 'cleaned_mock_up_data.csv', encoding='utf8')
print(data.shape)
data.head(10)

(4215, 2)


Unnamed: 0,label,split_text
0,3,早上好 。 。 挂断 电话
1,2,手里 有点 紧 你 还 少 吗 ？
2,2,可以 少 还 点 不
3,1,没有 钱 就是 金钱
4,3,我 开车 我 等 了 一会儿
5,2,它 可以 更 少 ？
6,1,就 不 还
7,0,对不起 延迟 。 等到 你 得到 它 。
8,3,什么 公司 ？ 多少 钱 ？ 再说 一遍
9,3,听 不到 你 说 的话


In [3]:
data.split_text.iloc[0]

'早上好 。   。 挂断 电话'

In [4]:
phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

print('fitting phrase')
phrase_vectorizer.fit(data.split_text)

print('transform phrase')
phrase = phrase_vectorizer.transform(data.split_text)

phrase

fitting phrase
transform phrase


<4215x12416 sparse matrix of type '<class 'numpy.float64'>'
	with 44311 stored elements in Compressed Sparse Row format>

# SVC

In [5]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

l_svc = LinearSVC()
clf = CalibratedClassifierCV(l_svc) 
clf.fit(phrase, data.label)
print(clf.score(phrase, data.label))

0.978410438909


In [6]:
print(clf.predict(phrase))

[3 2 2 ..., 3 1 0]


# logistic

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
log_r = LogisticRegression()
log_r.fit(phrase, data.label)
print(log_r.score(phrase, data.label))

0.949940688019


In [9]:
print(log_r.predict(phrase))

[3 2 2 ..., 3 1 0]


# LightGBM

In [10]:
import lightgbm as lgb

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data.label.values)
onelabels = le.transform(data.label.values)

In [12]:
onelabels

array([3, 2, 2, ..., 3, 1, 0])

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
multicoder = MultiLabelBinarizer()
lables = multicoder.fit_transform([data.label.values])

In [14]:
lables

array([[1, 1, 1, 1]])

In [15]:
data.label.values

array([3, 2, 2, ..., 3, 1, 0])

In [16]:
params = {
    'learning_rate': 0.2,
    'num_iterations':1000,
    'application': 'multiclassova',
    'num_class': 4,
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'multi_error',
    'data_random_seed': 2,
#     'bagging_fraction': 0.8,
#     'feature_fraction': 0.6,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'early_stopping_round':200
} 

# lgbm_train = lgb.Dataset(phrase, data.label)
lgbm_train = lgb.Dataset(phrase, onelabels)
lgbm_val = lgb.Dataset(phrase, onelabels)
lgbm_model = lgb.train(params,lgbm_train,valid_sets=lgbm_val, verbose_eval=5)



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.242467
[10]	valid_0's multi_error: 0.219929
[15]	valid_0's multi_error: 0.197628
[20]	valid_0's multi_error: 0.181495
[25]	valid_0's multi_error: 0.174852
[30]	valid_0's multi_error: 0.164176
[35]	valid_0's multi_error: 0.159905
[40]	valid_0's multi_error: 0.150652
[45]	valid_0's multi_error: 0.144958
[50]	valid_0's multi_error: 0.139027
[55]	valid_0's multi_error: 0.131673
[60]	valid_0's multi_error: 0.128826
[65]	valid_0's multi_error: 0.125741
[70]	valid_0's multi_error: 0.119336
[75]	valid_0's multi_error: 0.116726
[80]	valid_0's multi_error: 0.114116
[85]	valid_0's multi_error: 0.113405
[90]	valid_0's multi_error: 0.107948
[95]	valid_0's multi_error: 0.103677
[100]	valid_0's multi_error: 0.101779
[105]	valid_0's multi_error: 0.100593
[110]	valid_0's multi_error: 0.0986951
[115]	valid_0's multi_error: 0.0982206
[120]	valid_0's multi_error: 0.0951364
[125]	valid_0's multi_error: 0.0951364
[13

In [17]:
# print(lgbm_model.predict(phrase))

In [18]:
# lgbm_model.predict(phrase)

# Save

In [19]:
import pickle

# save tfidf
pickle.dump(phrase_vectorizer, open("../../savedModel/WillingToPay/tfidf.pickle", "wb"))
# pickle.dump(train_comment_features, open("train_comment_features.pickle", "wb"))
# pickle.dump(test_comment_features, open("test_comment_features.pickle", "wb"))

# save linear svc
pickle.dump(clf, open("../../savedModel/WillingToPay/LinearSVC.pickle", "wb"))
# save logistic
pickle.dump(log_r, open("../../savedModel/WillingToPay/Logistic.pickle", "wb"))
# save lightGBM
pickle.dump(lgbm_model, open("../../savedModel/WillingToPay/Lgbm.pickle", "wb"))

# Test

In [36]:
import jieba
sentence = '天天催，给你说了不行'
sentence = jieba.cut(sentence, cut_all = False)
sentence = ' '.join(sentence)
test = phrase_vectorizer.transform([sentence])
# test = phrase_vectorizer.transform(['我 在 洗澡'])


In [37]:
clf.predict_proba(test) # linear svc

array([[ 0.19754589,  0.1176852 ,  0.00875102,  0.6760179 ]])

In [38]:
log_r.predict_proba(test) # logistic

array([[ 0.1637917 ,  0.26416769,  0.06534159,  0.50669901]])

In [39]:
lgbm_model.predict(test) # light gbm

array([[  1.32569881e-02,   1.49850602e-01,   3.52224772e-04,
          9.30175295e-01]])

# labeling

In [24]:
# basic logic: find the max probability of 3 models, if it is larger than threshold, return the corresponding label, otherwise, return 2 (others).
result = np.vstack((clf.predict_proba(test),log_r.predict_proba(test),lgbm_model.predict(test)))
pos = np.where(result == np.max(result))

threshold = 0.7
if np.max(result)<threshold:
    label = 2
else:
    label = pos[1]
    label = label[0]
    
print('label=',label)
print('prob=',np.max(result))

label= 1
prob= 0.980752491219


# save the whole thing

In [25]:
# import jieba
# import numpy as np

# class WillingToPay:
    
#     def __init__(self, **model):
#         """
#         suggested parameters:
#         svc, logistic, lightgbm, jieba_path,tfidf
#         """
#         self._load_model(**model)
        
#     def _load_model(self,**model):
#         self.svc = model.get('svc')
#         self.logistic = model.get('logistic')
#         self.lightgbm = model.get('lightgbm')
#         self.tfidf = model.get('tfidf')
#         # load jieba
#         jieba_path = model.get('jieba_path')
#         if jieba_path is not None:
#             jieba.load_userdict(jieba_path)
        
        
#     def classify(self, sentence):
#         sentence = jieba.cut(sentence, cut_all = False)
#         sentence = ' '.join(sentence)
#         matrix = self.tfidf.transform([sentence])
#         result = np.vstack((self.svc.predict_proba(matrix),
#                             self.logistic.predict_proba(matrix),
#                             self.lightgbm.predict(matrix)))
#         max_pred = np.max(result, axis=0)
#         max_arg = np.argmax(max_pred)
#         threshold = 0.6
#         if np.max(max_pred)<threshold:
#             label = 3
#         else:
#             label = max_arg
#         return (label, np.max(max_pred))
     
        

In [26]:
%load_ext autoreload
%autoreload 2

In [27]:
from WillingToPay_py import WillingToPay
ifk = WillingToPay(svc=clf, logistic=log_r, lightgbm=lgbm_model, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt')

pickle.dump(ifk, open("../../savedModel/WillingToPay/WillingToPay.pickle", "wb"))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.454 seconds.
Prefix dict has been built succesfully.
