In [1]:
#encoding=utf-8
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
# read files
path = '../../data/WillingToPay/'
data = pd.read_csv(path + 'cleaned_mock_up_data.csv', encoding='utf8')
print(data.shape)
data.head(10)

(17015, 2)


Unnamed: 0,label,split_text
0,3,40 分钟 后
1,1,舞弊
2,1,他 不再 说 了
3,1,过 了 一段时间 后 再说
4,1,在 月底
5,3,在 开会
6,0,我 等 下
7,1,你 犯 了 一个 错误
8,2,你 从 哪里 来
9,1,我 借 了


In [3]:
data.split_text.iloc[0]

'40 分钟 后'

In [4]:
phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

print('fitting phrase')
phrase_vectorizer.fit(data.split_text)

print('transform phrase')
phrase = phrase_vectorizer.transform(data.split_text)

phrase

fitting phrase
transform phrase


<17015x28155 sparse matrix of type '<class 'numpy.float64'>'
	with 149143 stored elements in Compressed Sparse Row format>

# SVC

In [5]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

l_svc = LinearSVC()
clf = CalibratedClassifierCV(l_svc) 
clf.fit(phrase, data.label)
print(clf.score(phrase, data.label))

0.870349691449


In [6]:
print(clf.predict(phrase))

[3 1 1 ..., 1 1 3]


# logistic

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
log_r = LogisticRegression()
log_r.fit(phrase, data.label)
print(log_r.score(phrase, data.label))

0.826153394064


In [9]:
print(log_r.predict(phrase))

[3 1 1 ..., 1 1 3]


# LightGBM

In [10]:
import lightgbm as lgb

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data.label.values)
onelabels = le.transform(data.label.values)

In [12]:
onelabels

array([3, 1, 1, ..., 1, 1, 3])

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
multicoder = MultiLabelBinarizer()
lables = multicoder.fit_transform([data.label.values])

In [14]:
lables

array([[1, 1, 1, 1]])

In [15]:
data.label.values

array([3, 1, 1, ..., 1, 1, 3])

In [16]:
params = {
    'learning_rate': 0.2,
    'num_iterations':1000,
    'application': 'multiclassova',
    'num_class': 4,
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'multi_error',
    'data_random_seed': 2,
#     'bagging_fraction': 0.8,
#     'feature_fraction': 0.6,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'early_stopping_round':200
} 

# lgbm_train = lgb.Dataset(phrase, data.label)
lgbm_train = lgb.Dataset(phrase, onelabels)
lgbm_val = lgb.Dataset(phrase, onelabels)
lgbm_model = lgb.train(params,lgbm_train,valid_sets=lgbm_val, verbose_eval=5)



Training until validation scores don't improve for 200 rounds.
[5]	valid_0's multi_error: 0.362798
[10]	valid_0's multi_error: 0.345871
[15]	valid_0's multi_error: 0.334352
[20]	valid_0's multi_error: 0.321716
[25]	valid_0's multi_error: 0.310961
[30]	valid_0's multi_error: 0.298795
[35]	valid_0's multi_error: 0.29139
[40]	valid_0's multi_error: 0.283926
[45]	valid_0's multi_error: 0.277814
[50]	valid_0's multi_error: 0.271878
[55]	valid_0's multi_error: 0.265413
[60]	valid_0's multi_error: 0.261593
[65]	valid_0's multi_error: 0.256538
[70]	valid_0's multi_error: 0.251602
[75]	valid_0's multi_error: 0.248604
[80]	valid_0's multi_error: 0.244549
[85]	valid_0's multi_error: 0.242492
[90]	valid_0's multi_error: 0.239024
[95]	valid_0's multi_error: 0.236791
[100]	valid_0's multi_error: 0.23444
[105]	valid_0's multi_error: 0.232031
[110]	valid_0's multi_error: 0.229738
[115]	valid_0's multi_error: 0.226565
[120]	valid_0's multi_error: 0.224567
[125]	valid_0's multi_error: 0.222274
[130]	val

In [17]:
# print(lgbm_model.predict(phrase))

In [18]:
# lgbm_model.predict(phrase)

# Save

In [19]:
import pickle

# save tfidf
pickle.dump(phrase_vectorizer, open("../../savedModel/WillingToPay/tfidf.pickle", "wb"))
# pickle.dump(train_comment_features, open("train_comment_features.pickle", "wb"))
# pickle.dump(test_comment_features, open("test_comment_features.pickle", "wb"))

# save linear svc
pickle.dump(clf, open("../../savedModel/WillingToPay/LinearSVC.pickle", "wb"))
# save logistic
pickle.dump(log_r, open("../../savedModel/WillingToPay/Logistic.pickle", "wb"))
# save lightGBM
pickle.dump(lgbm_model, open("../../savedModel/WillingToPay/Lgbm.pickle", "wb"))

# Test

In [20]:
phrase.data

array([ 0.272501  ,  0.43671395,  0.40356772, ...,  0.49316411,
        0.16823195,  0.32154549])

In [17]:
import jieba
sentence = 'm'
sentence = jieba.cut(sentence, cut_all = False)

In [23]:
import jieba
sentence = '可以少还一些吗'
sentence = jieba.cut(sentence, cut_all = False)
sentence = ' '.join(sentence)
test = phrase_vectorizer.transform([sentence])
# test = phrase_vectorizer.transform(['我 在 洗澡'])


In [24]:
clf.predict_proba(test) # linear svc

array([[ 0.02758168,  0.0178142 ,  0.93583117,  0.01877295]])

In [25]:
log_r.predict_proba(test) # logistic

array([[ 0.0839188 ,  0.02898682,  0.8559837 ,  0.03111067]])

In [26]:
lgbm_model.predict(test) # light gbm

array([[  8.14762223e-03,   4.99286239e-04,   9.90118024e-01,
          2.73465392e-05]])

# labeling

In [24]:
# basic logic: find the max probability of 3 models, if it is larger than threshold, return the corresponding label, otherwise, return 2 (others).
result = np.vstack((clf.predict_proba(test),log_r.predict_proba(test),lgbm_model.predict(test)))
pos = np.where(result == np.max(result))

threshold = 0.7
if np.max(result)<threshold:
    label = 2
else:
    label = pos[1]
    label = label[0]
    
print('label=',label)
print('prob=',np.max(result))

label= 1
prob= 0.980752491219


# save the whole thing

In [21]:
# import jieba
# import numpy as np

# class WillingToPay:
    
#     def __init__(self, **model):
#         """
#         suggested parameters:
#         svc, logistic, lightgbm, jieba_path,tfidf
#         """
#         self._load_model(**model)
        
#     def _load_model(self,**model):
#         self.svc = model.get('svc')
#         self.logistic = model.get('logistic')
#         self.lightgbm = model.get('lightgbm')
#         self.tfidf = model.get('tfidf')
#         # load jieba
#         jieba_path = model.get('jieba_path')
#         if jieba_path is not None:
#             jieba.load_userdict(jieba_path)
        
        
#     def classify(self, sentence):
#         sentence = jieba.cut(sentence, cut_all = False)
#         sentence = ' '.join(sentence)
#         matrix = self.tfidf.transform([sentence])
#         if len(matrix.data) > 0:
#             result = np.vstack((self.svc.predict_proba(matrix),
#                                 self.logistic.predict_proba(matrix),
#                                 self.lightgbm.predict(matrix)))
#         else:
#             result = np.vstack((self.svc.predict_proba(matrix),
#                                 self.logistic.predict_proba(matrix),
#                                 ))
#         max_pred = np.max(result, axis=0)
#         max_arg = np.argmax(max_pred)
#         threshold = 0.5
#         if np.max(max_pred)<threshold:
#             label = 3
#         else:
#             label = max_arg
#         return (label, [max_arg,np.max(max_pred)])
     
        

In [25]:
import pickle
clf = pickle.load(open("../../savedModel/WillingToPay/LinearSVC.pickle", "rb"))
lgbm_model = pickle.load(open("../../savedModel/WillingToPay/Lgbm.pickle", "rb"))
phrase_vectorizer = pickle.load(open("../../savedModel/WillingToPay/tfidf.pickle", "rb"))
log_r = pickle.load(open("../../savedModel/WillingToPay/Logistic.pickle", "rb"))

In [26]:
from WillingToPay_py import WillingToPay
ifk = WillingToPay(svc=clf, logistic=log_r, lightgbm=lgbm_model, tfidf=phrase_vectorizer,) #jieba_path='../WordCut/userdict.txt')

pickle.dump(ifk, open("../../savedModel/WillingToPay/WillingToPay.pickle", "wb"))