In [2]:
import xlrd
from collections import OrderedDict
import json

In [3]:
import os.path
import numpy as np
import pandas as pd

In [4]:
import jieba
import jieba.analyse
import cPickle

In [5]:
import re

In [5]:
jieba.set_dictionary("dict.txt.big")
stop_words_path = "stop_words.txt"
jieba.analyse.set_stop_words(stop_words_path)

In [8]:
# excel to json
def excel_to_json(excel_file_name):
    wb = xlrd.open_workbook(excel_file_name)
    sh = wb.sheet_by_index(0)
    
    message_all = OrderedDict()
    message_all['X'] = []
    message_all['y'] = []
    
    for rownum in range(1, sh.nrows):
        row_values = sh.row_values(rownum)
        msg = row_values[2]
        label = row_values[3]
        if msg != '' and label != '':
            message_all['X'].append(msg)
            message_all['y'].append(label)
    
    return message_all

In [9]:
def get_class_keyword(label, message_all):
    msg_label = np.array(message_all['X'])[np.array(message_all['y'])==label]
    combined_msg = ''
    for m in msg_label:
        msg_without_url = re.sub("(?P<url>https?://[^\s]+)",'', m)    
        combined_msg += msg_without_url
    class_keyword = jieba.analyse.extract_tags(combined_msg, 200)
    return class_keyword

# 製作特徵向量

In [10]:
class TextMining:
    def __init__(self):
        self.keyword_dict = None
    def get_keyword_dict(self,refresh=False):
        if self.keyword_dict == None or refresh == True:
            file_name = 'keyword_2.json'
            if os.path.isfile(file_name):
                with open(file_name,'r') as infile:
                    keyword_file = json.load(infile, object_pairs_hook=OrderedDict)
                self.keyword_dict = keyword_file
            else:
                print file_name+" doesn't exist"
                return 
        return self.keyword_dict
    def _make_ML_X(self,msg_list,print_out=False):
        key_dict = self.get_keyword_dict()
        ML_X = []
        for msg in msg_list:
            tmp = []
            has_url = 1 if len(re.findall("(?P<url>https?://[^\s]+)", msg))>0 else 0
            msg_len = len(msg)
            tmp.append(has_url)
            tmp.append(msg_len)
            if print_out:
                print 'has_url : ',has_url
                print 'msg_len : ',msg_len
            for k in key_dict:
                message_jieba = jieba.analyse.extract_tags(msg,0)
                match = list(set(message_jieba).intersection(key_dict[k]))
                if print_out:
                    print 'match '+k+' : ',','.join(match)
                tmp.append(len(match))
            ML_X.append(tmp)
        return ML_X

# 分類器 with different algorithm

In [11]:
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [12]:
class Classifier:
    def __init__(self, X_train, y_train, algorithm):
        self.alg = algorithm
        self.X_train = X_train
        self.y_train = y_train
        self.model = None
    def _predict(self, test_X):
        return self.get_model().predict(test_X)
    def get_model(self):
        if self.model == None :
            if self.alg == "lg":
                clf = LogisticRegression()
            elif self.alg == "tree":
                clf = tree.DecisionTreeClassifier()
            elif self.alg == "svm":
                clf = svm.SVC()
            else:
                print "wrong algorithm name : ", self.alg
                return
            self.model = clf.fit(self.X_train,self.y_train)
        return self.model
    def accuracy(self, X_test, y_test):
        model = self.get_model()
        if model is not None:
            prediction = model.predict(X_test)
            accuracy = np.mean(np.array(prediction)==np.array(y_test))
            correct_idx = np.where(np.array(prediction)==np.array(y_test))[0]
            incorrect_idx = np.where(np.array(prediction)!=np.array(y_test))[0]
            return accuracy, correct_idx, incorrect_idx
        else:
            print "model:None has no accuracy"
            return
    def _classification_report(self, X_test, y_test):
        model = self.get_model()
        class_names = ['class 0', 'class 1', 'class 2']
        if model is not None:
            prediction = model.predict(X_test)
            return classification_report(y_test, prediction, target_names=class_names)
        else:
            print "model:None has no classification_report"
            return

# 讀取資料

In [7]:
message_all = excel_to_json('2017_line.xlsx')

In [13]:
X = TextMining()._make_ML_X(message_all['X'],print_out=False)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/q1/sy8zj9z12v792k86g84xby1r0000gn/T/jieba.cache
Loading model cost 1.890 seconds.
Prefix dict has been built succesfully.


In [14]:
df = pd.DataFrame(X,columns = ['url','len','obj_topic','push','action','sub_emotion','sub_topic'])

In [27]:
# Line訊息
print "class : %s" % message_all['y'][0]
print "message : %s" % message_all['X'][0]

class : 1.0
message : 7-11圖片條碼換大熱拿鐵，一支手機一次，家裡有幾支手機就可以換幾杯拿鐵。
把這個圖片存下來，可以去7-11換一杯大熱拿
要開圖片給他刷條碼
2/28前要換掉

記住！是7-11唷


In [28]:
# 特徵向量
df[0:1]

Unnamed: 0,url,len,obj_topic,push,action,sub_emotion,sub_topic
0,0,90,1,0,1,0,0


# Imbalanced Data - Training/Testing dataset

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, message_all['y'], test_size=0.2, stratify=message_all['y'])

In [17]:
print np.bincount(y_train)/float(len(y_train)),len(y_train)
print np.bincount(y_test)/float(len(y_test)),len(y_test)

[ 0.60449438  0.11460674  0.28089888] 445
[ 0.60714286  0.11607143  0.27678571] 112


In [18]:
lg = LogisticRegression()
lg_model = lg.fit(X_train,y_train)
prediction = lg_model.predict(X_test)

### Imbalanced Data - Classifier evaluation

In [19]:
accuracy = np.mean(np.array(prediction)==np.array(y_test))
correct_idx = np.where(np.array(prediction)==np.array(y_test))[0]
incorrect_idx = np.where(np.array(prediction)!=np.array(y_test))[0]
print "accuracy:",accuracy
print "correct_idx:",correct_idx
print "incorrect_idx:",incorrect_idx 

accuracy: 0.883928571429
correct_idx: [  0   2   3   4   5   6   7   9  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  24  25  26  27  30  31  32  34  35  36  37  38  39  40
  41  42  43  44  45  47  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  64  65  66  67  69  70  71  72  73  74  75  76  77  78
  79  80  81  83  84  85  86  87  88  90  91  92  93  94  95  96  97  98
  99 101 103 104 105 106 108 110 111]
incorrect_idx: [  1   8  28  29  33  46  68  82  89 100 102 107 109]


In [20]:
target_names = ['class 0', 'class 1', 'class 2']
print classification_report(y_test, prediction, target_names=target_names)

             precision    recall  f1-score   support

    class 0       0.93      0.94      0.93        68
    class 1       0.82      0.69      0.75        13
    class 2       0.81      0.84      0.83        31

avg / total       0.88      0.88      0.88       112



# Balanced Data ( using under-sampling approach ) - Training/Testing dataset

In [21]:
from sklearn.utils import resample

In [22]:
np_X = np.array(X)
np_y = np.array([int(i) for i in message_all['y']])

In [40]:
X_0, y_0 = resample(np_X[np_y==0], np_y[np_y==0],n_samples=61,random_state=0)
X_1, y_1 = resample(np_X[np_y==1], np_y[np_y==1],n_samples=61,random_state=0)
X_2, y_2 = resample(np_X[np_y==2], np_y[np_y==2],n_samples=61,random_state=0)

In [41]:
X_balance = np.concatenate((X_0,X_1,X_2))
y_balance = np.concatenate((y_0,y_1,y_2))

In [42]:
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_balance, y_balance, test_size=0.2, stratify=y_balance)

In [43]:
print np.bincount(y_train_b)/float(len(y_train_b)),len(y_train_b)
print np.bincount(y_test_b)/float(len(y_test_b)),len(y_test_b)

[ 0.33561644  0.32876712  0.33561644] 146
[ 0.32432432  0.35135135  0.32432432] 37


### Balanced Data - Classifier Evaluation

In [44]:
clf_lg = Classifier(X_train_b,y_train_b,"lg")
accuracy_lg, correct_idx_lg,incorrect_idx_lg = clf_lg.accuracy(X_test_b,y_test_b)
clf_report_lg = clf_lg._classification_report(X_test_b,y_test_b)

In [45]:
clf_tree = Classifier(X_train_b,y_train_b,"tree")
accuracy_tree, correct_idx_tree,incorrect_idx_tree = clf_tree.accuracy(X_test_b,y_test_b)
clf_report_tree = clf_tree._classification_report(X_test_b,y_test_b)

In [46]:
clf_svm = Classifier(X_train_b,y_train_b,"svm")
accuracy_svm, correct_idx_svm,incorrect_idx_svm = clf_svm.accuracy(X_test_b,y_test_b)
clf_report_svm = clf_svm._classification_report(X_test_b,y_test_b)

In [47]:
print accuracy_lg
print clf_report_lg

0.945945945946
             precision    recall  f1-score   support

    class 0       0.86      1.00      0.92        12
    class 1       1.00      0.92      0.96        13
    class 2       1.00      0.92      0.96        12

avg / total       0.95      0.95      0.95        37



In [48]:
print accuracy_tree
print clf_report_tree

0.972972972973
             precision    recall  f1-score   support

    class 0       1.00      1.00      1.00        12
    class 1       0.93      1.00      0.96        13
    class 2       1.00      0.92      0.96        12

avg / total       0.97      0.97      0.97        37



In [49]:
print accuracy_svm
print clf_report_svm

0.756756756757
             precision    recall  f1-score   support

    class 0       1.00      1.00      1.00        12
    class 1       0.61      0.85      0.71        13
    class 2       0.71      0.42      0.53        12

avg / total       0.77      0.76      0.74        37



# Model persistence

In [173]:
import pickle

In [52]:
from sklearn.externals import joblib

In [53]:
joblib.dump(clf_lg.get_model(), 'classifier_lg_model.pkl') 

['classifier_lg_model.pkl']

In [50]:
with open('clf_lg_model_cPickle.pkl', 'wb') as fid:
    cPickle.dump(clf_lg.get_model(), fid) 

In [126]:
print ','.join(jieba.analyse.extract_tags(message_all['X'][190], 200))

qq,見效,weixin,mp,na321dZIYdaAoNGkoKG0ZA,無數,太棒了,方子,救人,咳嗽,一天
