# 將 training data set 標記類別，儲存為excel格式

In [1]:
import xlwt
import json
from collections import OrderedDict

In [2]:
def json_to_excel(train_or_test, json_file_name, excel_file_name):
    """Dump the posts of a JSON data set into an .xls sheet.

    Reads ``data/<json_file_name>``, takes the list stored under the key
    ``<train_or_test>_X`` and writes one spreadsheet row per post, starting
    at row 0 (no header row is written):

        column 0 -> post['pid']
        column 1 -> post['message']
        column 2 -> post['post_type']

    The workbook is saved to ``excel_file_name`` (used as given, without the
    ``data/`` prefix).
    """
    exported_fields = ('pid', 'message', 'post_type')

    with open('data/' + json_file_name, 'r') as source:
        dataset = json.load(source)
    posts = dataset[train_or_test + '_X']

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("My Worksheet")
    for row, post in enumerate(posts):
        for column, field in enumerate(exported_fields):
            sheet.write(row, column, label=post[field])
    workbook.save(excel_file_name)

In [3]:
#json_to_excel('train',"train_data_all_2.json","train_post_class.xls")

# 輸出新的分類

In [4]:
import xlrd

In [5]:
def get_myclass_list(xls_path):
    """Return the class labels stored in the first sheet of ``xls_path``.

    Row 0 is skipped; for every following row the value in column index 4
    is converted with ``int()`` and collected in sheet order.

    NOTE(review): row 0 is presumably a header row added by hand while
    labelling -- json_to_excel() itself writes the first post at row 0, so
    confirm the sheet really has a header before relying on this.
    """
    sheet = xlrd.open_workbook(xls_path).sheet_by_index(0)
    return [int(sheet.row_values(row)[4]) for row in range(1, sheet.nrows)]

In [6]:
def change_new_class(train_or_test):
    """Replace the labels of a data set with the ones from the labelled sheet.

    Labels are read from ``<train_or_test>_post_class.xls`` via
    get_myclass_list() and stored under the key ``<train_or_test>_y`` of
    ``data/<train_or_test>_data_all_2.json``, which is rewritten in place.

    Raises:
        ValueError: if the JSON file contains ``<train_or_test>_X`` and the
            number of labels differs from the number of posts. The check
            runs before the file is opened for writing, so a mismatched
            sheet can never leave X and y misaligned on disk.
    """
    json_path = 'data/' + train_or_test + '_data_all_2.json'
    myclass_list = get_myclass_list(train_or_test + "_post_class.xls")

    with open(json_path, 'r') as f:
        load_dict = json.load(f)

    # Validate before touching the file: opening with 'w' truncates it.
    posts = load_dict.get(train_or_test + '_X')
    if posts is not None and len(posts) != len(myclass_list):
        raise ValueError(
            "label count (%d) does not match post count (%d) for %r"
            % (len(myclass_list), len(posts), json_path)
        )

    load_dict[train_or_test + '_y'] = myclass_list
    with open(json_path, 'w') as f:
        json.dump(load_dict, f)

In [7]:
#change_new_class('train')

# 結巴

In [8]:
import jieba
import jieba.analyse

In [9]:
# Global jieba configuration; both paths are relative to the working directory.
# Point the segmenter at the "dict.txt.big" dictionary file.
jieba.set_dictionary("dict.txt.big")
# Register the stop-word list with jieba.analyse.
# NOTE(review): per jieba's documentation this presumably only affects the
# jieba.analyse keyword extractors, not plain jieba.cut() -- confirm.
jieba.analyse.set_stop_words("data/household_stop_words.txt")

# TextMining

In [10]:
class TextMining:
    """Turn a post into keyword-count features.

    The keyword dictionary is a JSON object read lazily from
    ``keyword_dict_path``. Each key is a typology name and each value is a
    sequence of keywords; key order is preserved (OrderedDict) so that the
    position of every feature in the resulting vector is stable.
    """

    def __init__(self, keyword_dict_path):
        self.keyword_dict_path = keyword_dict_path
        # Filled on first use by get_keyword_dict().
        self.keyword_dict = None

    def get_keyword_dict(self, refresh=False):
        """Return the keyword dictionary, loading it from disk if necessary.

        Args:
            refresh: when true, re-read the file even if a dictionary is
                already cached.
        """
        if self.keyword_dict is None or refresh:
            with open(self.keyword_dict_path, 'r') as infile:
                self.keyword_dict = json.load(infile, object_pairs_hook=OrderedDict)
        return self.keyword_dict

    def make_feature_vector(self, post_dict, print_out=False):
        """Build the feature vector of one post.

        The post's ``'message'`` is segmented with ``jieba.cut`` (a ``None``
        message yields no tokens). For every typology, in dictionary order,
        the feature is the total number of occurrences of its keywords
        among the tokens, with two adjustments based on ``'post_type'``:

        * ``"service"`` gets +1 when the post type is ``'share'``;
        * ``"engage_photovideo_direction"`` gets +1 when it is not.

        Args:
            post_dict: mapping with at least ``'message'``; ``'post_type'``
                is read only when one of the two typologies above exists.
            print_out: when true, print one line per typology with its
                total and the keywords that matched.

        Returns:
            A list of ints, one per typology.
        """
        # Local import: the file only imports OrderedDict from collections.
        from collections import Counter

        key_dict = self.get_keyword_dict()
        message = post_dict['message']
        tokens = list(jieba.cut(message)) if message is not None else []
        # Count every token once instead of rescanning the list per keyword.
        token_counts = Counter(tokens)

        feature_vector = []
        for typology in key_dict:
            keywords = key_dict[typology]
            word_count_list = [token_counts[word] for word in keywords]
            total_count = sum(word_count_list)
            if typology == "service" and post_dict['post_type'] == 'share':
                total_count += 1
            if typology == "engage_photovideo_direction" and post_dict['post_type'] != 'share':
                total_count += 1
            feature_vector.append(total_count)
            if print_out:
                match_word = "".join(
                    "%s count %d," % (word, word_count)
                    for word, word_count in zip(keywords, word_count_list)
                    if word_count != 0
                )
                # Single-argument print: same output on Python 2 and 3.
                print("%s (term count :%d)  %s" % (typology, total_count, match_word))
        return feature_vector

In [86]:
def trans_feature_vector(X, print_out=False):
    """Convert every post in ``X`` into its keyword-count feature vector.

    A TextMining instance backed by "data/keyword_all_2.json" is created
    for the call and applied to each post in order; ``print_out`` is passed
    through to make_feature_vector(). Returns the list of feature vectors.
    """
    extractor = TextMining("data/keyword_all_2.json")
    return [extractor.make_feature_vector(post, print_out) for post in X]

# Machine learning

In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import numpy as np
from sklearn.metrics import classification_report

In [138]:
class ML:
    """Small wrapper around a scikit-learn classifier for post dicts.

    ``method`` selects the estimator: ``'lg'`` -> LogisticRegression,
    ``'svm'`` -> svm.SVC. Posts are converted to numeric feature vectors
    with trans_feature_vector() before fitting or predicting.
    """

    def __init__(self, method):
        self.method = method
        # Fitted estimator; stays None until get_model() succeeds.
        self.model = None

    def get_model(self, train_X, train_y, refresh=False):
        """Return the fitted model, training it first when needed.

        Training happens when no model is cached or ``refresh`` is true.
        The cache and the method name are checked before the posts are
        featurized, so a cached call does no feature extraction at all.

        Returns:
            The fitted estimator, or None (after printing a message) when
            ``self.method`` is not a known method name.
        """
        if self.model is not None and not refresh:
            return self.model

        if self.method == 'lg':
            clf = LogisticRegression()
        elif self.method == 'svm':
            clf = svm.SVC()
        else:
            print("no method name  %s" % (self.method,))
            return None

        # Convert the post dicts into numeric feature vectors.
        train_X_feature_vector_list = trans_feature_vector(train_X)
        self.model = clf.fit(np.array(train_X_feature_vector_list), np.array(train_y))
        return self.model

    def _predict(self, X, print_out=False):
        """Predict the class of every post in ``X``; None if untrained."""
        # Convert the post dicts into numeric feature vectors. This runs
        # even without a model so that print_out still shows the breakdown.
        X_feature_vector_list = trans_feature_vector(X, print_out)
        if self.model is None:
            return None
        return self.model.predict(np.array(X_feature_vector_list))

    def accuracy(self, X, y):
        """Print the accuracy on (X, y) and the correct/incorrect indices."""
        prediction = self._predict(X)
        if prediction is None:
            print("prediction is None")
            return
        is_correct = prediction == np.array(y)
        print("accuracy: %s" % (np.mean(is_correct),))
        print("correct_idx: %s" % (np.where(is_correct)[0],))
        print("incorrect_idx: %s" % (np.where(np.logical_not(is_correct))[0],))

    def checkOne(self, X, y, idx):
        """Print the feature breakdown, prediction and truth for post ``idx``."""
        prediction = self._predict([X[idx]], print_out=True)
        print("-------------------------------------------------")
        print("predict class :  %s" % (prediction,))
        print("true class :  %s" % (y[idx],))
        print(X[idx]['message'])

    def _classification_report(self, X, y):
        """Print scikit-learn's per-class precision/recall report for (X, y)."""
        prediction = self._predict(X)
        if prediction is None:
            # Same handling as accuracy() instead of failing inside sklearn.
            print("prediction is None")
            return
        target_names = ['class 0 促銷', 'class 1 客戶', 'class 2 產品','class 3 品牌']
        report = classification_report(y, prediction, target_names=target_names)
        print(report)

In [123]:
# Load the training data (the cells below read its 'train_X' and 'train_y' entries).
with open("data/train_data_all_2.json","r") as r_f:
    train_data = json.load(r_f)

In [139]:
ML_lg = ML('lg')

In [140]:
ML_lg.get_model(train_data['train_X'],train_data['train_y'],True)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [126]:
ML_lg.accuracy(train_data['train_X'],train_data['train_y'])

accuracy: 0.656666666667
correct_idx: [  0   2   4   5   6   7   8  10  11  13  15  16  19  22  23  24  27  28
  29  31  32  33  34  35  36  38  39  40  41  43  44  45  47  48  49  50
  51  53  54  55  56  57  58  59  60  61  62  63  64  65  66  68  69  75
  78  80  82  85  86  89  93  94  95  96  99 102 104 106 109 110 111 112
 113 115 118 119 120 121 123 124 125 126 128 130 132 133 134 135 136 137
 138 140 142 143 144 145 148 149 150 152 153 154 156 157 158 159 161 162
 163 164 166 167 168 169 170 171 172 173 174 175 177 178 182 183 184 187
 188 189 190 192 193 194 195 196 198 202 203 205 206 209 210 212 214 215
 216 217 218 219 220 222 224 226 227 228 229 230 236 237 238 239 240 242
 244 245 246 248 252 254 255 256 257 259 260 261 262 263 265 266 267 271
 272 273 276 277 278 280 283 284 285 286 287 288 289 290 295 297 298]
incorrect_idx: [  1   3   9  12  14  17  18  20  21  25  26  30  37  42  46  52  67  70
  71  72  73  74  76  77  79  81  83  84  87  88  90  91  92  97  98 100
 

In [120]:
ML_lg.checkOne(train_data['train_X'],train_data['train_y'],14)

brand (term count :0)  
social (term count :0)  
service (term count :1)  
engage_assist (term count :1)  關鍵 count 1,
engage_like (term count :0)  
engage_question_appreciate (term count :0)  
engage_photovideo_direction (term count :0)  
product_new (term count :0)  
product_version (term count :0)  
product_unit (term count :0)  
product_test (term count :0)  
promotion_deal (term count :0)  
promotion_chance (term count :0)  
season (term count :0)  
-------------------------------------------------
predict class :  [3]
true class :  1
做油飯很困難很麻煩？一點也不！
其實只要掌握這五個關鍵，油飯幾乎不會失敗還保證簡單又好吃！
蠔油真的很萬用，配上各式各樣的食材真的都能烹調出最美味的佳餚！

#李錦記甘甜醬油露
#李錦記舊庄特級蠔油


In [141]:
ML_lg._classification_report(train_data['train_X'],train_data['train_y'])

                precision    recall  f1-score   support

class 0 促銷       0.71      0.51      0.59        47
class 1 客戶       0.74      0.52      0.61        82
class 2 產品       0.61      0.84      0.71       118
class 3 品牌       0.66      0.58      0.62        53

   avg / total       0.67      0.66      0.65       300

