In [1]:
from data_tool import csv_to_list, list_to_csv, flat_json
import jieba  
import jieba.posseg as pseg  
import os  
import sys  
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np
import pickle
from scipy.sparse import coo_matrix, vstack

In [2]:
# numpy print set-up: threshold=np.inf disables summarisation, so arrays
# are always printed in full instead of being shortened with "...".
np.set_printoptions(threshold=np.inf)

In [3]:
def train_tfidf_vectorizer(flat_train_msg_list, path_to_save):
    """
    Fit a term-count vectorizer and a tf-idf transformer on the training
    messages and pickle both to disk as a ``(vectorizer, transformer)`` tuple.

    :param flat_train_msg_list: training messages, in flat list format.
    :param path_to_save: path of the pickle file to write.
    """
    # Segment every message with jieba and re-join its tokens with spaces.
    segmented_docs = []
    for message in flat_train_msg_list:
        segmented_docs.append(" ".join(jieba.cut(message)))

    # CountVectorizer turns the documents into a term-frequency matrix:
    # element a[i][j] is the frequency of term j in document i.
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of every term.
    transformer = TfidfTransformer()

    # Train both models on the training data.
    term_counts = vectorizer.fit_transform(segmented_docs)
    transformer.fit_transform(term_counts)

    # Persist the fitted pair to disk.
    fitted_pair = (vectorizer, transformer)
    with open(path_to_save, "wb") as outfile:
        pickle.dump(fitted_pair, outfile)

In [4]:
class Transform(object):
    """
    Convert flat message / label lists into the ``X`` / ``y`` inputs of
    sklearn ``fit`` / ``predict``, using a (vectorizer, transformer) pair
    loaded from a pickle file.
    """

    def __init__(self,tfidf_model_path):
        """
        :param tfidf_model_path: path of the pickle file holding the
            ``(vectorizer, transformer)`` tuple.
        """
        self.tfidf_model_path = tfidf_model_path
        self.vectorizer, self.transformer = self.get_tfidf_vectorizer()

    def get_tfidf_vectorizer(self):
        """
        Load tfidf vectorizer from hard disk.

        :return: the object unpickled from ``self.tfidf_model_path``;
            ``__init__`` unpacks it as ``(vectorizer, transformer)``.
        :rtype: vectorizer, transformer object of sklearn tfidf object
        """
        # SECURITY: pickle.load can execute arbitrary code stored in the file.
        # Only load model files that were produced by a trusted source.
        with open(self.tfidf_model_path,"rb") as infile:
            return pickle.load(infile)

    def to_sparse_matrix(self,flat_message_list):
        """
        Transform flat message data list to sparse matrix, for param X of sklearn fit method.

        All messages are segmented first and then pushed through the
        vectorizer and the transformer in a single batch, rather than one
        message at a time followed by a vertical stack.

        :param flat_message_list: message_list in flat list format.
        :return: { sparse matrix}, shape (n_samples, n_features) for sklearn fit method
        :rtype: whatever ``self.transformer.transform`` returns
            (scipy.sparse CSR matrix for sklearn's TfidfTransformer).
        """
        # jieba.cut yields the tokens of one message; the vectorizer expects
        # whitespace-separated text, so join the tokens with single spaces.
        corpus = [" ".join(jieba.cut(msg)) for msg in flat_message_list]
        term_counts = self.vectorizer.transform(corpus)
        return self.transformer.transform(term_counts)

    def to_ndarray(self,flat_lebel_list):
        """
        Transform flat label data list to ndarray, for param y of sklearn fit method.

        :param flat_lebel_list: label_list in flat list format (the misspelt
            parameter name is kept for backward compatibility).
        :return: array-like, shape (n_samples,) for sklearn fit method
        :rtype: ndarray
        """
        return np.array(flat_lebel_list)

In [300]:
# Load the already labelled and already split train / test data from disk.
train_post_data_list = csv_to_list("data_106/exp_train.csv")
test_post_data_list = csv_to_list("data_106/exp_test.csv")

In [301]:
# Flatten the data. The results are indexed by 'message' and 'label'
# further down in this notebook.
flat_train_post_list = flat_json(train_post_data_list)
flat_test_post_list = flat_json(test_post_data_list)

In [302]:
# Train the tf-idf model on the training messages and save it to disk.
train_tfidf_vectorizer(flat_train_post_list['message'],"data_106/feature_product.pkl")

In [303]:
# Convert the data into the format sklearn machine-learning estimators expect:
# tf-idf sparse matrices for X, ndarrays for y.
# NOTE: `tf` is also read as a global by check_one further down.
tf = Transform("data_106/feature_product.pkl")
train_tfidf_X = tf.to_sparse_matrix(flat_train_post_list['message'])
train_y = tf.to_ndarray(flat_train_post_list['label'])
test_tfidf_X = tf.to_sparse_matrix(flat_test_post_list['message'])
test_y = tf.to_ndarray(flat_test_post_list['label'])

In [304]:
# Number of training samples.
len(train_y)

183

In [305]:
# Number of test samples.
len(test_y)

61

# Machine Learning Time

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn import tree

In [41]:
# Accuracy report
def show_accuracy_report(prediction, test_y):
    """
    Print accuracy, the indices of correct / incorrect predictions and
    sklearn's classification report.

    Uses single-argument ``print(...)`` calls so the function runs on both
    Python 2 and Python 3 and prints the same text on either.

    :param prediction: predicted labels, array-like of shape (n_samples,)
    :param test_y: true labels, array-like with the same shape as ``prediction``
    :raises ValueError: if the two inputs do not have the same shape
    """
    # Coerce to ndarrays so that plain lists are compared element-wise;
    # `list == list` would otherwise collapse to a single boolean.
    prediction = np.asarray(prediction)
    test_y = np.asarray(test_y)
    if prediction.shape != test_y.shape:
        # Guard against silent broadcasting, e.g. (n,) against (n, 1).
        raise ValueError("prediction and test_y must have the same shape, got %s and %s"
                         % (prediction.shape, test_y.shape))

    accuracy = np.mean(prediction == test_y)
    correct_idx = np.where(prediction == test_y)[0]
    incorrect_idx = np.where(prediction != test_y)[0]
    print("accuracy: %s" % (accuracy,))
    print("correct_idx: %s" % (correct_idx,))
    print("incorrect_idx: %s" % (incorrect_idx,))
    print(classification_report(test_y, prediction))

In [236]:
def check_one(model,idx):
    """
    Print the prediction, true label, raw message and class probabilities
    for a single test post.

    Reads the notebook globals ``tf`` (object providing ``to_sparse_matrix``)
    and ``flat_test_post_list`` (indexed by 'message' and 'label').
    Uses single-argument ``print(...)`` calls so it runs on both Python 2
    and Python 3 and prints the same text on either.

    :param model: fitted classifier exposing ``predict`` and ``predict_proba``
    :param idx: position of the post inside ``flat_test_post_list``
    """
    message = flat_test_post_list['message'][idx]
    # Vectorise the message once and reuse it for both model calls.
    features = tf.to_sparse_matrix([message])
    print("predict :  %s" % (model.predict(features),))
    print("reality :  %s" % (flat_test_post_list['label'][idx],))
    print("message :  %s" % (message,))
    print("probability :  %s" % (model.predict_proba(features),))




In [313]:
# Fit a logistic-regression classifier (default hyper-parameters) on the
# tf-idf training features.
lg = LogisticRegression()
lg_model = lg.fit(train_tfidf_X,train_y)

In [314]:
# Predict labels for the held-out test features.
prediction = lg_model.predict(test_tfidf_X)

In [315]:
# Compare the predictions with the true test labels.
show_accuracy_report(prediction, test_y)

accuracy: 0.819672131148
correct_idx: [ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 16 17 18 19 20 21 22 23 25 26 27
 28 29 30 31 33 34 35 36 37 38 39 40 41 42 46 47 49 50 51 53 55 56 57 59 60]
incorrect_idx: [ 9 15 24 32 43 44 45 48 52 54 58]
             precision    recall  f1-score   support

          1       0.79      0.71      0.75        21
          2       0.73      0.80      0.76        20
          3       0.95      0.95      0.95        20

avg / total       0.82      0.82      0.82        61



In [317]:
# Inspect test sample 24, which the report above lists under incorrect_idx.
check_one(lg_model,24)

predict :  [2]
reality :  1
message :  [週日動動腦]
大家都吃過新品火腿起司了嗎??
猜猜看sal'e是甚麼意思呢?
probability :  [[ 0.35656346  0.37199587  0.27144067]]
