# Train Classifier with TFIDF

In [31]:
import csv
import jieba
import re
import random
import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.externals import joblib

## type dict
Grammar = {'完成式': 1, '進行式': 2, '過去式': 3, '未來式': 4, '關係代名詞': 5, '不定詞': 6, '名詞子句': 7, 
           '被動': 8, '介係詞': 9, '連接詞': 10, '假設語氣': 11, '現在分詞': 12, '過去分詞': 13, 'PT': 14, '其它': 0}

## splitting data

In [43]:
class DataHelper(object):
    def __init__(self, file):
        self.file = file
        self.stopwords = ['什麼', '請問', '這裡', '不是', '意思', '這邊', '謝謝', '這句', '為何', '使用', '怎麼', '要加', '老師', '還是', '如何', '甚麼', '一下', '這個', '這樣', '問為', '因為', '何要', '用過', '是不是', '一個', '應該', '直接', '好像', '如果', '何不', '兩個', '這是', '何用', '需要', '時候', '所以', '您好', '起來', '還有', '加上', '寫成', '你好', '此句', '有點', '問此', '不好意思', '不到', '像是', '這裏', '為什麼']
        
        with open('{0}'.format(self.file)) as data_file:
            self.data = {}
            for row in csv.DictReader(data_file):
                if row['type'] in self.data:
                    self.data[row['type']].append(row.values())
                else:
                    self.data[row['type']] = []
                    
        data_file.close()
        
    def get_shuffled_data(self, ratio = 8):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        for key, record in self.data.items():
            if key == '14':
                continue
            questions = list(list(zip(*record))[2]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            random.shuffle(questions)
            split_point = len(questions)*ratio//10
            train = questions[:split_point]
            test = questions[split_point:]
            member_train += members[:split_point]
            member_test += members[split_point:]
            X_train += train
            X_test += test
            Y_train += [key]*len(train)
            Y_test += [key]*len(test)
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test
    
    # use non-duplications as training and duplications as testing
    # the file should be questions_nondup_dup.csv
    def get_fixed_data(self):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        try:
            for key, record in self.data.items():
                if int(record['dup']) == 0: # non-duplications
                    X_train.append(record['question'])
                    Y_train.append(record['type'])
                    member_train.append(record['member_id'])
                else:
                    X_test.append(record['question'])
                    Y_test.append(record['type'])
                    member_test.append(record['member_id'])
        except KeyError as e:
            print(e)
            raise
            
        X_train_text = self.cut_questions(self.X_train)
        X_test_text = self.cut_questions(self.X_test)
        return X_train_text, np.array(Y_train), X_text_text, np.array(Y_test), member_train, member_test
        
    def cut_questions(self, data):
        corpus = []
        for q in data:
            segs = jieba.cut(q, cut_all=False)
            final = [seg for seg in segs if seg not in self.stopwords]
            corpus.append(' '.join(final))
        return corpus

In [45]:
dh = DataHelper('questions_nondup.csv')
X_train_text, y_train, X_test_text, y_test, member_train, member_test = dh.get_shuffled_data()
print('y train shape: {}'.format(y_train.shape))
print('y test shape: {}'.format(y_test.shape))

y train shape: (3058,)
y test shape: (773,)


## extract features of text

In [46]:
class TextFeature(object):
    def __init__(self, training_data, testing_data):
        self.training_text = training_data
        self.testing_text = testing_data
        
    def get_tfidf(self, use_idf = True):
#         texts = self.training_text + self.testing_text
        tfidf_vectorizer = TfidfVectorizer(use_idf = True)
        tfidf_vectorizer.fit(self.training_text)
        joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
        X_train = tfidf_vectorizer.transform(self.training_text)
        X_test = tfidf_vectorizer.transform(self.testing_text)
        return X_train, X_test

## get features

In [47]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3058, 4617)
(773, 4617)


## Naive Bayes

In [48]:
NB = MultinomialNB(alpha = 1.0)
NB.fit(X_train.todense(), y_train)
y_predict = NB.predict(X_test.todense())
print(metrics.classification_report(y_test, y_predict))
joblib.dump(NB, 'naive_bayes.pkl')

             precision    recall  f1-score   support

          1       0.95      0.50      0.66        42
         10       0.74      0.61      0.67       106
         11       1.00      0.09      0.17        11
         12       1.00      0.22      0.36        59
         13       0.88      0.17      0.28        42
          2       1.00      0.26      0.41        39
          3       0.71      0.52      0.60        90
          4       0.00      0.00      0.00         7
          5       0.80      0.45      0.58        91
          6       0.00      0.00      0.00        19
          7       0.00      0.00      0.00        18
          8       0.67      0.08      0.15        49
          9       0.39      0.99      0.56       200

avg / total       0.66      0.53      0.49       773



  'precision', 'predicted', average, warn_for)


['naive_bayes.pkl']

## Random Forest

In [52]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=256)
RF.fit(X_train.todense(), y_train)
y_predicted = RF.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))
joblib.dump(RF, 'random_forest.pkl')

             precision    recall  f1-score   support

          1       0.77      0.79      0.78        42
         10       0.72      0.73      0.72       106
         11       1.00      0.27      0.43        11
         12       0.58      0.49      0.53        59
         13       0.69      0.43      0.53        42
          2       0.96      0.69      0.81        39
          3       0.79      0.58      0.67        90
          4       0.86      0.86      0.86         7
          5       0.72      0.68      0.70        91
          6       1.00      0.58      0.73        19
          7       0.71      0.56      0.63        18
          8       0.84      0.63      0.72        49
          9       0.60      0.89      0.72       200

avg / total       0.72      0.69      0.69       773



['random_forest.pkl']

## SVM

In [50]:
svc = LinearSVC(C=1.0, max_iter=10000)
svc = svc.fit(X = X_train.todense(), y = y_train)
y_predict = svc.predict(X = X_test)
print(metrics.classification_report(y_test, y_predict))
joblib.dump(svc, 'svc.pkl')

             precision    recall  f1-score   support

          1       0.81      0.81      0.81        42
         10       0.70      0.74      0.72       106
         11       0.71      0.45      0.56        11
         12       0.72      0.61      0.66        59
         13       0.53      0.45      0.49        42
          2       0.97      0.72      0.82        39
          3       0.76      0.70      0.73        90
          4       0.75      0.86      0.80         7
          5       0.70      0.71      0.71        91
          6       0.92      0.58      0.71        19
          7       0.67      0.44      0.53        18
          8       0.80      0.65      0.72        49
          9       0.68      0.85      0.76       200

avg / total       0.72      0.72      0.72       773



['svc.pkl']