# Train Classifier with TFIDF

In [4]:
import csv
import jieba
import re
import random
import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from collections import defaultdict, Counter

## type dict
Grammar = {'完成式': 1, '進行式': 2, '過去式': 3, '未來式': 4, '關係代名詞': 5, '不定詞': 6, '名詞子句': 7, 
           '被動': 8, '介係詞': 9, '連接詞': 10, '假設語氣': 11, '分詞': 12, 'PT': 13, '其它': 0}

In [5]:
Grammar = {'1': '完成式', '2': '進行式', '3': '過去式', '4': '未來式', '5': '關係代名詞', '6': '不定詞', '7': '名詞子句', '8': '被動', '9': '介係詞', \
           '10': '連接詞', '11': '假設語氣', '12': '分詞', '13': 'PT', '0': '其它'}

In [6]:
with open('questions_nondup_dup.csv') as csvfile:
    data_dict = defaultdict()
    for row in csv.DictReader(csvfile):
        data_dict[row['question_id']] = row

## splitting data

In [7]:
from collections import defaultdict

class DataHelper(object):
    def __init__(self, file):
        self.file = file
        self.stopwords = ['什麼', '請問', '這裡', '不是', '意思', '這邊', '謝謝', '這句', '為何', '使用', '怎麼', '要加', '老師', '還是', '如何', '甚麼', '一下', '這個', '這樣', '問為', '因為', '何要', '用過', '是不是', '一個', '應該', '直接', '好像', '如果', '何不', '兩個', '這是', '何用', '需要', '時候', '所以', '您好', '起來', '還有', '加上', '寫成', '你好', '此句', '有點', '問此', '不好意思', '不到', '像是', '這裏', '為什麼']
        
        with open('{0}'.format(self.file)) as data_file:
            self.unamb_data = defaultdict(list)
            self.amb_data = defaultdict(list)
            for row in csv.DictReader(data_file):
                if row['ambiguous'] == '0':
                    # can't directly use row.values() as it doesn't grantee the order
                    self.unamb_data[row['type']].append([row['question_id'], row['member_id'], \
                                                         row['type'], row['question'], row['ambiguous']])
                else:
                    self.amb_data[row['type']].append([row['question_id'], row['member_id'], \
                                                         row['type'], row['question'], row['ambiguous']])
                    
    def get_all_unambiguous_data(self):
        X = []
        y = []
        member_id = []
        question_id = []
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) 
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X += questions
            y += [key]*len(questions)
            member_id += members
            question_id += question_idx
            
        X_text = self.cut_questions(X)
        return X_text, np.array(y), member_id, question_id
        
    def get_shuffled_data(self, ratio = 8):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        question_train = []
        question_test = []
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            random.shuffle(questions)
            split_point = len(questions)*ratio//10
            train = questions[:split_point]
            test = questions[split_point:]
            member_train += members[:split_point]
            member_test += members[split_point:]
            question_train += question_idx[:split_point]
            question_test += question_idx[split_point:]
            X_train += train
            X_test += test
            Y_train += [key]*len(train) # repeat len(train) times
            Y_test += [key]*len(test)
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test, question_train, question_test
    
    # use non-duplications as training and duplications as testing
    # the file should be questions_nondup_dup.csv
    def get_fixed_data(self):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        question_train = []
        question_test = []
        
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X_train += questions
            Y_train += [key]*len(questions)
            member_train += members
            question_train += question_idx
        for key, record in self.amb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X_test += questions
            Y_test += [key]*len(questions)
            member_test += members
            question_test += question_idx
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test, question_train, question_test
        
    def cut_questions(self, data):
        corpus = []
        for q in data:
            segs = jieba.cut(q, cut_all=False)
            final = [seg for seg in segs if seg not in self.stopwords]
            corpus.append(' '.join(final))
        return corpus

In [9]:
dh = DataHelper('questions_nondup_dup2.csv')
print(dh.unamb_data['2'][:2])
print(dh.amb_data['2'][:2])

[['30383', '56291', '2', '這裡的 letting 加ing是因為also的關係嗎?  You wanna be getting to know a person 是未來進行式吧?並不是因為 also 的關係喔，是因為接續了前面的 wanna be，完整一點應該是： ...and you also wanna be letting that person get to know you.  因為和前面共用一個 wanna be，所以後面省略。  這裡因為沒有 will，所以不算是未來進行式，但現在進行式本身也有未來的意涵喔。', '0'], ['28854', '56291', '2', "you'll be speaking 為甚麼speak要加ing 還有這句要如何解釋這裡是「未來進行式」的用法，表示某一動作將會、或可能在未來某一時刻進行或持續進行中。  you'll be speaking 就是「你未來、以後都會這樣說」的意思。  這裡用 you'll speak 當然也沒有問題，只是語意上有些許差別而已。", '0']]
[['32440', '88024', '2', '為什麼wanna後面要用be getting而不是直接用wanna get to ... ??這裡是在 want to 後面加上現在進行式（be 動詞+現在分詞）的用法，有「一直做、到未來也要做這件事」的口吻。  當然也可以只用 want to get to know，只是口氣上有些微差別而已，但兩種表達方式大致意思是一樣的。', '1'], ['29663', '56291', '2', " be making friends 這裡是未來進行式嗎?   throughout 這裡昰介係詞嗎您好！  1. 是的，這裡用 going to be making friends 表示現在、未來都會要去交朋友的意思，有動作延續的口吻。當然直接寫成 we're going to make friends 也是完全沒問題的，只是口吻上有一些些差別而已。  2. throughout 在這裡是介係詞沒錯喔，另外也可以參考字典上第 1 條解釋： https://tw.dictionary.yahoo.com/dictionary?p=throughout ", '1']]


### get shuffled data

In [88]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_shuffled_data()
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))

X train shape: (3095, 9652)
y train shape: (3095,)


## extract features of text

In [10]:
class TextFeature(object):
    def __init__(self, training_data, testing_data):
        self.training_text = training_data
        self.testing_text = testing_data
        
    def get_tfidf(self, use_idf = True):
#         texts = self.training_text + self.testing_text
        tfidf_vectorizer = TfidfVectorizer(use_idf = True)
        tfidf_vectorizer.fit(self.training_text)
        X_train = tfidf_vectorizer.transform(self.training_text)
        X_test = None
        if self.testing_text != None:
            X_test = tfidf_vectorizer.transform(self.testing_text)
        return X_train, X_test

## get features

In [90]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3095, 9680)
(780, 9680)


## Corss Validation

### Get all data and get features

In [21]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_fixed_data()

print(X_train_text[:3])
print(X_test_text[:3])

['She   told   me   after   she   was   diagnosed   that   death   was   not   what   saddened   her   the   most .     請 加 what 的   what   saddened   her   the   most   是 一整 個 名詞 子句 ， what   是 「 所 ... 的 事物 」 的 ，   what   saddened   her   the   most   就是 「 讓 她 最 傷心 的 事 」 的 。       death   was   not   what   saddened   her   the   most   就是 在 說 ： 死亡 並不事 讓 她 最 傷心 的 事 。     類似 用法 可以 參考 奇摩 字典 ， what   的 第三 個 解釋 ：   https : / / tw . dictionary . yahoo . com / dictionary ? p = what', '在   caused   前 省略 了 that   was     那平時 ， 我們 該 怎麼察覺 原本 有 東西 被 省略 呢 ?   然 後 是 名詞 子句 嗎 ? 這 就是 平時 我們 會察覺 有 省略 的 ， 小老師 很 難 告訴 你 要 觀察 ， 但是 我們 在 教材 有 省略 的 地方 都 一定 會解 說 ， 這 真的 就是 要 靠 多 去 練習 、 熟悉 這種 用法 ， 自然 就 會 看得出 來 了 。     另外 ， 一句 完整 的 句子 ， 並 名詞 子句 ， 子句 是 用 在 有 主詞 動詞 但是 「 不 完整 」 的 句子 裡 。', '為 的 句型 結構 Whether   S + V ,   S + V   呢 也 有 whether   or   not ... 的 用法 喔 。   一定 要 把 or   not 拿到 前面 來 ， 是 these   insects   go   out   experiencing   the   greatest   caffeine   high   ever 太長 了 。 其實 結構 是 可以 理解   Whether     ( thes

In [22]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import multiprocessing as mp

### Naive Bayes corss validation

In [24]:
NB_cv = Pipeline([('cls', MultinomialNB()),])
parameters = {'cls__alpha': (0.5, 0.8, 1.0, 5, 10)}
gs_cls = GridSearchCV(NB_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)



In [25]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X_train)
y_predict_prob = gs_cls.predict_proba(X_train)
infile = 'predicted/NB_unamb_predict.csv'
cat = [Grammar[item] for item in gs_cls.best_estimator_.classes_]
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y_train, y_predict))

y_predict_prob = gs_cls.predict_proba(X_test)
infile = 'predicted/NB_amb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

Best Paras: {'cls__alpha': 0.5}
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       1.00      0.83      0.90       207
         10       0.89      0.92      0.91       528
         11       0.92      0.19      0.32        57
         12       0.91      0.94      0.93       515
          2       0.96      0.80      0.87       193
          3       0.92      0.85      0.88       448
          4       0.00      0.00      0.00        35
          5       0.92      0.94      0.93       462
          6       1.00      0.09      0.17        96
          7       1.00      0.11      0.20        88
          8       0.98      0.50      0.66       244
          9       0.70      0.99      0.82      1000

avg / total       0.86      0.84      0.82      3875



  'precision', 'predicted', average, warn_for)


### Random Forest cross validation

In [None]:
RF_cv = Pipeline([('cls', RandomForestClassifier()),])
parameters = {'cls__n_estimators': (20, 64, 128, 256),
              'cls__max_features': ['auto', 'sqrt', 'log2']}
gs_cls = GridSearchCV(RF_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)



In [None]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X_train)
y_predict_prob = gs_cls.predict_proba(X_train)

infile = 'predicted/RF_unamb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y_train, y_predict))

y_predict_prob = gs_cls.predict_proba(X_test)
infile = 'predicted/RF_amb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

### SVM cross validation

In [None]:
SVM_cv = Pipeline([('cls', SVC()),])
parameters = {'clf__kernel': ('linear', 'rbf', 'sigmoid'),
              'clf__C': (0.01, 0.1, 1.0, 5, 10),
              'clf__gamma': ('auto', 0.1, 1, 10)}
gs_cls = GridSearchCV(SVM_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)

In [None]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X_train)
y_predict_prob = gs_cls.predict_proba(X_train)

infile = 'predicted/SVM_unamb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y_train, y_predict))


y_predict_prob = gs_cls.predict_proba(X_test)
infile = 'predicted/SVM_unamb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

## Naive Bayes

In [93]:
NB = MultinomialNB(alpha = 1.0)
NB.fit(X_train.todense(), y_train)
y_predict = NB.predict(X_test.todense())
print(metrics.classification_report(y_test, y_predict))

y_predict_prob = NB.predict_proba(X_test.todense())
infile = 'predicted/NB_question_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    print(len(question_test))
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.96      0.55      0.70        42
         10       0.90      0.65      0.75       106
         11       0.00      0.00      0.00        12
         12       0.83      0.84      0.84       103
          2       0.87      0.51      0.65        39
          3       0.86      0.71      0.78        90
          4       0.00      0.00      0.00         7
          5       0.87      0.81      0.84        93
          6       0.00      0.00      0.00        20
          7       0.00      0.00      0.00        18
          8       1.00      0.20      0.34        49
          9       0.52      0.99      0.69       200

avg / total       0.73      0.70      0.67       780

780


  'precision', 'predicted', average, warn_for)


## Random Forest

In [59]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=128)
RF.fit(X_train.todense(), y_train)
y_predicted = RF.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      0.95      0.98        42
         10       0.96      0.93      0.95       106
         11       1.00      0.83      0.91        12
         12       0.95      0.97      0.96       103
          2       0.97      0.87      0.92        39
          3       0.90      0.88      0.89        90
          4       1.00      0.57      0.73         7
          5       0.87      0.99      0.92        93
          6       1.00      0.85      0.92        20
          7       1.00      0.61      0.76        18
          8       0.95      0.84      0.89        49
          9       0.87      0.95      0.91       200

avg / total       0.92      0.92      0.92       780



  'precision', 'predicted', average, warn_for)


## SVM

In [60]:
svc = LinearSVC(C=1.0, max_iter=10000)
svc = svc.fit(X = X_train.todense(), y = y_train)
y_predict = svc.predict(X = X_test)
print(metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      1.00      1.00        42
         10       0.95      0.94      0.95       106
         11       1.00      0.67      0.80        12
         12       0.93      0.96      0.94       103
          2       0.95      0.92      0.94        39
          3       0.93      0.91      0.92        90
          4       1.00      1.00      1.00         7
          5       0.93      0.97      0.95        93
          6       0.90      0.95      0.93        20
          7       0.80      0.67      0.73        18
          8       0.95      0.86      0.90        49
          9       0.92      0.95      0.94       200

avg / total       0.93      0.93      0.93       780



  'precision', 'predicted', average, warn_for)


### get fixed data

In [61]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_fixed_data()
print('X train shape: {}'.format(len(X_train_text)))
print('y train shape: {}'.format(y_train.shape))

X train shape: 3875
y train shape: (3875,)


In [62]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3875, 10760)
(1949, 10760)


## write predicted results into file

In [65]:
NB = MultinomialNB(alpha = 1.0)
NB.fit(X_train.todense(), y_train)
y_predict_prob = NB.predict_proba(X_test.todense())
cat = [Grammar[item] for item in NB.classes_]
out_NB = 'predicted/NB_predicted.csv'
with open(out_NB, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

In [66]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=128)
RF.fit(X_train.todense(), y_train)
y_predict_prob = RF.predict_proba(X_test.todense())
cat = [Grammar[item] for item in RF.classes_]
out_RF = 'predicted/RF_predicted.csv'
with open(out_RF, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

In [223]:
SVC = LinearSVC(C=1.0, max_iter=10000)
SVC = SVC.fit(X = X_train.todense(), y = y_train)
y_predict = SVC.predict(X_test.todense())
# cat = [Grammar[item] for item in SVC.classes_]
out_SVC = 'predicted/SVC_predicted.csv'
with open(out_SVC, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += [Grammar[y_predict[i]]]
        spamwriter.writerow(writestring)

## Convert csv to json

In [None]:
import json
def conver2json(infile, outfile = 'predicted/NB_question_predict.json'):
    predict_dict = defaultdict(dict)

    predict_dict = defaultdict(dict)
    with open(infile, 'r') as csvfile:
        for row in csv.DictReader(csvfile):
            predict_dict[row['question_id']]['member_id'] = row['member_id']
            predict_dict[row['question_id']]['question'] = row['question']
            predict_dict[row['question_id']]['reply'] = 'NO!'
            predict_dict[row['question_id']]['其它'] = row['其它']
            predict_dict[row['question_id']]['完成式'] = row['完成式']
            predict_dict[row['question_id']]['連接詞'] = row['連接詞']
            predict_dict[row['question_id']]['假設語氣'] = row['假設語氣']
            predict_dict[row['question_id']]['分詞'] = row['分詞']
            predict_dict[row['question_id']]['進行式'] = row['進行式']
            predict_dict[row['question_id']]['過去式'] = row['過去式']
            predict_dict[row['question_id']]['未來式'] = row['未來式']
            predict_dict[row['question_id']]['關係代名詞'] = row['關係代名詞']
            predict_dict[row['question_id']]['不定詞'] = row['不定詞']
            predict_dict[row['question_id']]['名詞子句'] = row['名詞子句']
            predict_dict[row['question_id']]['被動'] = row['被動']
            predict_dict[row['question_id']]['介係詞'] = row['介係詞']
            predict_dict[row['question_id']]['question_type'] = 0
    
    with open(outfile, 'w') as jsonfile:
        json.dump(predict_dict, jsonfile)

In [None]:
infile = 'predicted/NB_question_predicted.csv'
outfile = 'predicted/NB_question_predicted.json'
conver2json(infile)