# Train Classifier with TFIDF

In [1]:
import csv
import jieba
import re
import random
import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

In [6]:
class TextFeature(object):
    def __init__(self, file):
        self.file = file
        self.stopwords = ['什麼', '請問', '這裡', '不是', '意思', '這邊', '謝謝', '這句', '為何', '使用', '怎麼', '要加', '老師', '還是', '如何', '甚麼', '一下', '這個', '這樣', '問為', '因為', '何要', '用過', '是不是', '一個', '應該', '直接', '好像', '如果', '何不', '兩個', '這是', '何用', '需要', '時候', '所以', '您好', '起來', '還有', '加上', '寫成', '你好', '此句', '有點', '問此', '不好意思', '不到', '像是', '這裏', '為什麼']
        
        with open('{0}'.format(self.file)) as data_file:
            self.data = {}
            num = 0
            for row in csv.DictReader(data_file):
                if row['type'] in self.data:
                    self.data[row['type']].append(row['question'])
                else:
                    self.data[row['type']] = []
                    
                num += 1
        data_file.close()
        
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        for key, questions in self.data.items():
            if key != '14':
                random.shuffle(questions)
                split_point = len(questions)*8//10
                train = questions[:split_point]
                test = questions[split_point:]
                self.X_train += train
                self.X_test += test
                self.Y_train += [key]*len(train)
                self.Y_test += [key]*len(test)
            
        self.training_text = self.cut_questions(self.X_train)
        self.testing_text = self.cut_questions(self.X_test)
        
    def cut_questions(self, data):
        corpus = []
        for q in data:
            segs = jieba.cut(q, cut_all=False)
            final = [seg for seg in segs if seg not in self.stopwords]
            corpus.append(' '.join(final))
        return corpus
    
    def cal_tfidf(self):
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_vectorizer.fit(self.training_text)
        return tfidf_vectorizer
        
    def train_classifier_naive(self):
        tfidf_vectorizer = self.cal_tfidf() 
        X_train_set = tfidf_vectorizer.fit_transform(self.training_text)
        Y_train_set = np.array(self.Y_train)
        gnb = GaussianNB()
        gnb.fit(X_train_set.todense(), Y_train_set)
        
        X_test_set = tfidf_vectorizer.transform(self.testing_text)
        Y_predict = gnb.predict(X_test_set.todense())
        Y_test_set = np.array(self.Y_test)
        
        print(metrics.classification_report(Y_test_set, Y_predict))
        
    def train_classifier_random(self):
        tfidf_vectorizer = self.cal_tfidf()
        X_train_set = tfidf_vectorizer.fit_transform(self.training_text)
        Y_train_set = np.array(self.Y_train)
        self.classifier  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=256)
        self.classifier.fit(X_train_set, Y_train_set)
        
        X_test_set = tfidf_vectorizer.transform(self.testing_text)
        Y_predict = self.classifier.predict(X_test_set)
        Y_test_set = np.array(self.Y_test)
        
        print(metrics.classification_report(Y_test_set, Y_predict))
        
    def train_classifier_SVM(self):
        tfidf_vectorizer = self.cal_tfidf()
        X_train_set = tfidf_vectorizer.fit_transform(self.training_text)
        Y_train_set = np.array(self.Y_train)
        # Perform classification with linear SVM
        svc = LinearSVC(C=1.0, max_iter=10000)
        svc = svc.fit(X=X_train_set, y=Y_train_set)
        
        X_test_set = tfidf_vectorizer.transform(self.testing_text)
        Y_predict = svc.predict(X=X_test_set)
        Y_test_set = np.array(self.Y_test)
        
        print(metrics.classification_report(Y_test_set, Y_predict))

In [7]:
questions = TextFeature('questions_nondup.csv')

In [8]:
questions.train_classifier_naive()

             precision    recall  f1-score   support

          1       0.38      0.43      0.40        42
         10       0.52      0.45      0.48       106
         11       0.25      0.55      0.34        11
         12       0.46      0.41      0.43        59
         13       0.55      0.50      0.53        42
          2       0.33      0.54      0.41        39
          3       0.44      0.34      0.39        90
          4       0.00      0.00      0.00         7
          5       0.32      0.31      0.31        91
          6       0.13      0.16      0.14        19
          7       0.03      0.06      0.04        18
          8       0.40      0.39      0.40        49
          9       0.50      0.45      0.47       200

avg / total       0.42      0.40      0.41       773



In [9]:
questions.train_classifier_random()

             precision    recall  f1-score   support

          1       0.78      0.86      0.82        42
         10       0.69      0.70      0.69       106
         11       1.00      0.64      0.78        11
         12       0.88      0.47      0.62        59
         13       0.59      0.45      0.51        42
          2       0.97      0.74      0.84        39
          3       0.72      0.64      0.68        90
          4       0.33      0.14      0.20         7
          5       0.68      0.63      0.65        91
          6       1.00      0.37      0.54        19
          7       0.83      0.56      0.67        18
          8       0.92      0.47      0.62        49
          9       0.56      0.86      0.68       200

avg / total       0.71      0.68      0.67       773



In [10]:
questions.train_classifier_SVM()

             precision    recall  f1-score   support

          1       0.92      0.81      0.86        42
         10       0.74      0.77      0.76       106
         11       0.80      0.73      0.76        11
         12       0.82      0.53      0.64        59
         13       0.74      0.67      0.70        42
          2       0.94      0.82      0.88        39
          3       0.73      0.73      0.73        90
          4       0.67      0.57      0.62         7
          5       0.65      0.68      0.67        91
          6       0.78      0.37      0.50        19
          7       0.60      0.33      0.43        18
          8       0.74      0.53      0.62        49
          9       0.66      0.85      0.74       200

avg / total       0.73      0.72      0.71       773

