In [1]:
import os
import sys
import numpy as np
import pandas as pd
import inspect

import regex as re
from texttable import Texttable

from scipy.sparse import csr_matrix
from sklearn import tree
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler

from TextClassify import BagOfWords

from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Lookup table from topic code to its human-readable label.
# Built from an ordered list of (code, label) pairs so the listing order is kept.
topics = dict([
    ('X0300001', '水泥'),
    ('X0300002', '食品加工'),
    ('X0300003', '石化'),
    ('X0300004', '紡織'),
    ('X0300005', '電機機械'),
    ('X0300006', '電器電纜'),
    ('X0300012', '科技相關'),
    ('X0300020', '生技醫療保健'),
    ('X1900238', 'Apple Watch'),
    ('X1900273', 'FinTech'),
])

In [3]:
# Registry of the classifiers compared in this notebook, keyed by a short id.
# Each entry holds:
#   'path'  - file-name template for the persisted model; '{}' is filled with the topic code
#   'model' - the estimator object, None until it has been loaded or trained
#   'name'  - label printed in the accuracy table
models = {
    # The 'lr' estimator is a LogisticRegression classifier, so it is labelled as such
    # (the former 'linear regression' label named a different, non-classifying model).
    'lr':{'path':'{}_lr.model', 'model':None, 'name':'logistic regression'},
    'svm':{'path':'{}_svm.model', 'model':None, 'name':'SVM'},
    'lsvc':{'path':'{}_LinearSVC.model', 'model':None, 'name':'Linear SVC'},
    'nb':{'path':'{}_NaiveBayes.model', 'model':None, 'name':'Naive Bayes'},
    'knn':{'path':'{}_KNN.model', 'model':None, 'name':'KNN'},
    'tree':{'path':'{}_DecisionTrees.model', 'model':None,'name':'Decision Tree'},
    'esb':{'path':'{}_ensemble.model', 'model':None, 'name':'Ensemble'}
}

In [4]:
# Topic code being processed and the file locations derived from it.
topic = 'X0300020'
data_dir = './data/' + topic
dict_path = os.path.join(data_dir, topic + '_dict.pkl')

In [5]:
# Create the bag-of-words helper for the 'dataset' folder under the topic's data directory.
dataset_dir = os.path.join(data_dir, 'dataset')
BOW = BagOfWords.BagOfWords(dataset_dir)

User Dict loaded.


<h3>Initialize the word dictionary of the BOW model</h3>

In [6]:
# Reuse the dictionary file when it is already on disk;
# otherwise build the dictionary and save it to that path for next time.
if not os.path.isfile(dict_path):
    BOW.build_dictionary()
    BOW.save_dictionary(dict_path)
else:
    BOW.load_dictionary(dict_path)

loaded dictionary from ./data/X0300020\X0300020_dict.pkl
done


<h3>Load data</h3>

In [7]:
## LOAD DATA
# Turn the dataset folder into feature vectors X and labels y.
X, y = BOW.transform_data(os.path.join(data_dir, 'dataset'), balance=True)

# Hold out 25% of the samples for testing, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# A freshly trained model is written to disk only when its accuracy reaches this value.
threshold = 0.86

transforming data in to bag of words vector
done


In [8]:
# Show the sample count reported by the BOW object
# (presumably set while transforming the dataset above — confirm in BagOfWords).
print(BOW.num_samples)

9102


<h3>Load each model if it already exists; otherwise train it.</h3>

In [9]:
# For every entry in `models`: load the persisted estimator if its file exists,
# otherwise train a new one on the training split. Each estimator is then scored
# on the test split, and a freshly trained one is saved when its accuracy reaches
# `threshold`. The accuracies are printed as a text table at the end.

def _build_voting_ensemble():
    """Return an unfitted soft-voting ensemble of five seeded base classifiers."""
    return VotingClassifier(
        estimators=[
            ('lr', linear_model.LogisticRegression(random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            # This member is a decision tree, so it is named 'dt' rather than 'lsvm'.
            ('dt', tree.DecisionTreeClassifier(random_state=42)),
            ('nb', MultinomialNB()),
            ('extree', ExtraTreesClassifier(random_state=42))
        ], voting='soft')


# One zero-argument factory per key of `models`. Estimators that accept a seed get
# the same random_state (42) as the ensemble members and the train/test split, so a
# re-run on a fresh kernel trains the same models.
estimator_factories = {
    'lr': lambda: linear_model.LogisticRegression(C=1e5, random_state=42),
    'svm': SVC,
    'lsvc': lambda: LinearSVC(random_state=42),
    'nb': MultinomialNB,
    'knn': lambda: KNeighborsClassifier(n_neighbors=5),
    'tree': lambda: tree.DecisionTreeClassifier(random_state=42),
    'esb': _build_voting_ensemble,
}

t = Texttable()
rows = [['model', 'accuracy']]  # first row is the table header
for m, entry in models.items():
    modelPath = os.path.join(data_dir, entry['path'].format(topic))

    # Decide once whether this iteration trains; the same flag gates saving below.
    trained_now = not os.path.isfile(modelPath)
    if trained_now:
        if m not in estimator_factories:
            # Fail loudly instead of reusing the estimator left over from the
            # previous iteration under the wrong name.
            raise KeyError('no estimator defined for model key {!r}'.format(m))
        model = estimator_factories[m]()
        model.fit(X_train, y_train)
        entry['model'] = model
    else:
        entry['model'] = BOW.loadModel(modelPath)

    score = accuracy_score(y_test, entry['model'].predict(X_test))
    rows.append([entry['name'], score])

    # Only newly trained models that reach the threshold are written to disk;
    # models loaded from disk are never re-saved.
    if trained_now and score >= threshold:
        BOW.saveModel(entry['model'], modelPath)
t.add_rows(rows)
print(t.draw())

Load model from ./data/X0300020\X0300020_lr.model
Load model from ./data/X0300020\X0300020_LinearSVC.model
Load model from ./data/X0300020\X0300020_NaiveBayes.model
Load model from ./data/X0300020\X0300020_DecisionTrees.model
Load model from ./data/X0300020\X0300020_ensemble.model
+-------------------+----------+
|       model       | accuracy |
| linear regression | 0.951    |
+-------------------+----------+
| SVM               | 0.821    |
+-------------------+----------+
| Linear SVC        | 0.957    |
+-------------------+----------+
| Naive Bayes       | 0.880    |
+-------------------+----------+
| KNN               | 0.787    |
+-------------------+----------+
| Decision Tree     | 0.945    |
+-------------------+----------+
| Ensemble          | 0.958    |
+-------------------+----------+
