In [49]:
import os

DIR_PATH = './'
DATA_TRAIN_PATH = os.path.join(DIR_PATH, 'data/10_cate/Train_Full/')
DATA_TEST_PATH = os.path.join(DIR_PATH, 'data/10_cate/Test_Full/')
DATA_TRAIN_JSON = os.path.join(DIR_PATH, 'data/json/data_train.json')
DATA_TEST_JSON = os.path.join(DIR_PATH, 'data/json/data_test.json')
STOP_WORDS = os.path.join(DIR_PATH, 'stopwords-nlp-vi.txt')
SPECIAL_CHARACTER = '0123456789%@$.,=+-!;/()*"&^:#|\n\t\''
DICTIONARY_PATH = './model/another_topic/dictionary.txt'
MODEL_PATH ='model/linear_svc_model.pkl'
ESTIMATE_PATH ='model/another_topic/estimator_01.h5' 
LABEL_BIN_PATH ='model/another_topic/mlb_01.pkl'

In [38]:
import json
from gensim import corpora
import pickle as cPickle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from pyvi import ViTokenizer
from gensim import matutils

class FileStore(object):
    def __init__(self, file_path, data = None):
        self.file_path = file_path
        self.data = data

    def store_json(self):
        with open(self.file_path, 'w') as outfile:
            json.dump(self.data, outfile)

    def store_dictionary(self, dict_words):
        dictionary = corpora.Dictionary(dict_words)
        dictionary.filter_extremes(no_below=20, no_above=0.3)
        dictionary.save_as_text(self.file_path)

    def save_pickle(self,  obj):
        outfile = open(self.file_path, 'wb')
        fastPickler = cPickle.Pickler(outfile, 4)
        fastPickler.fast = 1
        fastPickler.dump(obj)
        outfile.close()

class Classifier(object):
    def __init__(self, features_train = None, labels_train = None, features_test = None, labels_test = None,  estimator = LinearSVC(random_state=0)):
        self.features_train = features_train
        self.features_test = features_test
        self.labels_train = labels_train
        self.labels_test = labels_test
        self.estimator = estimator
    
    def load_model(self, model_path):  
        estimator_file = open(model_path, "rb") 
        self.estimator = cPickle.Unpickler(estimator_file).load() 
        estimator_file.close()
        # self.test("abc")
        self.__training_result()

    def training(self):
        self.estimator.fit(self.features_train, self.labels_train)
        self.__training_result()

    def save_model(self):
        print(type(object),"model saved")
        FileStore(file_path=MODEL_PATH).save_pickle(obj=object)
        FileStore(file_path=ESTIMATE_PATH).save_pickle(obj=self.estimator)

    def __training_result(self):
        y_true, y_pred = self.labels_test, self.estimator.predict(self.features_test)
        print(classification_report(y_true, y_pred))

class FileReader(object):
    def __init__(self, file_path, encoder = None):
        self.file_path = file_path
        self.encoder = encoder if encoder != None else 'utf-16le'

    def read(self):
        with open(self.file_path,'rb') as f:
            s = f.read() 
        return s

    def content(self):
        s = self.read() 
        return s.decode(self.encoder)

    def read_json(self):
        with open(self.file_path) as f:
            s = json.load(f)
        return s

    def read_stopwords(self):
        with open(self.file_path, 'r') as f:
            stopwords = set([w.strip().replace(' ', '_') for w in f.readlines()])
        return stopwords

    def load_dictionary(self):
        return corpora.Dictionary.load_from_text(self.file_path)

    def load_model(self):
        with open(self.file_path, mode='rb') as f:
            model = Classifier(cPickle.load(f))
            f.close() 
        return model

    def load_estimator(self): 
        #print(self.file_path)
        with open(self.file_path, mode='rb') as f:
            estimator = cPickle.load(f)
            f.close()
        return estimator
    


In [31]:
class NLP(object):
    def __init__(self, text = None):
        self.text = text
        self.__set_stopwords()

    def __set_stopwords(self):
        self.stopwords = FileReader(STOP_WORDS).read_stopwords()

    def segmentation(self):
        return ViTokenizer.tokenize(self.text)

    def split_words(self):
        text = self.segmentation()
        try:
            return [x.strip(SPECIAL_CHARACTER).lower() for x in text.split()]
        except TypeError:
            return []

    def get_words_feature(self):
        split_words = self.split_words()
        return [word for word in split_words if word.encode('utf-8') not in self.stopwords]

In [47]:
class FeatureExtraction(object):
    def __init__(self, data=None):
        self.data = data

    def __build_dictionary(self):
        print('Building dictionary')
        dict_words = []
        i = 0
        print( self.data)
        for text in self.data:
            i += 1
            print("Step {} / {}".format(i, len(self.data)))
            words = NLP(text = text['content']).get_words_feature()
            dict_words.append(words)
        FileStore(file_path=DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        #print(" os.path.exists(DICTIONARY_PATH) == False: " +  str(os.path.exists(DICTIONARY_PATH) == False))
        if os.path.exists(DICTIONARY_PATH) == False:
            self.__build_dictionary()
        self.dictionary = FileReader(DICTIONARY_PATH).load_dictionary()

    def __build_dataset(self):
        self.features = []
        self.labels = []
        i = 0
        for d in self.data:
            i += 1
            if i%500 == 0:
                print("Step {} / {}".format(i, len(self.data)))
            self.features.append(self.get_dense(d['content']))
            self.labels.append(d['category'])

    def get_dense(self, text):
        self.__load_dictionary()
        words = NLP(text).get_words_feature()
        # Bag of words
        vec = self.dictionary.doc2bow(words) 
        dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
        return dense

    def get_data_and_label(self):
        self.__build_dataset()
        return self.features, self.labels

In [50]:

text = u" Bộ Y tế chiều 6/4 ghi nhận 11 ca dương tính nCoV, đều được cách ly ngay sau nhập cảnh tại Long An, Cà Mau, Đà Nẵng và Tây Ninh."

a = FeatureExtraction().get_dense(text)

estimator =  FileReader(file_path='./model/another_topic/estimator_01.h5').load_estimator()  
    
#pred = estimator.predict([a]) 

 os.path.exists(DICTIONARY_PATH) == False: False
./model/another_topic/estimator_01.h5


UnpicklingError: invalid load key, 'H'.

In [52]:
import os
os.path.exist('./model/another_topic/dictionary.txt')

AttributeError: module 'posixpath' has no attribute 'exist'