In [1]:
from __future__ import print_function
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
from pyvi import ViTokenizer
import re
import string
import codecs

In [2]:
VN_CHARS_LOWER = u'ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđð'
VN_CHARS_UPPER = u'ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸÐĐ'
VN_CHARS = VN_CHARS_LOWER + VN_CHARS_UPPER

# Unaccented base letter for each character of VN_CHARS, position by position:
# 17 a-variants, 17 o, 11 e, 11 u, 5 i, 5 y, 2 d -- lower case first, then the
# same layout in upper case.
_VN_BASE_LOWER = "a"*17 + "o"*17 + "e"*11 + "u"*11 + "i"*5 + "y"*5 + "d"*2
_VN_BASE = _VN_BASE_LOWER + _VN_BASE_LOWER.upper()

# Built once when the cell runs. Previously a regex with one alternative per
# character and the lookup dict were rebuilt on every call, and no_marks is
# called once per training document.
_NO_MARKS_TABLE = str.maketrans(dict(zip(VN_CHARS, _VN_BASE)))


def no_marks(s):
    """Return ``s`` with Vietnamese accented letters replaced by ASCII letters.

    Every character listed in ``VN_CHARS`` is mapped to its unaccented base
    letter (case preserved); all other characters are left untouched.

    Args:
        s: Text to convert.

    Returns:
        A new string of the same length as ``s``.
    """
    return s.translate(_NO_MARKS_TABLE)

def normalize_text(text):
    """Normalise one raw review string before vectorisation.

    Steps, in order:
      1. collapse every run of a repeated ASCII letter to a single letter;
      2. lower-case the text;
      3. apply every ``replace_list`` entry as a plain substring replacement;
      4. turn every ``string.punctuation`` character into a space;
      5. run ``ViTokenizer.tokenize``, split on whitespace, replace ``_`` inside
         each token with a space and re-join the tokens with single spaces;
      6. replace ``"`` with a space and delete two leftover characters.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The normalised text as a ``str``.
    """

    # Collapse any run of a repeated ASCII letter (case-insensitive) to a single
    # letter, e.g. the trailing p's in "đẹppppppp". The kept letter is
    # upper-cased here, but the whole text is lower-cased by the next statement.
    text = re.sub(r'([A-Z])\1+', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)

    # Convert to lowercase
    text = text.lower()

    # Normalise Vietnamese spelling, map emoji, normalise English words and slang.
    # NOTE(review): several entries below map a key to a visually identical
    # value; presumably the key is a different Unicode encoding of the same
    # letter -- confirm before editing or re-saving this table.
    replace_list = {
        'òa': 'oà', 'óa': 'oá', 'ỏa': 'oả', 'õa': 'oã', 'ọa': 'oạ', 'òe': 'oè', 'óe': 'oé','ỏe': 'oẻ',
        'õe': 'oẽ', 'ọe': 'oẹ', 'ùy': 'uỳ', 'úy': 'uý', 'ủy': 'uỷ', 'ũy': 'uỹ','ụy': 'uỵ', 'uả': 'ủa',
        'ả': 'ả', 'ố': 'ố', 'u´': 'ố','ỗ': 'ỗ', 'ồ': 'ồ', 'ổ': 'ổ', 'ấ': 'ấ', 'ẫ': 'ẫ', 'ẩ': 'ẩ',
        'ầ': 'ầ', 'ỏ': 'ỏ', 'ề': 'ề','ễ': 'ễ', 'ắ': 'ắ', 'ủ': 'ủ', 'ế': 'ế', 'ở': 'ở', 'ỉ': 'ỉ',
        'ẻ': 'ẻ', 'àk': u' à ','aˋ': 'à', 'iˋ': 'ì', 'ă´': 'ắ','ử': 'ử', 'e˜': 'ẽ', 'y˜': 'ỹ', 'a´': 'á',

        # Normalise some sentiment words / English words
        ':))': '  positive ', ':)': ' positive ', 'ô kêi': ' ok ', 'okie': ' ok ', ' o kê ': ' ok ',
        'okey': ' ok ', 'ôkê': ' ok ', 'oki': ' ok ', ' oke ':  ' ok ',' okay':' ok ','okê':' ok ',
        ' tks ': u' cám ơn ', 'thks': u' cám ơn ', 'thanks': u' cám ơn ', 'ths': u' cám ơn ', 'thank': u' cám ơn ',
        '⭐': 'star ', '*': 'star ', '🌟': 'star ', '🎉': u' 5star ',
        'kg ': u' không ','not': u' không ', u' kg ': u' không ', '"k ': u' không ',' kh ':u' không ','kô':u' không ','hok':u' không ',' kp ': u' không phải ',u' kô ': u' không ', '"ko ': u' không ', u' ko ': u' không ', u' k ': u' không ', 'khong': u' không ', u' hok ': u' không ',
        'he he': ' 5star ','hehe': ' 5star ','hihi': ' 5star ', 'haha': ' 5star ', 'hjhj': ' 5star ',
        ' lol ': ' 1star ',' cc ': ' 1star ','cute': u' dễ thương ','huhu': ' 1star ', ' vs ': u' với ', 'wa': ' quá ', 'wá': u' quá', 'j': u' gì ', '“': ' ',
        ' sz ': u' cỡ ', 'size': u' cỡ ', u' đx ': u' được ', 'dk': u' được ', 'dc': u' được ', 'đk': u' được ',
        'đc': u' được ','authentic': u' chuẩn chính hãng ',u' aut ': u' chuẩn chính hãng ', u' auth ': u' chuẩn chính hãng ', 'thick': u' positive ', 'store': u' cửa hàng ',
        'shop': u' cửa hàng ', 'sp': u' sản phẩm ', 'gud': u' tốt ','god': u' tốt ','wel done':' tốt ', 'good': u' tốt ', 'gút': u' tốt ','great': u' tốt ',
        'sấu': u' xấu ','gut': u' tốt ', u' tot ': u' tốt ', u' nice ': u' tốt ', 'perfect': 'rất tốt', 'bt': u' bình thường ',
        'time': u' thời gian ', 'qá': u' quá ', u' ship ': u' giao hàng ', u' m ': u' mình ', u' mik ': u' mình ',
        'ể': 'ể', 'product': 'sản phẩm', 'quality': 'chất lượng','chat':' chất ', 'excelent': 'hoàn hảo', 'bad': 'tệ','fresh': ' tươi ','sad': ' tệ ',
        'date': u' hạn sử dụng ', 'hsd': u' hạn sử dụng ','quickly': u' nhanh ', 'quick': u' nhanh ','fast': u' nhanh ','delivery': u' giao hàng ',u' síp ': u' giao hàng ',
        'beautiful': u' đẹp tuyệt vời ', u' tl ': u' trả lời ', u' r ': u' rồi ', u' shopE ': u' cửa hàng ',u' order ': u' đặt hàng ',
        'chất lg': u' chất lượng ',u' sd ': u' sử dụng ',u' dt ': u' điện thoại ',u' nt ': u' nhắn tin ',u' tl ': u' trả lời ',u' sài ': u' xài ',u'bjo':u' bao giờ ',
        'thik': u' thích ',u' sop ': u' cửa hàng ', ' fb ': ' facebook ', ' face ': ' facebook ', ' very ': u' rất ',u'quả ng ':u' quảng  ',
        'dep': u' đẹp ',u' xau ': u' xấu ','delicious': u' ngon ', u'hàg': u' hàng ', u'qủa': u' quả ',
        'iu': u' yêu ','fake': u' giả mạo ', 'trl': 'trả lời', '><': u' 5star ',
        ' por ': u' tệ ',' poor ': u' tệ ', 'ib':u' nhắn tin ', 'rep':u' trả lời ',u'fback':' feedback ','fedback':' feedback ',
        # Map icons to two classes, 1 star and 5 stars:
        "👹": "1star", "👻": "5star", "💃": "5star",'🤙': ' 5star ', '👍': ' 5star ',
        "💄": "5star", "💎": "5star", "💩": "5star","😕": "1star", "😱": "1star", "😸": "5star",
        "😾": "1star", "🚫": "1star",  "🤬": "1star","🧚": "5star", "🧡": "5star",'🐶':' 5star ',
        '👎': ' 1star ', '😣': ' 1star ','✨': ' 5star ', '❣': ' 5star ','☀': ' 5star ',
        '♥': ' 5star ', '🤩': ' 5star ', 'like': ' 5star ', '💌': ' 5star ',
        '🤣': ' 5star ', '🖤': ' 5star ', '🤤': ' 5star ', ':(': ' 1star ', '😢': ' 1star ',
        '❤': ' 5star ', '😍': ' 5star ', '😘': ' 5star ', '😪': ' 1star ', '😊': ' 5star ',
        '?': ' ? ', '😁': ' 5star ', '💖': ' 5star ', '😟': ' 1star ', '😭': ' 1star ',
        '💯': ' 5star ', '💗': ' 5star ', '♡': ' 5star ', '💜': ' 5star ', '🤗': ' 5star ',
        '^^': ' 5star ', '😨': ' 1star ', '☺': ' 5star ', '💋': ' 5star ', '👌': ' 5star ',
        '😖': ' 1star ', '😀': ' 5star ', ':((': ' 1star ', '😡': ' 1star ', '😠': ' 1star ',
        '😒': ' 1star ', '🙂': ' 5star ', '😏': ' 1star ', '😝': ' 5star ', '😄': ' 5star ',
        '😙': ' 5star ', '😤': ' 1star ', '😎': ' 5star ', '😆': ' 5star ', '💚': ' 5star ',
        '✌': ' 5star ', '💕': ' 5star ', '😞': ' 1star ', '😓': ' 1star ', '️🆗️': ' 5star ',
        '😉': ' 5star ', '😂': ' 5star ', ':v': '  5star ', '=))': '  5star ', '😋': ' 5star ',
        '💓': ' 5star ', '😐': ' 1star ', ':3': ' 5star ', '😫': ' 1star ', '😥': ' 1star ',
        '😃': ' 5star ', '😬': ' 😬 ', '😌': ' 😌 ', '💛': ' 5star ', '🤝': ' 5star ', '🎈': ' 5star ',
        '😗': ' 5star ', '🤔': ' 1star ', '😑': ' 1star ', '🔥': ' 1star ', '🙏': ' 1star ',
        '🆗': ' 5star ', '😻': ' 5star ', '💙': ' 5star ', '💟': ' 5star ',
        '😚': ' 5star ', '❌': ' 1star ', '👏': ' 5star ', ';)': ' 5star ', '<3': ' 5star ',
        '🌝': ' 5star ',  '🌷': ' 5star ', '🌸': ' 5star ', '🌺': ' 5star ',
        '🌼': ' 5star ', '🍓': ' 5star ', '🐅': ' 5star ', '🐾': ' 5star ', '👉': ' 5star ',
        '💐': ' 5star ', '💞': ' 5star ', '💥': ' 5star ', '💪': ' 5star ',
        '💰': ' 5star ',  '😇': ' 5star ', '😛': ' 5star ', '😜': ' 5star ',
        '🙃': ' 5star ', '🤑': ' 5star ', '🤪': ' 5star ','☹': ' 1star ',  '💀': ' 1star ',
        '😔': ' 1star ', '😧': ' 1star ', '😩': ' 1star ', '😰': ' 1star ', '😳': ' 1star ',
        '😵': ' 1star ', '😶': ' 1star ', '🙁': ' 1star ',
        # Star ratings written out in words or digits
        'năm sao': ' 5star ','5 sao': ' 5star ', '4 sao': ' 4star ','bốn sao': ' 4star ','3 sao': ' 3star ',
        'ba sao': ' 3star ', '2 sao': ' 2star ', 'hai sao': ' 2star ','1 sao':' 1star ','một sao':' 1star ',
        }

    # Plain substring replacement, applied in the dict's insertion order: an
    # earlier entry can consume text that a later key would otherwise match,
    # so the order of the table above is part of the behaviour.
    for k, v in replace_list.items():
        text = text.replace(k, v)

    # Replace every string.punctuation character with a space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)

    # Segment with pyvi; presumably multi-syllable words come back joined by
    # '_' -- confirm against the pyvi documentation.
    text = ViTokenizer.tokenize(text)
    texts = text.split()
    # NOTE(review): len_text is never read in this function.
    len_text = len(texts)

    # Turn underscores inside tokens back into spaces
    texts = [t.replace('_', ' ') for t in texts]


    text = u' '.join(texts)

    # Remove the remaining superfluous characters
    text = text.replace(u'"', u' ')
    # NOTE(review): the next literal is an invisible character, presumably the
    # emoji variation selector -- confirm its code point before editing.
    text = text.replace(u'️', u'')
    # Drop the emoji modifier character shown in the literal below
    text = text.replace('🏻','')
    return text

In [33]:
class DataSource(object):
    """Reads labelled review files and prepares them for vectorisation.

    All files are opened explicitly as UTF-8 so that Vietnamese text is decoded
    the same way on every platform instead of depending on the locale default.
    """

    def load_data(self, filename):
        """Load a training file with one ``<star> <review>`` record per line.

        The first character of a line is the label; everything from the third
        character onwards (trailing newline included) is the review text.
        Blank lines are skipped instead of producing a ``'\\n'`` label.

        Args:
            filename: Path of the UTF-8 text file to read.

        Returns:
            ``(star, review)``: two parallel lists of strings.
        """
        star = []
        review = []

        with open(filename, 'r', encoding='utf-8') as fp:
            for line in fp:
                if not line.strip():
                    continue
                star.append(line[0])
                review.append(line[2:])

        return star, review

    def load_test_data(self, filename):
        """Load a test file whose records carry one extra leading character.

        The label is the second character of a line and the review starts at
        the fourth; the first character is ignored (its meaning is not visible
        from this code). The trailing newline is removed from the review.
        Blank lines are skipped.

        Args:
            filename: Path of the UTF-8 text file to read.

        Returns:
            ``(star, review)``: two parallel lists of strings.
        """
        star = []
        review = []

        with open(filename, 'r', encoding='utf-8') as fp:
            for line in fp:
                if not line.strip():
                    continue
                star.append(line[1])
                # Only cut the last character when it really is the newline, so
                # a final line without a terminator keeps its last character.
                review.append(line[3:-1] if line.endswith('\n') else line[3:])

        return star, review

    def load_data_2(self, filename):
        """Load a file of ``<star><fff><review>`` records, one per line.

        Each line is split on the literal separator ``<fff>``; the first field
        is the label and the second field (trailing newline included) is the
        review. Blank lines are skipped.

        Args:
            filename: Path of the UTF-8 text file to read.

        Returns:
            ``(star, review)``: two parallel lists of strings.

        Raises:
            IndexError: If a non-blank line does not contain ``<fff>``.
        """
        star = []
        review = []
        with open(filename, 'r', encoding='utf-8') as csv_file:
            for row in csv_file:
                if not row.strip():
                    continue
                fields = row.split('<fff>')
                star.append(fields[0])
                review.append(fields[1])
        return star, review

    def transform_to_dataset(self, x_set, y_set):
        """Normalise every document and pair it with its label.

        Args:
            x_set: Iterable of raw review strings.
            y_set: Iterable of labels, parallel to ``x_set``.

        Returns:
            ``(X, y)``: the normalised, stripped documents and their labels.
        """
        X, y = [], []
        for document, topic in zip(x_set, y_set):
            X.append(normalize_text(document).strip())
            y.append(topic)
        return X, y

    def transform_to_dataset_with_augmentation(self, x_set, y_set):
        """Normalise every document and add an accent-free copy of each.

        For each input document two samples with the same label are emitted,
        in this order: the normalised text, then the same text passed through
        ``no_marks`` (Vietnamese accents removed).

        Args:
            x_set: Iterable of raw review strings.
            y_set: Iterable of labels, parallel to ``x_set``.

        Returns:
            ``(X, y)``: lists twice as long as the input.
        """
        X, y = [], []
        for document, topic in zip(x_set, y_set):
            # Strip once so the original and the augmented sample agree.
            cleaned = normalize_text(document).strip()
            X.append(cleaned)
            y.append(topic)
            # Augmentation: the same text with Vietnamese accents removed
            X.append(no_marks(cleaned))
            y.append(topic)
        return X, y

    def get_predicted_result(self, star, review, filename):
        """Append ``<star> <review>`` lines to ``filename``.

        The file is opened in append mode, so existing content is kept and
        calling this twice with the same data writes the lines twice.

        Args:
            star: Sequence of predicted labels (each converted with ``str``).
            review: Sequence of review strings, indexable in parallel to
                ``star``.
            filename: Path of the UTF-8 text file to append to.
        """
        # The context manager closes the file even if a write fails.
        with open(filename, "a", encoding='utf-8') as f:
            for i in range(len(star)):
                f.write(str(star[i]) + " " + review[i] + "\n")


In [34]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_FILE = 'train.txt'
BAG_FILE = 'bag_text.txt'
TEST_FILE = 'test.txt'

# One loader instance is shared by all later cells.
ds = DataSource()

# Labelled reviews used for training (two sources with different line formats)
# and the reviews to predict on.
star, review = ds.load_data(TRAIN_FILE)
star2, review2 = ds.load_data_2(BAG_FILE)
star_test, review_test = ds.load_test_data(TEST_FILE)


In [36]:
# No hold-out split is made here: every labelled review goes into training.

# How many times each labelled corpus is repeated in the training set.
OVERSAMPLE_FACTOR = 50

# Normalise and augment every distinct document once, then replicate the
# result. Replicating the raw text first would push each document through
# normalize_text and the tokenizer OVERSAMPLE_FACTOR times for the same output.
# The resulting lists are identical, element for element, to transforming
# `review * 50 + review2 * 50` in one go.
X_reviews, y_reviews = ds.transform_to_dataset_with_augmentation(review, star)
X_bag, y_bag = ds.transform_to_dataset_with_augmentation(review2, star2)

X_train = X_reviews * OVERSAMPLE_FACTOR + X_bag * OVERSAMPLE_FACTOR
y_train = y_reviews * OVERSAMPLE_FACTOR + y_bag * OVERSAMPLE_FACTOR

# The test reviews are only normalised; no augmentation is applied.
X_test, y_test = ds.transform_to_dataset(review_test, star_test)

In [38]:
# Seed shared by the stochastic estimators below so that a Restart & Run All
# reproduces the same predictions.
RANDOM_STATE = 42

# Candidate Vietnamese stop words. NOTE(review): only the disabled
# CountVectorizer pipeline refers to them; the active TfidfVectorizer does not.
stop_ws = (u'rằng',u'thì',u'là')

# Models to compare; each one is fitted and writes its own result file.
# Disabled candidates: DecisionTreeClassifier(), LogisticRegression(),
# RandomForestClassifier().
classifiers = [
            MultinomialNB(),
            # SGD shuffles the training data each epoch, so it needs a seed.
            SGDClassifier(random_state=RANDOM_STATE),
            # Seeded as well; the underlying solver presumably shuffles the
            # data -- confirm in the LinearSVC documentation.
            LinearSVC(fit_intercept = True,multi_class='crammer_singer', C=1,
                      random_state=RANDOM_STATE),
        ]

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Learn the TF-IDF vocabulary (1- to 4-grams) from the training text only, then
# project the test text into the same feature space.
vectorizer = TfidfVectorizer(ngram_range=(1,4))
vectors = vectorizer.fit_transform(X_train)
vectors1 = vectorizer.transform(X_test)

# Fit each candidate model and write its predictions to result_<n>.txt,
# numbered from 1 in the order of `classifiers`.
for i, classifier in enumerate(classifiers, start=1):
    filename = "result_" + str(i) + ".txt"

    classifier.fit(vectors, y_train)
    y_pred = classifier.predict(vectors1)

    # Distribution of predicted labels, as a quick sanity check.
    print(Counter(y_pred))

    # get_predicted_result appends to the file, so empty it first: re-running
    # this cell must not pile duplicate predictions onto an old result file.
    with open(filename, "w"):
        pass
    # Predictions are paired with the raw (un-normalised) test reviews.
    ds.get_predicted_result(y_pred, review_test, filename)

# NOTE(review): y_test is built earlier but never scored here; accuracy_score
# and confusion_matrix are already imported if an evaluation step is wanted.

Counter({'5': 450, '4': 99, '3': 31, '1': 19, '2': 1})
Counter({'5': 400, '4': 114, '3': 61, '1': 24, '2': 1})
Counter({'5': 412, '4': 108, '3': 55, '1': 23, '2': 2})


