In [1]:
from __future__ import print_function

import codecs
import re
import string
from collections import Counter

import pandas as pd
from pyvi import ViTokenizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Vietnamese letters that carry diacritics, grouped by base letter in the order
# a(17) o(17) e(11) u(11) i(5) y(5) d(2). Each entry must be ONE precomposed
# code point: the ASCII replacement strings below are aligned position by
# position with these tables.
VN_CHARS_LOWER = u'ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđð'
VN_CHARS_UPPER = u'ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸÐĐ'
VN_CHARS = VN_CHARS_LOWER + VN_CHARS_UPPER

# ASCII base letter for every position of VN_CHARS.
_VN_ASCII_LOWER = "a"*17 + "o"*17 + "e"*11 + "u"*11 + "i"*5 + "y"*5 + "d"*2
_VN_ASCII = _VN_ASCII_LOWER + _VN_ASCII_LOWER.upper()

# Built once at import time. str.maketrans raises ValueError when the two
# strings differ in length, so a corrupted (e.g. mis-encoded) character table
# fails loudly here instead of silently producing a misaligned mapping.
_NO_MARKS_TABLE = str.maketrans(VN_CHARS, _VN_ASCII)


def no_marks(s):
    """Return ``s`` with Vietnamese diacritics removed.

    Every character listed in ``VN_CHARS`` is replaced by its unaccented ASCII
    base letter (case preserved); all other characters are left untouched.

    Args:
        s: Text to strip of tone/diacritic marks.

    Returns:
        A new string of the same length as ``s``.
    """
    return s.translate(_NO_MARKS_TABLE)

def normalize_text(text: str) -> str:
    """Normalize one raw review string before vectorization.

    Steps, in order:
      1. Collapse runs of a repeated ASCII letter to a single letter.
      2. Lowercase the text.
      3. Apply every ``replace_list`` entry with ``str.replace`` in dict
         insertion order (spelling fixes, slang/English words, emoticons and
         emoji mapped to ``Nstar``/``positive`` tokens).
      4. Turn every ``string.punctuation`` character into a space.
      5. Run ``ViTokenizer.tokenize``, split on whitespace, replace ``_`` inside
         tokens with a space, and re-join with single spaces.
      6. Drop a few leftover characters.

    Args:
        text: Raw review text.

    Returns:
        The normalized text. It is not stripped; callers strip it themselves.
    """

    # Collapse elongated characters, e.g. "depppppp" -> "dep". Only ASCII
    # letters are matched (case-insensitively), and legitimate double letters
    # are collapsed as well (e.g. "good" -> "god"). The kept letter is
    # upper-cased here and lower-cased again by the next step.
    text = re.sub(r'([A-Z])\1+', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)

    # Convert to lowercase
    text = text.lower()

    # Normalize Vietnamese spelling, handle emoji, normalize English words and jargon.
    # NOTE(review): the non-ASCII keys/values below look like UTF-8 text that was
    # decoded with a legacy 8-bit codec (mojibake) - confirm the file's encoding.
    # NOTE(review): entries are applied in insertion order, so an earlier key can
    # consume text a later key needs: ':((' comes after ':(' and can no longer
    # match; ' shopE ' contains an uppercase letter but the text is already
    # lowercased; ' tl ' is listed twice with the same value.
    replace_list = {
        '√≤a': 'o√†', '√≥a': 'o√°', '·ªèa': 'o·∫£', '√µa': 'o√£', '·ªça': 'o·∫°', '√≤e': 'o√®', '√≥e': 'o√©','·ªèe': 'o·∫ª',
        '√µe': 'o·∫Ω', '·ªçe': 'o·∫π', '√πy': 'u·ª≥', '√∫y': 'u√Ω', '·ªßy': 'u·ª∑', '≈©y': 'u·ªπ','·ª•y': 'u·ªµ', 'u·∫£': '·ªßa',
        'aÃâ': '·∫£', '√¥ÃÅ': '·ªë', 'u¬¥': '·ªë','√¥ÃÉ': '·ªó', '√¥ÃÄ': '·ªì', '√¥Ãâ': '·ªï', '√¢ÃÅ': '·∫•', '√¢ÃÉ': '·∫´', '√¢Ãâ': '·∫©',
        '√¢ÃÄ': '·∫ß', 'oÃâ': '·ªè', '√™ÃÄ': '·ªÅ','√™ÃÉ': '·ªÖ', 'ƒÉÃÅ': '·∫Ø', 'uÃâ': '·ªß', '√™ÃÅ': '·∫ø', '∆°Ãâ': '·ªü', 'iÃâ': '·ªâ',
        'eÃâ': '·∫ª', '√†k': u' √† ','aÀã': '√†', 'iÀã': '√¨', 'ƒÉ¬¥': '·∫Ø','∆∞Ãâ': '·ª≠', 'eÀú': '·∫Ω', 'yÀú': '·ªπ', 'a¬¥': '√°',

        # Normalize some sentiment words / English words
        ':))': '  positive ', ':)': ' positive ', '√¥ k√™i': ' ok ', 'okie': ' ok ', ' o k√™ ': ' ok ',
        'okey': ' ok ', '√¥k√™': ' ok ', 'oki': ' ok ', ' oke ':  ' ok ',' okay':' ok ','ok√™':' ok ',
        ' tks ': u' c√°m ∆°n ', 'thks': u' c√°m ∆°n ', 'thanks': u' c√°m ∆°n ', 'ths': u' c√°m ∆°n ', 'thank': u' c√°m ∆°n ',
        '‚≠ê': 'star ', '*': 'star ', 'üåü': 'star ', 'üéâ': u' 5star ',
        'kg ': u' kh√¥ng ','not': u' kh√¥ng ', u' kg ': u' kh√¥ng ', '"k ': u' kh√¥ng ',' kh ':u' kh√¥ng ','k√¥':u' kh√¥ng ','hok':u' kh√¥ng ',' kp ': u' kh√¥ng ph·∫£i ',u' k√¥ ': u' kh√¥ng ', '"ko ': u' kh√¥ng ', u' ko ': u' kh√¥ng ', u' k ': u' kh√¥ng ', 'khong': u' kh√¥ng ', u' hok ': u' kh√¥ng ',
        'he he': ' 5star ','hehe': ' 5star ','hihi': ' 5star ', 'haha': ' 5star ', 'hjhj': ' 5star ',
        ' lol ': ' 1star ',' cc ': ' 1star ','cute': u' d·ªÖ th∆∞∆°ng ','huhu': ' 1star ', ' vs ': u' v·ªõi ', 'wa': ' qu√° ', 'w√°': u' qu√°', 'j': u' g√¨ ', '‚Äú': ' ',
        ' sz ': u' c·ª° ', 'size': u' c·ª° ', u' ƒëx ': u' ƒë∆∞·ª£c ', 'dk': u' ƒë∆∞·ª£c ', 'dc': u' ƒë∆∞·ª£c ', 'ƒëk': u' ƒë∆∞·ª£c ',
        'ƒëc': u' ƒë∆∞·ª£c ','authentic': u' chu·∫©n ch√≠nh h√£ng ',u' aut ': u' chu·∫©n ch√≠nh h√£ng ', u' auth ': u' chu·∫©n ch√≠nh h√£ng ', 'thick': u' positive ', 'store': u' c·ª≠a h√†ng ',
        'shop': u' c·ª≠a h√†ng ', 'sp': u' s·∫£n ph·∫©m ', 'gud': u' t·ªët ','god': u' t·ªët ','wel done':' t·ªët ', 'good': u' t·ªët ', 'g√∫t': u' t·ªët ','great': u' t·ªët ',
        's·∫•u': u' x·∫•u ','gut': u' t·ªët ', u' tot ': u' t·ªët ', u' nice ': u' t·ªët ', 'perfect': 'r·∫•t t·ªët', 'bt': u' b√¨nh th∆∞·ªùng ',
        'time': u' th·ªùi gian ', 'q√°': u' qu√° ', u' ship ': u' giao h√†ng ', u' m ': u' m√¨nh ', u' mik ': u' m√¨nh ',
        '√™Ãâ': '·ªÉ', 'product': 's·∫£n ph·∫©m', 'quality': 'ch·∫•t l∆∞·ª£ng','chat':' ch·∫•t ', 'excelent': 'ho√†n h·∫£o', 'bad': 't·ªá','fresh': ' t∆∞∆°i ','sad': ' t·ªá ',
        'date': u' h·∫°n s·ª≠ d·ª•ng ', 'hsd': u' h·∫°n s·ª≠ d·ª•ng ','quickly': u' nhanh ', 'quick': u' nhanh ','fast': u' nhanh ','delivery': u' giao h√†ng ',u' s√≠p ': u' giao h√†ng ',
        'beautiful': u' ƒë·∫πp tuy·ªát v·ªùi ', u' tl ': u' tr·∫£ l·ªùi ', u' r ': u' r·ªìi ', u' shopE ': u' c·ª≠a h√†ng ',u' order ': u' ƒë·∫∑t h√†ng ',
        'ch·∫•t lg': u' ch·∫•t l∆∞·ª£ng ',u' sd ': u' s·ª≠ d·ª•ng ',u' dt ': u' ƒëi·ªán tho·∫°i ',u' nt ': u' nh·∫Øn tin ',u' tl ': u' tr·∫£ l·ªùi ',u' s√†i ': u' x√†i ',u'bjo':u' bao gi·ªù ',
        'thik': u' th√≠ch ',u' sop ': u' c·ª≠a h√†ng ', ' fb ': ' facebook ', ' face ': ' facebook ', ' very ': u' r·∫•t ',u'qu·∫£ ng ':u' qu·∫£ng  ',
        'dep': u' ƒë·∫πp ',u' xau ': u' x·∫•u ','delicious': u' ngon ', u'h√†g': u' h√†ng ', u'q·ªßa': u' qu·∫£ ',
        'iu': u' y√™u ','fake': u' gi·∫£ m·∫°o ', 'trl': 'tr·∫£ l·ªùi', '><': u' 5star ',
        ' por ': u' t·ªá ',' poor ': u' t·ªá ', 'ib':u' nh·∫Øn tin ', 'rep':u' tr·∫£ l·ªùi ',u'fback':' feedback ','fedback':' feedback ',
        # Map the icons onto two classes: 1 star and 5 star
        "üëπ": "1star", "üëª": "5star", "üíÉ": "5star",'ü§ô': ' 5star ', 'üëç': ' 5star ',
        "üíÑ": "5star", "üíé": "5star", "üí©": "5star","üòï": "1star", "üò±": "1star", "üò∏": "5star",
        "üòæ": "1star", "üö´": "1star",  "ü§¨": "1star","üßö": "5star", "üß°": "5star",'üê∂':' 5star ',
        'üëé': ' 1star ', 'üò£': ' 1star ','‚ú®': ' 5star ', '‚ù£': ' 5star ','‚òÄ': ' 5star ',
        '‚ô•': ' 5star ', 'ü§©': ' 5star ', 'like': ' 5star ', 'üíå': ' 5star ',
        'ü§£': ' 5star ', 'üñ§': ' 5star ', 'ü§§': ' 5star ', ':(': ' 1star ', 'üò¢': ' 1star ',
        '‚ù§': ' 5star ', 'üòç': ' 5star ', 'üòò': ' 5star ', 'üò™': ' 1star ', 'üòä': ' 5star ',
        '?': ' ? ', 'üòÅ': ' 5star ', 'üíñ': ' 5star ', 'üòü': ' 1star ', 'üò≠': ' 1star ',
        'üíØ': ' 5star ', 'üíó': ' 5star ', '‚ô°': ' 5star ', 'üíú': ' 5star ', 'ü§ó': ' 5star ',
        '^^': ' 5star ', 'üò®': ' 1star ', '‚ò∫': ' 5star ', 'üíã': ' 5star ', 'üëå': ' 5star ',
        'üòñ': ' 1star ', 'üòÄ': ' 5star ', ':((': ' 1star ', 'üò°': ' 1star ', 'üò†': ' 1star ',
        'üòí': ' 1star ', 'üôÇ': ' 5star ', 'üòè': ' 1star ', 'üòù': ' 5star ', 'üòÑ': ' 5star ',
        'üòô': ' 5star ', 'üò§': ' 1star ', 'üòé': ' 5star ', 'üòÜ': ' 5star ', 'üíö': ' 5star ',
        '‚úå': ' 5star ', 'üíï': ' 5star ', 'üòû': ' 1star ', 'üòì': ' 1star ', 'Ô∏èüÜóÔ∏è': ' 5star ',
        'üòâ': ' 5star ', 'üòÇ': ' 5star ', ':v': '  5star ', '=))': '  5star ', 'üòã': ' 5star ',
        'üíì': ' 5star ', 'üòê': ' 1star ', ':3': ' 5star ', 'üò´': ' 1star ', 'üò•': ' 1star ',
        'üòÉ': ' 5star ', 'üò¨': ' üò¨ ', 'üòå': ' üòå ', 'üíõ': ' 5star ', 'ü§ù': ' 5star ', 'üéà': ' 5star ',
        'üòó': ' 5star ', 'ü§î': ' 1star ', 'üòë': ' 1star ', 'üî•': ' 1star ', 'üôè': ' 1star ',
        'üÜó': ' 5star ', 'üòª': ' 5star ', 'üíô': ' 5star ', 'üíü': ' 5star ',
        'üòö': ' 5star ', '‚ùå': ' 1star ', 'üëè': ' 5star ', ';)': ' 5star ', '<3': ' 5star ',
        'üåù': ' 5star ',  'üå∑': ' 5star ', 'üå∏': ' 5star ', 'üå∫': ' 5star ',
        'üåº': ' 5star ', 'üçì': ' 5star ', 'üêÖ': ' 5star ', 'üêæ': ' 5star ', 'üëâ': ' 5star ',
        'üíê': ' 5star ', 'üíû': ' 5star ', 'üí•': ' 5star ', 'üí™': ' 5star ',
        'üí∞': ' 5star ',  'üòá': ' 5star ', 'üòõ': ' 5star ', 'üòú': ' 5star ',
        'üôÉ': ' 5star ', 'ü§ë': ' 5star ', 'ü§™': ' 5star ','‚òπ': ' 1star ',  'üíÄ': ' 1star ',
        'üòî': ' 1star ', 'üòß': ' 1star ', 'üò©': ' 1star ', 'üò∞': ' 1star ', 'üò≥': ' 1star ',
        'üòµ': ' 1star ', 'üò∂': ' 1star ', 'üôÅ': ' 1star ',
        # Explicit star ratings written out in words or digits
        'nƒÉm sao': ' 5star ','5 sao': ' 5star ', '4 sao': ' 4star ','b·ªën sao': ' 4star ','3 sao': ' 3star ',
        'ba sao': ' 3star ', '2 sao': ' 2star ', 'hai sao': ' 2star ','1 sao':' 1star ','m·ªôt sao':' 1star ',
        }

    # Plain substring replacement, one entry at a time, in insertion order.
    for k, v in replace_list.items():
        text = text.replace(k, v)

    # Convert every punctuation character to a space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)

    # presumably pyvi joins the syllables of a compound word with '_' - verify
    # against the pyvi documentation
    text = ViTokenizer.tokenize(text)
    texts = text.split()
    # NOTE(review): len_text is never read afterwards.
    len_text = len(texts)

    # Undo the '_' joining so each token is space-separated again.
    texts = [t.replace('_', ' ') for t in texts]
    

    text = u' '.join(texts)

    # Remove the remaining superfluous characters
    text = text.replace(u'"', u' ')
    text = text.replace(u'Ô∏è', u'')
    text = text.replace('üèª','')
    return text

In [33]:
class DataSource(object):
    """Loads the labelled review files and turns them into model-ready lists.

    All file access uses an explicit text encoding (UTF-8 by default) so the
    Vietnamese text is read and written identically on every platform instead
    of depending on the locale's default encoding.
    """

    def load_data(self, filename, encoding='utf-8'):
        """Read a training file whose lines look like ``<star> <review>``.

        The label is the first character of the line and the review is
        everything from the third character on (the trailing newline is kept,
        callers strip it later). Blank lines are skipped.

        Args:
            filename: Path of the file to read.
            encoding: Text encoding of the file.

        Returns:
            ``(star, review)``: two parallel lists of strings.
        """
        star, review = [], []
        with open(filename, 'r', encoding=encoding) as fp:
            for line in fp:
                if not line.strip():
                    # A blank line would otherwise yield a bogus '\n' label.
                    continue
                star.append(line[0])
                review.append(line[2:])
        return star, review

    def load_test_data(self, filename, encoding='utf-8'):
        """Read a test file whose label sits at index 1 and review starts at index 3.

        The line terminator is removed from the review. Blank lines are skipped.

        Args:
            filename: Path of the file to read.
            encoding: Text encoding of the file.

        Returns:
            ``(star, review)``: two parallel lists of strings.
        """
        star, review = [], []
        with open(filename, 'r', encoding=encoding) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Only drop the last character when it really is the newline;
                # a final line without one must keep its last character.
                body = line[:-1] if line.endswith('\n') else line
                star.append(line[1])
                review.append(body[3:])
        return star, review

    def load_data_2(self, filename, encoding='utf-8'):
        """Read a file whose lines look like ``<star><fff><review>``.

        Each line is split on the literal ``<fff>`` marker; the first field is
        the label and the second the review (trailing newline kept). Blank
        lines are skipped.

        Args:
            filename: Path of the file to read.
            encoding: Text encoding of the file.

        Returns:
            ``(star, review)``: two parallel lists of strings.

        Raises:
            IndexError: If a non-blank line does not contain ``<fff>``.
        """
        star, review = [], []
        with open(filename, 'r', encoding=encoding) as fp:
            for row in fp:
                if not row.strip():
                    continue
                fields = row.split('<fff>')
                star.append(fields[0])
                review.append(fields[1])
        return star, review

    def transform_to_dataset(self, x_set, y_set):
        """Normalize every document and pair it with its label.

        Args:
            x_set: Iterable of raw documents.
            y_set: Iterable of labels, parallel to ``x_set``.

        Returns:
            ``(X, y)``: normalized, stripped documents and their labels.
        """
        X, y = [], []
        for document, topic in zip(x_set, y_set):
            X.append(normalize_text(document).strip())
            y.append(topic)
        return X, y

    def transform_to_dataset_with_augmentation(self, x_set, y_set):
        """Like ``transform_to_dataset`` but also adds a diacritic-free copy.

        For every input document two samples are emitted, in this order: the
        normalized text, then the same text passed through ``no_marks``. Both
        carry the same label.

        Args:
            x_set: Iterable of raw documents.
            y_set: Iterable of labels, parallel to ``x_set``.

        Returns:
            ``(X, y)`` with twice as many entries as the input.
        """
        X, y = [], []
        for document, topic in zip(x_set, y_set):
            # Strip once so the original and the augmented copy are consistent.
            document = normalize_text(document).strip()
            X.append(document)
            y.append(topic)
            # Augmentation: the same text with Vietnamese diacritics removed.
            X.append(no_marks(document))
            y.append(topic)
        return X, y

    def get_predicted_result(self, star, review, filename, encoding='utf-8'):
        """Append ``<star> <review>`` lines to ``filename``.

        The file is opened in append mode, so existing content is kept; callers
        that want a fresh file must remove or truncate it first.

        Args:
            star: Sequence of predicted labels (anything ``str()`` accepts).
            review: Sequence of review strings, indexed in parallel with ``star``.
            filename: Path of the output file.
            encoding: Text encoding used for writing.
        """
        # 'with' guarantees the handle is closed even if a write fails.
        with open(filename, 'a', encoding=encoding) as f:
            for i in range(len(star)):
                f.write(str(star[i]) + " " + review[i] + "\n")


In [34]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_FILE = 'train.txt'
EXTRA_TRAIN_FILE = 'bag_text.txt'   # '<fff>'-delimited, read by load_data_2
TEST_FILE = 'test.txt'

ds = DataSource()

# Each loader returns two parallel lists: labels first, review texts second.
star, review = ds.load_data(TRAIN_FILE)
star2, review2 = ds.load_data_2(EXTRA_TRAIN_FILE)
star_test, review_test = ds.load_test_data(TEST_FILE)


In [36]:
#  X_train, X_test, y_train, y_test = train_test_split(review, star, test_size=0.6, random_state=42)

# Number of times every training sample is repeated in the final training set.
OVERSAMPLE_FACTOR = 50

# Normalize each distinct document exactly once and replicate the *result*.
# Replicating the raw lists first (as before) pushed every document through
# normalize_text/ViTokenizer OVERSAMPLE_FACTOR times for the same output; the
# resulting X_train / y_train are element-for-element identical.
X_main, y_main = ds.transform_to_dataset_with_augmentation(review, star)
X_extra, y_extra = ds.transform_to_dataset_with_augmentation(review2, star2)

X_train = X_main * OVERSAMPLE_FACTOR + X_extra * OVERSAMPLE_FACTOR
y_train = y_main * OVERSAMPLE_FACTOR + y_extra * OVERSAMPLE_FACTOR

X_test, y_test = ds.transform_to_dataset(review_test, star_test)

In [38]:
# Candidate Vietnamese stop words (literals restored from mis-encoded text).
stop_ws = (u'rằng', u'thì', u'là')

# Fixed seed so the stochastic solvers give the same predictions on every run.
RANDOM_STATE = 42

# Models to compare; each one is fitted on the same TF-IDF features.
classifiers = [
            MultinomialNB(),
            # DecisionTreeClassifier(),
            # LogisticRegression(),
            SGDClassifier(random_state=RANDOM_STATE),
            LinearSVC(fit_intercept = True,multi_class='crammer_singer', C=1, random_state=RANDOM_STATE),
            # RandomForestClassifier(),
        ]

In [40]:
# TF-IDF over word 1- to 4-grams; fitted on the training texts only and then
# applied unchanged to the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1,4))
vectors = vectorizer.fit_transform(X_train)
vectors1 = vectorizer.transform(X_test)

for i, classifier in enumerate(classifiers, start=1):
    filename = "result_" + str(i) + ".txt"

    classifier.fit(vectors, y_train)
    y_pred = classifier.predict(vectors1)

    # Distribution of predicted labels, as a quick sanity check.
    print(Counter(y_pred))

    # get_predicted_result opens the file in append mode; truncate it first so
    # re-running this cell does not stack duplicate predictions in the file.
    with open(filename, 'w'):
        pass
    ds.get_predicted_result(y_pred, review_test, filename)

Counter({'5': 450, '4': 99, '3': 31, '1': 19, '2': 1})
Counter({'5': 400, '4': 114, '3': 61, '1': 24, '2': 1})
Counter({'5': 412, '4': 108, '3': 55, '1': 23, '2': 2})


