In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

path = '/content/drive/MyDrive/Dataset/UIT-VSFC'
X_train = pd.read_csv(path + '/train/sents.txt',  sep='\n', header=None, index_col=None)
X_train = X_train.iloc[:, 0]
y_train = pd.read_csv(path + '/train/sentiments.txt',  sep='\n', header=None, index_col=None)
y_train_topic = pd.read_csv(path + '/train/topics.txt',  sep='\n', header=None, index_col=None)

X_dev = pd.read_csv(path + '/dev/sents.txt', sep='\n', header=None, index_col=None)
X_dev = X_dev.iloc[:, 0]
y_dev = pd.read_csv(path + '/dev/sentiments.txt', sep='\n', header=None, index_col=None)
y_dev_topic = pd.read_csv(path + '/dev/topics.txt',  sep='\n', header=None, index_col=None)


X_test = pd.read_csv(path + '/test/sents.txt',  sep='\n', header=None, index_col=None)
X_test = X_test.iloc[:, 0]
y_test = pd.read_csv(path + '/test/sentiments.txt',  sep='\n', header=None, index_col=None)
y_test_topic = pd.read_csv(path + '/test/topics.txt',  sep='\n', header=None, index_col=None)


y_train = y_train.values.flatten()
y_dev = y_dev.values.flatten()
y_test = y_test.values.flatten()

In [3]:
X_train.head(10)

0                            slide giáo trình đầy đủ .
1       nhiệt tình giảng dạy , gần gũi với sinh viên .
2                 đi học đầy đủ full điểm chuyên cần .
3    chưa áp dụng công nghệ thông tin và các thiết ...
4    thầy giảng bài hay , có nhiều bài tập ví dụ ng...
5    giảng viên đảm bảo thời gian lên lớp , tích cự...
6    em sẽ nợ môn này , nhưng em sẽ học lại ở các h...
7    thời lượng học quá dài , không đảm bảo tiếp th...
8    nội dung môn học có phần thiếu trọng tâm , hầu...
9    cần nói rõ hơn bằng cách trình bày lên bảng th...
Name: 0, dtype: object

# Naive Bayes 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(analyzer='word', ngram_range=(2,2))
tf_idf.fit(X_train)

X_train_encoded = tf_idf.transform(X_train)
X_dev_encoded = tf_idf.transform(X_dev)
X_test_encoded = tf_idf.transform(X_test)

In [5]:
X_train_encoded.shape

(11426, 31384)

In [6]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_encoded, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
y_pred_dev = model.predict(X_dev_encoded)
y_pred_test = model.predict(X_test_encoded)

In [8]:
from sklearn.metrics import f1_score

print(f1_score(y_dev, y_pred_dev, average='micro')*100)
print(f1_score(y_test, y_pred_test, average='micro')*100)

87.17624763108023
86.35502210991788


# Gensim - Build simple word embedding

In [9]:
pip install gensim



In [10]:
pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 4.6 MB/s 
[?25hCollecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 20.4 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.7 pyvi-0.1.1 sklearn-crfsuite-0.3.6


In [11]:
from pyvi import ViTokenizer

sentences = X_train.values

tokenized_sentences = []

for s in sentences:
    tokenized_sentences.append(ViTokenizer.tokenize(s).split())

In [12]:
import multiprocessing
from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count=20, window=2, size=300)
w2v_model.build_vocab(tokenized_sentences)

In [13]:
w2v_model.train(tokenized_sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(2134262, 3851460)

In [14]:
w2v_model.wv.vocab

{'!': <gensim.models.keyedvectors.Vocab at 0x7facd9e70410>,
 '"': <gensim.models.keyedvectors.Vocab at 0x7facd9e819d0>,
 '%': <gensim.models.keyedvectors.Vocab at 0x7facd9e7c6d0>,
 '(': <gensim.models.keyedvectors.Vocab at 0x7facd9e7c790>,
 ')': <gensim.models.keyedvectors.Vocab at 0x7facd9e7c7d0>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7facd9e65810>,
 '-': <gensim.models.keyedvectors.Vocab at 0x7facd9e77ad0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7facd9ebde10>,
 '1': <gensim.models.keyedvectors.Vocab at 0x7facd9e81190>,
 '2': <gensim.models.keyedvectors.Vocab at 0x7facd9e7ae50>,
 '3': <gensim.models.keyedvectors.Vocab at 0x7facd9e816d0>,
 '30': <gensim.models.keyedvectors.Vocab at 0x7facd9e77a90>,
 '4': <gensim.models.keyedvectors.Vocab at 0x7facd9e84f50>,
 '5': <gensim.models.keyedvectors.Vocab at 0x7facd9e7a2d0>,
 '?': <gensim.models.keyedvectors.Vocab at 0x7facd9e81f10>,
 'ai': <gensim.models.keyedvectors.Vocab at 0x7facd9e7aa10>,
 'anh': <gensim.models.keyedvectors.Vo

In [15]:
w2v_model.wv.most_similar(positive=["giảng_dạy", "đồ_án"], negative=["kiểm_tra"], topn=1)

[('đối_với', 0.39585497975349426)]

In [16]:
w2v_model.wv.save_word2vec_format('w2v_vsfc.txt', binary=False)

In [17]:
print(w2v_model['dạy'])

[-0.16160615 -0.08855243  0.06048952 -0.26187405 -0.16167095 -0.30898818
  0.3246685   0.14753404  0.05460394  0.48221728  0.01714414 -0.03768282
  0.14862181  0.32211828 -0.3155511  -0.13191213 -0.3149285  -0.09868582
 -0.20129257  0.1632616  -0.20557818  0.06808359  0.22834508 -0.01325125
 -0.17402202 -0.5377645   0.00652443  0.05708453  0.04776882 -0.08993167
 -0.50745416  0.20603533  0.43503466 -0.35979775  0.14840536 -0.38188317
  0.19013561 -0.19972058 -0.17524901 -0.4007956  -0.42311674  0.10964311
  0.01922066  0.39602038  0.25856307  0.19007425 -0.16497019  0.6031791
 -0.01105115  0.20601086  0.2915155   0.05493049  0.27975392 -0.09526739
 -0.85252076 -0.34388736 -0.59686023  0.07490172  0.39078945 -0.17759736
 -0.4644142  -0.05498283  0.08618779  0.25248143  0.5250795   0.27546734
  0.41831028  0.0382987  -0.08985215  0.01436699  0.25648665  0.21720615
 -0.18024462  0.13555603  0.11587378 -0.08814624  0.146718    0.2990964
 -0.02543836  0.47849566 -0.37007913  0.2957891  -0.0

  """Entry point for launching an IPython kernel.


# Word embeding 

In [18]:
# make vocabulary 
from pyvi import ViTokenizer

V = []

for t in X_train:
    tokenized_sentence = ViTokenizer.tokenize(t)
    V = V + tokenized_sentence.split()

V = list(set(V))

In [19]:
len(V)

3704

## Cach 1: Tu build 

In [20]:
# Building dictionary 
word_to_index = {w : (i+2) for i, w in enumerate(V)}

word_to_index['UNK'] = 1
word_to_index['PAD'] = 0

# Build index2w
index_to_word = {i: w for w, i in word_to_index.items()}

In [21]:
word_to_index

{'wzjwz323': 2,
 'cách_thức': 3,
 'nhập_tâm': 4,
 'giữa_chừng': 5,
 'nâng': 6,
 'tham_gia': 7,
 'gây_rối': 8,
 'lồng_tiếng': 9,
 'con': 10,
 'chắc_chắn': 11,
 'giảng_viên': 12,
 'phone': 13,
 'ở': 14,
 'đối_phó': 15,
 '2005': 16,
 'vui': 17,
 'wzjwz46': 18,
 'trau_chuốt': 19,
 'sướng': 20,
 'gán': 21,
 'speaking': 22,
 'bản': 23,
 'cuối': 24,
 'kể': 25,
 'wzjwz60': 26,
 'học_hành': 27,
 'tiêp': 28,
 'dư': 29,
 'chắc_hẳn': 30,
 'tăng_cường_độ': 31,
 'phí': 32,
 'sôi_động': 33,
 'rối': 34,
 'tận': 35,
 'lãnh_đạm': 36,
 'tự_ý': 37,
 'kế_tiếp': 38,
 'việt': 39,
 'nhanh': 40,
 'đươc': 41,
 'lau': 42,
 'đói': 43,
 'nhà_hàng': 44,
 'rung': 45,
 'cẩn_thận': 46,
 'bấm': 47,
 'gia_trưởng': 48,
 'tương_thích': 49,
 'cao_học': 50,
 'vừa_ý': 51,
 'mông_lung': 52,
 'người_làm': 53,
 'thu': 54,
 'bẩn': 55,
 'kho': 56,
 'học_bổng': 57,
 'lỳ': 58,
 'tuổi': 59,
 'vựa': 60,
 'đa_số': 61,
 'demo': 62,
 'wzjwz156': 63,
 'kiến_thức': 64,
 'cân_bằng': 65,
 '8': 66,
 'học_sinh_viên': 67,
 'chắp_vá': 68,
 'hoa

In [22]:
X_train.values

array(['slide giáo trình đầy đủ .',
       'nhiệt tình giảng dạy , gần gũi với sinh viên .',
       'đi học đầy đủ full điểm chuyên cần .', ...,
       'giao bài tập quá nhiều .', 'giáo viên dạy dễ hiểu , nhiệt tình .',
       'gói gọn doubledot hay , tận tình , phù hợp với mọi trình độ cũng như nhu cầu môn học .'],
      dtype=object)

In [23]:
from pyvi import ViTokenizer

ViTokenizer.tokenize("slide giáo trình đầy đủ ")

'slide giáo_trình đầy_đủ'

In [24]:
t = "slide giáo_trình đầy_đủ"
t.split()

['slide', 'giáo_trình', 'đầy_đủ']

In [25]:
word_to_index['đầy_đủ']

644

In [26]:
t_encode = [1996, 1097,  3242, 0, 0 ,0, 0 ]

In [27]:
word_to_index["PAD"]

0

In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer

max_len = 100

def encoding(X):
    sentences = []
    
    for t in X:
        # tach tu 
        tokenized_sentence = ViTokenizer.tokenize(t)
        sentences.append(tokenized_sentence)
    
    X = []
    for s in sentences:
        sent = []
        for w in s.split():
            try:
                sent.append(word_to_index[w])
            except:
                sent.append(word_to_index["UNK"])
        X.append(sent)
    
    # Padding du lieu theo do dai cau (maxlen)
    X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word_to_index["PAD"])

    return X

## Cach 2: dung ham co san trong Keras 

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from keras.preprocessing.text import Tokenizer

max_len = 100


word_tokenizer = Tokenizer(oov_token=-1)
word_tokenizer.fit_on_texts(X_train)

word_2_index = word_tokenizer.word_index
word_2_index['pad'] = 0
word_2_index['unk'] = -1


index_to_word = {i: w for w, i in word_to_index.items()}

def encoding(X):
    sentences = []
    
    for t in X:
        tokenized_sentence = ViTokenizer.tokenize(t)
        sentences.append(tokenized_sentence)
    
    X = word_tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word_2_index['pad'])

    return X

## Build model 

In [30]:
X_train_encoded = encoding(X_train)
X_dev_encoded = encoding(X_dev)
X_test_encoded = encoding(X_test)

In [31]:
X_train_encoded

array([[ 120,   46,   52, ...,    0,    0,    0],
       [  14,    9,    3, ...,    0,    0,    0],
       [ 100,    7,  123, ...,    0,    0,    0],
       ...,
       [ 333,    8,   18, ...,    0,    0,    0],
       [  46,    2,    4, ...,    0,    0,    0],
       [1295,  505,  293, ...,    0,    0,    0]], dtype=int32)

In [32]:
from keras.layers import Dense, Embedding, Flatten
from keras.models import Model, Input
from keras.initializers import Constant

num_words = len(word_2_index)
input = Input(shape = (max_len, ))
emb = Embedding(input_dim=num_words+1,
                    output_dim=300,
                    input_length=max_len)(input)
flat = Flatten()(emb)
output = Dense(3, activation="sigmoid")(flat)

model = Model(input, output)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['binary_accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          747600    
_________________________________________________________________
flatten (Flatten)            (None, 30000)             0         
_________________________________________________________________
dense (Dense)                (None, 3)                 90003     
Total params: 837,603
Trainable params: 837,603
Non-trainable params: 0
_________________________________________________________________


In [33]:
from tensorflow.keras.utils import to_categorical

model.fit(X_train_encoded, to_categorical(y_train, num_classes=3), 
          validation_data=(X_dev_encoded, to_categorical(y_dev, num_classes=3)), 
          batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fac8a1e6710>

In [34]:
import numpy as np
y_pred_test = model.predict(X_test_encoded)
y_pred_test = np.argmax(y_pred_test, axis=-1)

In [35]:
from sklearn.metrics import f1_score

print(f1_score(y_test, y_pred_test, average='micro')*100)

89.13455464308277


# Pre-trained Embedding 

In [36]:
pip install pyvi



In [37]:
# Read embedding
word_dict = []
embeddings_index = {}
embedding_dim = 300
max_feature = len(embeddings_index) + 2

f = open(path + '/W2V_ner.vec')
for line in f:
    values = line.split(' ')
    word = values[0] 
    word_dict.append(word)
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except Exception as e:
        pass
f.close()

print('Embedding data loaded')

Embedding data loaded


In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from keras.preprocessing.text import Tokenizer

max_len = 100

word_tokenizer = Tokenizer(oov_token=-1)
word_tokenizer.fit_on_texts(X_train.values)
word_to_index = word_tokenizer.word_index
word_to_index['pad'] = 0
word_to_index['unk'] = -1

index_to_word = {i: w for w, i in word_to_index.items()}

def encoding(X):
    sentences = []
    
    for t in X:
        tokenized_sentence = ViTokenizer.tokenize(t)
        sentences.append(tokenized_sentence)
    
    X = word_tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word_to_index['pad'])

    return X

In [40]:
# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_to_index.items():
    if i > max_feature:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)

In [41]:
X_train_encoded = encoding(X_train)
X_dev_encoded = encoding(X_dev)
X_test_encoded = encoding(X_test)

In [42]:
from keras.layers import Dense, Embedding, Flatten
from keras.models import Model, Input
from keras.initializers import Constant

num_words = len(word_to_index)

input = Input(shape = (max_len, ))
emb = Embedding(input_dim=num_words,
                    output_dim=embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=max_len,
                    trainable=True)(input)
flat = Flatten()(emb)
output = Dense(3, activation="sigmoid")(flat)

model = Model(input, output)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['binary_accuracy'])

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          747300    
_________________________________________________________________
flatten_1 (Flatten)          (None, 30000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 90003     
Total params: 837,303
Trainable params: 837,303
Non-trainable params: 0
_________________________________________________________________


In [43]:
from tensorflow.keras.utils import to_categorical

model.fit(X_train_encoded, to_categorical(y_train, num_classes=3), 
          validation_data=(X_dev_encoded, to_categorical(y_dev, num_classes=3)), batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fac8a0dcb50>

In [44]:
import numpy as np
y_pred_test = model.predict(X_test_encoded)
y_pred_test = np.argmax(y_pred_test, axis=-1)

In [45]:
from sklearn.metrics import f1_score

print(f1_score(y_test, y_pred_test, average='macro')*100)

73.76726818588334
