## requirement

In [5]:
!pip install py_vncorenlp gensim
!pip install gdown==4.6.0

Collecting gdown==4.6.0
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.6.0


## import

In [6]:
from tqdm import tqdm
import math
import py_vncorenlp
from sklearn.metrics import f1_score, classification_report

## dataset

In [7]:
!mkdir train
!cd train && gdown https://drive.google.com/uc?id=1N41RpBMDfnMyipQUDTXoPvmPdd4ZDPuA&export=download
!cd train && gdown https://drive.google.com/uc?id=16fOhp5N2xUSWCPfthUaSpbhtj8zKlCe0&export=download
!mkdir test
!cd test && gdown https://drive.google.com/uc?id=1jirCj9X_rLSoUFFvQD8RYY5GEC12va4-&export=download
!cd test && gdown https://drive.google.com/uc?id=1RSOhUrtvT0A_DMcuugdpPyY2NwKvSK7C&export=download

mkdir: cannot create directory ‘train’: File exists
Downloading...
From: https://drive.google.com/uc?id=1N41RpBMDfnMyipQUDTXoPvmPdd4ZDPuA
To: /content/train/sents.txt
100% 898k/898k [00:00<00:00, 121MB/s]
Downloading...
From: https://drive.google.com/uc?id=16fOhp5N2xUSWCPfthUaSpbhtj8zKlCe0
To: /content/train/sentiments.txt
100% 22.9k/22.9k [00:00<00:00, 40.3MB/s]
mkdir: cannot create directory ‘test’: File exists
Downloading...
From: https://drive.google.com/uc?id=1jirCj9X_rLSoUFFvQD8RYY5GEC12va4-
To: /content/test/sents.txt
100% 248k/248k [00:00<00:00, 79.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RSOhUrtvT0A_DMcuugdpPyY2NwKvSK7C
To: /content/test/sentiments.txt
100% 6.33k/6.33k [00:00<00:00, 14.8MB/s]


In [8]:
with open('train/sents.txt') as data, open('train/sentiments.txt') as label:
  traindata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    if sentiment == 1:
      continue
    traindata.append((sentence, sentiment))
print(len(traindata))
traindata[12]

10968


('có thể cho sinh viên đi thăm quan nhiều công ty xem quy mô và cách làm việc , để giúp hiểu rõ hơn vê ngành mình đang học .',
 0)

In [9]:
with open('test/sents.txt') as data, open('test/sentiments.txt') as label:
  testdata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    if sentiment == 1:
      continue
    testdata.append((sentence, sentiment))
print(len(testdata))
testdata[79]

2999


('nhiệt tình giúp đỡ giải đáp những thắc mắc của sinh viên .', 2)

## phrase

In [10]:
py_vncorenlp.download_model(save_dir='./')

In [11]:
phraseModel = py_vncorenlp.VnCoreNLP(annotators=['wseg', 'pos'], save_dir='./')

In [12]:
def getPos(sentence):
  annotated = phraseModel.annotate_text(sentence)
  words = [word['wordForm'] for word in annotated[0]]
  tags = [word['posTag'] for word in annotated[0]]
  return words, tags

In [13]:
words, tags = getPos(traindata[999][0])
print(words)
print(tags)

['cập_nhật', 'tài_liệu', 'đầy_đủ', 'cho', 'sinh_viên', '!']
['V', 'N', 'A', 'E', 'N', 'CH']


In [14]:
def getPhrase(words, tags):
  phrases = []
  ptags = []
  for idx, word in enumerate(words):
    phrase = tuple(words[idx:idx + 2])
    ptag = tuple(tags[idx:idx + 2])
    if ptag in [('N', 'A'), ('V', 'A'), ('R', 'A'), ('R', 'V'), ('V', 'R')]:
      phrases.append(phrase)
      ptags.append(ptag)
  return phrases, ptags

In [15]:
phrases, ptags = getPhrase(words, tags)
print(phrases)
print(ptags)

[('tài_liệu', 'đầy_đủ')]
[('N', 'A')]


## pmi

In [None]:
class PMIModel:
  def __init__(self, traindata):
    self.map1 = {}
    self.map2 = {}
    posCount = 0
    globalCount = 0
    for sentence, label in tqdm(traindata):
      if label == 2:
        posCount += 1
      words, tags = getPos(sentence)
      phrases, ptags = getPhrase(words, tags)
      for p in phrases:
        globalCount += 1
        self.map1[p] = self.map1.get(p, 0) + 1
        p2 = (p, label)
        self.map2[p2] = self.map2.get(p2, 0) + 1
    print(len(self.map1))
    for key, val in self.map1.items():
      self.map1[key] = val / globalCount
    for key, val in self.map2.items():
      self.map2[key] = val / globalCount
    self.map1[0] = 1.0 - posCount / len(traindata)
    self.map1[2] = posCount / len(traindata)

  def getProb(self, p):
    return self.map1.get(p, 0.0) + 0.01

  def getProb2(self, p1, p2):
    return self.map2.get((p1, p2), 0.0) + 0.01

  def predict(self, sentence):
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      sop2 = self.getProb2(p, 2) / (self.getProb(p) * self.getProb(2))
      sop0 = self.getProb2(p, 0) / (self.getProb(p) * self.getProb(0))
      so += math.log2(sop2) - math.log2(sop0)
    return 2 if so >= 0 else 0

  def test(self, dataset):
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
pmiModel = PMIModel(traindata)
pmiModel.test(testdata)

100%|██████████| 10968/10968 [00:11<00:00, 984.06it/s] 


6155


100%|██████████| 2999/2999 [00:02<00:00, 1202.22it/s]

2163/2999 ~72.1240413471157
f1 score: 0.7176818017124736





## word2vec

In [None]:
from gensim.models import Word2Vec
import torch
import torch.nn.functional as torchF

class W2VModel:
  def __init__(self, traindata):
    self.sentences = []
    for sentence, label in tqdm(traindata):
      words, tags = getPos(sentence)
      self.sentences.append(words)
    self.vectorSize = 100
    self.w2v = Word2Vec(sentences=self.sentences, vector_size=self.vectorSize, window=5, min_count=1, workers=2)
    print()
    print(self.w2v.wv.vectors.shape)

  def embed(self, word):
    if word in self.w2v.wv.key_to_index:
      return torch.Tensor(self.w2v.wv[word])
    else:
      return None

  def embed2(self, phrase):
    res = torch.zeros(self.vectorSize)
    resLen = 0
    for word in phrase:
      wres = self.embed(word)
      if wres is None:
        continue
      res += wres
      resLen += 1
    return res / resLen

  def predict(self, sentence):
    posEp = self.embed('tốt')
    negEp = self.embed('kém')
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    return 2 if so >= 0 else 0

  def test(self, dataset):
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
w2vModel = W2VModel(traindata)
w2vModel.test(testdata)

100%|██████████| 10968/10968 [00:09<00:00, 1147.53it/s]



(3568, 100)


  return torch.Tensor(self.w2v.wv[word])
100%|██████████| 2999/2999 [00:03<00:00, 785.49it/s]

2284/2999 ~76.15871957319106
f1 score: 0.76117136121854





## with neutral

In [37]:
with open('train/sents.txt') as data, open('train/sentiments.txt') as label:
  traindata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    traindata.append((sentence, sentiment))
print(len(traindata))
traindata[12]

11426


('đang dạy thầy wzjwz208 đi qua nước ngoài giữa chừng , thầy wzjwz209 dạy thay .',
 1)

In [38]:
with open('test/sents.txt') as data, open('test/sentiments.txt') as label:
  testdata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    testdata.append((sentence, sentiment))
print(len(testdata))
testdata[79]

3166


('giảng bài xúc tích .', 2)

In [41]:
class PMIModel:
  def __init__(self, traindata):
    self.map1 = {}
    self.map2 = {}
    posCount = 0
    globalCount = 0
    for sentence, label in tqdm(traindata):
      if label == 2:
        posCount += 1
      words, tags = getPos(sentence)
      phrases, ptags = getPhrase(words, tags)
      for p in phrases:
        globalCount += 1
        self.map1[p] = self.map1.get(p, 0) + 1
        p2 = (p, label)
        self.map2[p2] = self.map2.get(p2, 0) + 1
    print(len(self.map1))
    for key, val in self.map1.items():
      self.map1[key] = val / globalCount
    for key, val in self.map2.items():
      self.map2[key] = val / globalCount
    self.map1[0] = 1.0 - posCount / len(traindata)
    self.map1[2] = posCount / len(traindata)

  def getProb(self, p):
    return self.map1.get(p, 0.0) + 0.01

  def getProb2(self, p1, p2):
    return self.map2.get((p1, p2), 0.0) + 0.01

  def predict(self, sentence):
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      sop2 = self.getProb2(p, 2) / (self.getProb(p) * self.getProb(2))
      sop0 = self.getProb2(p, 0) / (self.getProb(p) * self.getProb(0))
      so += math.log2(sop2) - math.log2(sop0)
    if abs(so) <= 1e-8:
      return 1
    return 2 if so >= 0 else 0

  def test(self, dataset):
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print()
    print(classification_report(yTrue, yPred, digits=4))

In [42]:
pmiModel = PMIModel(traindata)
pmiModel.test(testdata)

100%|██████████| 11426/11426 [00:10<00:00, 1104.13it/s]


6279


100%|██████████| 3166/3166 [00:03<00:00, 904.68it/s] 


              precision    recall  f1-score   support

           0     0.8842    0.4173    0.5670      1409
           1     0.1477    0.4431    0.2216       167
           2     0.6450    0.8113    0.7187      1590

    accuracy                         0.6166      3166
   macro avg     0.5590    0.5573    0.5024      3166
weighted avg     0.7252    0.6166    0.6250      3166






In [45]:
from gensim.models import Word2Vec
import torch
import torch.nn.functional as torchF

class W2VModel:
  def __init__(self, traindata):
    self.sentences = []
    for sentence, label in tqdm(traindata):
      words, tags = getPos(sentence)
      self.sentences.append(words)
    self.vectorSize = 100
    self.w2v = Word2Vec(sentences=self.sentences, vector_size=self.vectorSize, window=5, min_count=1, workers=2)
    print()
    print(self.w2v.wv.vectors.shape)

  def embed(self, word):
    if word in self.w2v.wv.key_to_index:
      return torch.Tensor(self.w2v.wv[word])
    else:
      return None

  def embed2(self, phrase):
    res = torch.zeros(self.vectorSize)
    resLen = 0
    for word in phrase:
      wres = self.embed(word)
      if wres is None:
        continue
      res += wres
      resLen += 1
    return res / resLen if resLen > 0 else res

  def predict(self, sentence, posWord='tốt', negWord='kém'):
    posEp = self.embed(posWord)
    negEp = self.embed(negWord)
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    if abs(so) <= 1e-8:
      return 1
    return 2 if so >= 0 else 0

  def test(self, dataset, posWord='tốt', negWord='kém'):
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence, posWord, negWord)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

  def predict2(self, sentence, posList=['tốt'], negList=['kém']):
    posEp = self.embed2(posList)
    negEp = self.embed2(negList)
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    if abs(so) <= 1e-8:
      return 1
    return 2 if so >= 0 else 0

  def test2(self, dataset, posList=['tốt'], negList=['kém']):
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict2(sentence, posList, negList)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print()
    print(classification_report(yTrue, yPred, digits=4))

In [46]:
w2vModel = W2VModel(traindata)

100%|██████████| 11426/11426 [00:10<00:00, 1134.41it/s]



(3655, 100)


In [47]:
w2vModel.test2(testdata)

100%|██████████| 3166/3166 [00:03<00:00, 805.48it/s]


              precision    recall  f1-score   support

           0     0.6904    0.8197    0.7495      1409
           1     0.1474    0.4431    0.2212       167
           2     0.9092    0.5667    0.6982      1590

    accuracy                         0.6728      3166
   macro avg     0.5823    0.6098    0.5563      3166
weighted avg     0.7716    0.6728    0.6959      3166






In [48]:
w2vModel.test2(testdata, ['hay'], ['tệ'])

100%|██████████| 3166/3166 [00:04<00:00, 689.62it/s]



              precision    recall  f1-score   support

           0     0.6985    0.7480    0.7224      1409
           1     0.1474    0.4431    0.2212       167
           2     0.8294    0.6025    0.6980      1590

    accuracy                         0.6589      3166
   macro avg     0.5584    0.5979    0.5472      3166
weighted avg     0.7352    0.6589    0.6837      3166



In [49]:
w2vModel.test2(testdata, ['vui'], ['chán'])

100%|██████████| 3166/3166 [00:05<00:00, 567.10it/s]


              precision    recall  f1-score   support

           0     0.7143    0.8020    0.7556      1409
           1     0.1474    0.4431    0.2212       167
           2     0.8928    0.6075    0.7231      1590

    accuracy                         0.6854      3166
   macro avg     0.5848    0.6175    0.5666      3166
weighted avg     0.7740    0.6854    0.7111      3166






In [50]:
w2vModel.test2(testdata, ['dễ'], ['khó'])

100%|██████████| 3166/3166 [00:04<00:00, 689.18it/s]


              precision    recall  f1-score   support

           0     0.8027    0.6295    0.7056      1409
           1     0.1474    0.4431    0.2212       167
           2     0.7569    0.7421    0.7494      1590

    accuracy                         0.6762      3166
   macro avg     0.5690    0.6049    0.5588      3166
weighted avg     0.7451    0.6762    0.7021      3166






In [51]:
w2vModel.test2(testdata, ['tốt', 'hay', 'vui', 'dễ'], ['xấu', 'chán', 'khó', 'tệ'])

100%|██████████| 3166/3166 [00:04<00:00, 771.16it/s]


              precision    recall  f1-score   support

           0     0.7753    0.7225    0.7480      1409
           1     0.1474    0.4431    0.2212       167
           2     0.8238    0.7000    0.7569      1590

    accuracy                         0.6965      3166
   macro avg     0.5822    0.6219    0.5754      3166
weighted avg     0.7666    0.6965    0.7247      3166




