## Requirements

In [None]:
# `%pip` installs into the environment of the running kernel (a bare `!pip` may resolve to a different interpreter).
# py_vncorenlp is pinned to the release recorded in the install log of this cell.
%pip install py_vncorenlp==0.1.4 gensim

Collecting py_vncorenlp
  Downloading py_vncorenlp-0.1.4.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius (from py_vncorenlp)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: py_vncorenlp
  Building wheel for py_vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for py_vncorenlp: filename=py_vncorenlp-0.1.4-py3-none-any.whl size=4307 sha256=7b97e970488728b063a896bdb004435e36b1bb855100a06529da819267b4df41
  Stored in directory: /root/.cache/pip/wheels/d5/d9/bf/62632cdb007c702a0664091e92a0bb1f18a2fcecbe962d9827
Successfully built py_vncorenlp
Installing collected packages: pyjnius, py_vncorenlp
Successfully installed py_vncorenlp-0.1.4 pyjnius-1.6.1


## import

In [None]:
from tqdm import tqdm
import math
import py_vncorenlp
from sklearn.metrics import f1_score

## dataset

In [None]:
# `%pip` installs into the environment of the running kernel; gdown stays pinned to 4.6.0 as before.
%pip install gdown==4.6.0

Collecting gdown==4.6.0
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.0
    Uninstalling gdown-4.7.0:
      Successfully uninstalled gdown-4.7.0
Successfully installed gdown-4.6.0


In [None]:
# Each `!` line runs in its own subshell, so the `cd` has to be chained to the download it applies to.
# The URLs are quoted: an unquoted `&` makes the shell run gdown in the background (the cell can
# return before the file is complete) and treats `export=download` as a separate statement.
# `mkdir -p` keeps the cell re-runnable when the directories already exist.
!mkdir -p train
!cd train && gdown "https://drive.google.com/uc?id=1N41RpBMDfnMyipQUDTXoPvmPdd4ZDPuA&export=download"
!cd train && gdown "https://drive.google.com/uc?id=16fOhp5N2xUSWCPfthUaSpbhtj8zKlCe0&export=download"
!mkdir -p test
!cd test && gdown "https://drive.google.com/uc?id=1jirCj9X_rLSoUFFvQD8RYY5GEC12va4-&export=download"
!cd test && gdown "https://drive.google.com/uc?id=1RSOhUrtvT0A_DMcuugdpPyY2NwKvSK7C&export=download"

mkdir: cannot create directory ‘train’: File exists
Downloading...
From: https://drive.google.com/uc?id=1N41RpBMDfnMyipQUDTXoPvmPdd4ZDPuA
To: /content/train/sents.txt
100% 898k/898k [00:00<00:00, 42.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=16fOhp5N2xUSWCPfthUaSpbhtj8zKlCe0
To: /content/train/sentiments.txt
100% 22.9k/22.9k [00:00<00:00, 51.6MB/s]
mkdir: cannot create directory ‘test’: File exists
Downloading...
From: https://drive.google.com/uc?id=1jirCj9X_rLSoUFFvQD8RYY5GEC12va4-
To: /content/test/sents.txt
100% 248k/248k [00:00<00:00, 92.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RSOhUrtvT0A_DMcuugdpPyY2NwKvSK7C
To: /content/test/sentiments.txt
100% 6.33k/6.33k [00:00<00:00, 14.9MB/s]


In [None]:
# Load the training split as (sentence, sentiment) pairs, skipping every line whose label is 1
# so that only labels 0 and 2 remain.
# The encoding is passed explicitly so the Vietnamese text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('train/sents.txt', encoding='utf-8') as data, open('train/sentiments.txt', encoding='utf-8') as label:
  traindata = []
  for dataline, labelline in zip(data, label):
    sentiment = int(labelline.strip())
    if sentiment == 1:
      continue
    traindata.append((dataline.strip(), sentiment))
print(len(traindata))
traindata[12]

10968


('có thể cho sinh viên đi thăm quan nhiều công ty xem quy mô và cách làm việc , để giúp hiểu rõ hơn vê ngành mình đang học .',
 0)

In [None]:
# Load the test split as (sentence, sentiment) pairs, skipping every line whose label is 1
# so that only labels 0 and 2 remain.
# The encoding is passed explicitly so the Vietnamese text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('test/sents.txt', encoding='utf-8') as data, open('test/sentiments.txt', encoding='utf-8') as label:
  testdata = []
  for dataline, labelline in zip(data, label):
    sentiment = int(labelline.strip())
    if sentiment == 1:
      continue
    testdata.append((dataline.strip(), sentiment))
print(len(testdata))
testdata[79]

2999


('nhiệt tình giúp đỡ giải đáp những thắc mắc của sinh viên .', 2)

## Phrase extraction (word segmentation and POS tagging)

In [None]:
# Download the VnCoreNLP model files into the current directory; the annotator is later
# created with the same `save_dir`.
py_vncorenlp.download_model(save_dir='./')

In [None]:
# Shared annotator used by getPos, configured with the 'wseg' and 'pos' annotators
# (presumably word segmentation and POS tagging) and loaded from the current directory.
phraseModel = py_vncorenlp.VnCoreNLP(annotators=['wseg', 'pos'], save_dir='./')

In [None]:
def getPos(sentence):
  """Segment and POS-tag `sentence` with the shared `phraseModel` annotator.

  `annotate_text` returns a dict keyed by sentence index, because the annotator
  may split the input into several sentences. Tokens are therefore collected
  from every entry in index order; reading only entry 0 would drop everything
  after the first detected sentence and raise KeyError on an empty result.

  Args:
    sentence: raw text to annotate.

  Returns:
    (words, tags): two parallel lists holding each token's 'wordForm' and
    'posTag'. Both lists are empty when the annotator yields no sentence.
  """
  annotated = phraseModel.annotate_text(sentence)
  words = []
  tags = []
  for sentIdx in sorted(annotated):
    for token in annotated[sentIdx]:
      words.append(token['wordForm'])
      tags.append(token['posTag'])
  return words, tags

In [None]:
# Sanity check: annotate one training sentence and show its tokens and POS tags.
words, tags = getPos(traindata[999][0])
print(words)
print(tags)

['cập_nhật', 'tài_liệu', 'đầy_đủ', 'cho', 'sinh_viên', '!']
['V', 'N', 'A', 'E', 'N', 'CH']


In [None]:
def getPhrase(words, tags, patterns=(('N', 'A'), ('V', 'A'), ('R', 'A'), ('R', 'V'), ('V', 'R'))):
  """Collect adjacent word pairs whose POS-tag pair is one of `patterns`.

  Args:
    words: list of tokens.
    tags: list of POS tags, parallel to `words`.
    patterns: iterable of (firstTag, secondTag) pairs to keep. The default is
      the set of tag pairs this notebook has always used.

  Returns:
    (phrases, ptags): `phrases` is the list of matching (word, nextWord)
    tuples in sentence order and `ptags` the corresponding tag tuples.
    Both are empty for inputs with fewer than two tokens.
  """
  # Build the lookup once per call instead of rebuilding a list for every token.
  wanted = set(patterns)
  phrases = []
  ptags = []
  # Pair every token with its successor; the final token has none and is never a phrase start.
  for phrase, ptag in zip(zip(words, words[1:]), zip(tags, tags[1:])):
    if ptag in wanted:
      phrases.append(phrase)
      ptags.append(ptag)
  return phrases, ptags

In [None]:
# Sanity check: extract the tag-pattern phrases from the current `words` / `tags` and show them.
phrases, ptags = getPhrase(words, tags)
print(phrases)
print(ptags)

[('tài_liệu', 'đầy_đủ')]
[('N', 'A')]


## PMI-based sentiment orientation

In [None]:
class PMIModel:
  """Two-class sentiment scorer built from phrase/label co-occurrence statistics.

  Attributes:
    map1: phrase -> share of that phrase among all extracted training phrases;
      the integer keys 0 and 2 additionally hold the share of training
      sentences carrying that label.
    map2: (phrase, label) -> share of that pairing among all extracted
      training phrases.
  """

  def __init__(self, traindata):
    """Count phrases over `traindata`, a sized sequence of (sentence, label) pairs.

    Prints the number of distinct phrases once counting is finished.
    """
    phraseCounts = {}
    jointCounts = {}
    nPositive = 0
    nPhrases = 0
    for text, sentiment in tqdm(traindata):
      nPositive += 1 if sentiment == 2 else 0
      tokens, posTags = getPos(text)
      found, _ = getPhrase(tokens, posTags)
      for phrase in found:
        nPhrases += 1
        phraseCounts[phrase] = phraseCounts.get(phrase, 0) + 1
        joint = (phrase, sentiment)
        jointCounts[joint] = jointCounts.get(joint, 0) + 1
    print(len(phraseCounts))
    # Turn raw counts into relative frequencies over all extracted phrases.
    self.map1 = {phrase: count / nPhrases for phrase, count in phraseCounts.items()}
    self.map2 = {joint: count / nPhrases for joint, count in jointCounts.items()}
    # Label shares live in map1 under the integer label keys.
    positiveShare = nPositive / len(traindata)
    self.map1[0] = 1.0 - positiveShare
    self.map1[2] = positiveShare

  def getProb(self, p):
    """Return the stored share for `p` (0.0 when unseen) plus a 0.01 offset."""
    return self.map1.get(p, 0.0) + 0.01

  def getProb2(self, p1, p2):
    """Return the stored share for the pair (`p1`, `p2`) (0.0 when unseen) plus a 0.01 offset."""
    return self.map2.get((p1, p2), 0.0) + 0.01

  def predict(self, sentence):
    """Return 2 when the summed phrase orientation of `sentence` is >= 0, otherwise 0.

    A sentence without any extracted phrase keeps a score of 0.0 and is labelled 2.
    """
    tokens, posTags = getPos(sentence)
    found, _ = getPhrase(tokens, posTags)
    orientation = 0.0
    for phrase in found:
      phraseProb = self.getProb(phrase)
      ratioPos = self.getProb2(phrase, 2) / (phraseProb * self.getProb(2))
      ratioNeg = self.getProb2(phrase, 0) / (phraseProb * self.getProb(0))
      orientation += math.log2(ratioPos) - math.log2(ratioNeg)
    if orientation >= 0:
      return 2
    return 0

  def test(self, dataset):
    """Predict every (sentence, label) pair in `dataset`; print accuracy and weighted F1."""
    yTrue = []
    yPred = []
    hitCount = 0
    for text, expected in tqdm(dataset):
      guess = self.predict(text)
      yTrue.append(expected)
      yPred.append(guess)
      hitCount += 1 if guess == expected else 0
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
# Build the phrase statistics from the training split, then print accuracy and weighted F1 on the test split.
pmiModel = PMIModel(traindata)
pmiModel.test(testdata)

100%|██████████| 10968/10968 [00:11<00:00, 984.06it/s] 


6155


100%|██████████| 2999/2999 [00:02<00:00, 1202.22it/s]

2163/2999 ~72.1240413471157
f1 score: 0.7176818017124736





## Word2Vec-based sentiment orientation

In [None]:
from gensim.models import Word2Vec
import torch
import torch.nn.functional as torchF

class W2VModel:
  """Two-class sentiment scorer comparing phrase embeddings with two seed words.

  Attributes:
    sentences: the segmented training sentences (lists of tokens) fed to Word2Vec.
    vectorSize: dimensionality of the word vectors.
    w2v: the trained gensim Word2Vec model.
  """

  def __init__(self, traindata):
    """Segment every sentence of `traindata` and train Word2Vec on the result.

    `traindata` is an iterable of (sentence, label) pairs; labels are not used.
    Prints the shape of the learned vector table.
    """
    self.sentences = []
    for sentence, label in tqdm(traindata):
      words, tags = getPos(sentence)
      self.sentences.append(words)
    self.vectorSize = 100
    # NOTE(review): no seed is passed and two workers are used, so the learned
    # vectors (and the scores derived from them) may differ between runs - confirm
    # whether reproducibility matters here.
    self.w2v = Word2Vec(sentences=self.sentences, vector_size=self.vectorSize, window=5, min_count=1, workers=2)
    print()
    print(self.w2v.wv.vectors.shape)

  def embed(self, word):
    """Return the vector of `word` as a float32 tensor, or None when it is out of vocabulary."""
    if word in self.w2v.wv.key_to_index:
      # torch.tensor copies the data, so a read-only NumPy view from gensim does
      # not trigger PyTorch's "array is not writable" warning.
      return torch.tensor(self.w2v.wv[word], dtype=torch.float32)
    return None

  def embed2(self, phrase):
    """Return the mean vector of the in-vocabulary words of `phrase`.

    Out-of-vocabulary words are skipped. When no word is known the zero vector
    is returned instead of dividing by zero: the NaN vector that division
    produced made every later comparison false and forced a label-0 prediction.
    """
    res = torch.zeros(self.vectorSize)
    resLen = 0
    for word in phrase:
      wres = self.embed(word)
      if wres is None:
        continue
      res += wres
      resLen += 1
    return res / resLen if resLen > 0 else res

  def predict(self, sentence):
    """Return 2 when the phrases of `sentence` lean towards 'tốt', otherwise 0.

    Each extracted phrase contributes its cosine similarity to 'tốt' minus its
    cosine similarity to 'kém'. A sentence without phrases scores 0.0 and is
    labelled 2. Both seed words must be in the vocabulary; otherwise `embed`
    returns None and the similarity call fails.
    """
    posEp = self.embed('tốt')
    negEp = self.embed('kém')
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    return 2 if so >= 0 else 0

  def test(self, dataset):
    """Predict every (sentence, label) pair in `dataset`; print accuracy and weighted F1."""
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
# Train the embeddings on the training split, then print accuracy and weighted F1 on the test split.
w2vModel = W2VModel(traindata)
w2vModel.test(testdata)

100%|██████████| 10968/10968 [00:09<00:00, 1147.53it/s]



(3568, 100)


  return torch.Tensor(self.w2v.wv[word])
100%|██████████| 2999/2999 [00:03<00:00, 785.49it/s]

2284/2999 ~76.15871957319106
f1 score: 0.76117136121854





## With the neutral class (three-way classification)

In [None]:
# Reload the training split as (sentence, sentiment) pairs, this time keeping every label (0, 1 and 2).
# The encoding is passed explicitly so the Vietnamese text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('train/sents.txt', encoding='utf-8') as data, open('train/sentiments.txt', encoding='utf-8') as label:
  traindata = []
  for dataline, labelline in zip(data, label):
    traindata.append((dataline.strip(), int(labelline.strip())))
print(len(traindata))
traindata[12]

11426


('đang dạy thầy wzjwz208 đi qua nước ngoài giữa chừng , thầy wzjwz209 dạy thay .',
 1)

In [None]:
# Reload the test split as (sentence, sentiment) pairs, this time keeping every label (0, 1 and 2).
# The encoding is passed explicitly so the Vietnamese text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('test/sents.txt', encoding='utf-8') as data, open('test/sentiments.txt', encoding='utf-8') as label:
  testdata = []
  for dataline, labelline in zip(data, label):
    testdata.append((dataline.strip(), int(labelline.strip())))
print(len(testdata))
testdata[79]

3166


('giảng bài xúc tích .', 2)

In [None]:
class PMIModel:
  """Three-class sentiment scorer built from phrase/label co-occurrence statistics.

  Attributes:
    map1: phrase -> share of that phrase among all extracted training phrases;
      the integer keys 0 and 2 additionally hold the share of training
      sentences carrying that label.
    map2: (phrase, label) -> share of that pairing among all extracted
      training phrases.
  """

  # Constant added to every looked-up probability so unseen phrases never give log2(0).
  SMOOTHING = 0.01
  # Scores whose magnitude is at most this value are reported as neutral (label 1).
  NEUTRAL_EPS = 1e-8

  def __init__(self, traindata):
    """Count phrases and labels over `traindata`, a sized sequence of (sentence, label) pairs.

    Prints the number of distinct phrases once counting is finished.
    """
    self.map1 = {}
    self.map2 = {}
    labelCounts = {}
    globalCount = 0
    for sentence, label in tqdm(traindata):
      labelCounts[label] = labelCounts.get(label, 0) + 1
      words, tags = getPos(sentence)
      phrases, ptags = getPhrase(words, tags)
      for p in phrases:
        globalCount += 1
        self.map1[p] = self.map1.get(p, 0) + 1
        p2 = (p, label)
        self.map2[p2] = self.map2.get(p2, 0) + 1
    print(len(self.map1))
    for key, val in self.map1.items():
      self.map1[key] = val / globalCount
    for key, val in self.map2.items():
      self.map2[key] = val / globalCount
    # Each prior is the share of sentences with exactly that label. Deriving the
    # label-0 prior as 1 - P(label 2) would also count the neutral sentences
    # (label 1) that this data set contains.
    total = len(traindata)
    self.map1[0] = labelCounts.get(0, 0) / total if total else 0.0
    self.map1[2] = labelCounts.get(2, 0) / total if total else 0.0

  def getProb(self, p):
    """Return the stored share for `p` (0.0 when unseen) plus the smoothing constant."""
    return self.map1.get(p, 0.0) + self.SMOOTHING

  def getProb2(self, p1, p2):
    """Return the stored share for the pair (`p1`, `p2`) (0.0 when unseen) plus the smoothing constant."""
    return self.map2.get((p1, p2), 0.0) + self.SMOOTHING

  def predict(self, sentence):
    """Classify `sentence` as 2, 0 or 1 from its summed phrase orientation.

    Returns 1 when the score is within NEUTRAL_EPS of zero (in particular when
    no phrase was extracted), 2 for a positive score and 0 for a negative one.
    """
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      sop2 = self.getProb2(p, 2) / (self.getProb(p) * self.getProb(2))
      sop0 = self.getProb2(p, 0) / (self.getProb(p) * self.getProb(0))
      so += math.log2(sop2) - math.log2(sop0)
    if abs(so) <= self.NEUTRAL_EPS:
      return 1
    return 2 if so >= 0 else 0

  def test(self, dataset):
    """Predict every (sentence, label) pair in `dataset`; print accuracy and weighted F1."""
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
# Rebuild the phrase statistics on the current (unfiltered) training split, then print accuracy and weighted F1 on the test split.
pmiModel = PMIModel(traindata)
pmiModel.test(testdata)

100%|██████████| 11426/11426 [00:11<00:00, 978.03it/s] 


6279


100%|██████████| 3166/3166 [00:02<00:00, 1087.24it/s]

1952/3166 ~61.65508528111181
f1 score: 0.6249543992345439





In [None]:
from gensim.models import Word2Vec
import torch
import torch.nn.functional as torchF

class W2VModel:
  """Three-class sentiment scorer comparing phrase embeddings with seed embeddings.

  Attributes:
    sentences: the segmented training sentences (lists of tokens) fed to Word2Vec.
    vectorSize: dimensionality of the word vectors.
    w2v: the trained gensim Word2Vec model.
  """

  def __init__(self, traindata):
    """Segment every sentence of `traindata` and train Word2Vec on the result.

    `traindata` is an iterable of (sentence, label) pairs; labels are not used.
    Prints the shape of the learned vector table.
    """
    self.sentences = []
    for sentence, label in tqdm(traindata):
      words, tags = getPos(sentence)
      self.sentences.append(words)
    self.vectorSize = 100
    # NOTE(review): no seed is passed and two workers are used, so the learned
    # vectors (and the scores derived from them) may differ between runs - confirm
    # whether reproducibility matters here.
    self.w2v = Word2Vec(sentences=self.sentences, vector_size=self.vectorSize, window=5, min_count=1, workers=2)
    print()
    print(self.w2v.wv.vectors.shape)

  def embed(self, word):
    """Return the vector of `word` as a float32 tensor, or None when it is out of vocabulary."""
    if word in self.w2v.wv.key_to_index:
      # torch.tensor copies the data, so a read-only NumPy view from gensim does
      # not trigger PyTorch's "array is not writable" warning.
      return torch.tensor(self.w2v.wv[word], dtype=torch.float32)
    return None

  def embed2(self, phrase):
    """Return the mean vector of the in-vocabulary words of `phrase`.

    Out-of-vocabulary words are skipped; the zero vector is returned when no
    word of `phrase` is known.
    """
    res = torch.zeros(self.vectorSize)
    resLen = 0
    for word in phrase:
      wres = self.embed(word)
      if wres is None:
        continue
      res += wres
      resLen += 1
    return res / resLen if resLen > 0 else res

  def _classify(self, sentence, posEp, negEp):
    """Score `sentence` against the two seed embeddings and map the score to a label.

    Each extracted phrase contributes its cosine similarity to `posEp` minus
    its cosine similarity to `negEp`. Returns 1 when the total is within 1e-8
    of zero (in particular when no phrase was extracted), 2 for a positive
    total and 0 for a negative one.
    """
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    if abs(so) <= 1e-8:
      return 1
    return 2 if so >= 0 else 0

  def _report(self, dataset, classify):
    """Apply `classify` to every sentence of `dataset`; print accuracy and weighted F1."""
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = classify(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

  def predict(self, sentence, posWord='tốt', negWord='kém'):
    """Classify `sentence` using a single positive and a single negative seed word.

    Both seed words must be in the vocabulary; otherwise `embed` returns None
    and the similarity call fails as soon as a phrase is scored.
    """
    return self._classify(sentence, self.embed(posWord), self.embed(negWord))

  def test(self, dataset, posWord='tốt', negWord='kém'):
    """Evaluate `predict` with the given seed words on `dataset` (prints the metrics)."""
    # The seed embeddings do not depend on the sentence, so they are looked up once.
    posEp = self.embed(posWord)
    negEp = self.embed(negWord)
    self._report(dataset, lambda sentence: self._classify(sentence, posEp, negEp))

  def predict2(self, sentence, posList=('tốt',), negList=('kém',)):
    """Classify `sentence` using the mean embedding of each seed-word collection.

    `posList` and `negList` are iterables of words; unknown words are skipped by `embed2`.
    """
    return self._classify(sentence, self.embed2(posList), self.embed2(negList))

  def test2(self, dataset, posList=('tốt',), negList=('kém',)):
    """Evaluate `predict2` with the given seed-word collections on `dataset` (prints the metrics)."""
    # The seed embeddings do not depend on the sentence, so they are averaged once.
    posEp = self.embed2(posList)
    negEp = self.embed2(negList)
    self._report(dataset, lambda sentence: self._classify(sentence, posEp, negEp))

In [None]:
# Train the embeddings on the current (unfiltered) training split.
w2vModel = W2VModel(traindata)

100%|██████████| 11426/11426 [00:17<00:00, 664.33it/s]



(3655, 100)


In [None]:
# Evaluate on the test split with the default seed words ('tốt' vs 'kém').
w2vModel.test2(testdata)

100%|██████████| 3166/3166 [00:06<00:00, 515.78it/s]

2131/3166 ~67.30890713834492
f1 score: 0.696228041215839





In [None]:
# Repeat the evaluation with a different seed pair: 'hay' as the positive seed, 'tệ' as the negative seed.
w2vModel.test2(testdata, ['hay'], ['tệ'])

100%|██████████| 3166/3166 [00:05<00:00, 536.47it/s]

2090/3166 ~66.01389766266583
f1 score: 0.6851564698764465





In [None]:
# Repeat the evaluation with a different seed pair: 'vui' as the positive seed, 'chán' as the negative seed.
w2vModel.test2(testdata, ['vui'], ['chán'])

100%|██████████| 3166/3166 [00:09<00:00, 323.07it/s]

2170/3166 ~68.54074542008844
f1 score: 0.711130145033557





In [None]:
# Repeat the evaluation with a different seed pair: 'dễ' as the positive seed, 'khó' as the negative seed.
w2vModel.test2(testdata, ['dễ'], ['khó'])

100%|██████████| 3166/3166 [00:08<00:00, 388.71it/s]

2142/3166 ~67.65634870499052
f1 score: 0.7025567395284235





In [None]:
# Evaluate with four seed words per polarity; test2 averages each list into a single seed embedding.
w2vModel.test2(testdata, ['tốt', 'hay', 'vui', 'dễ'], ['xấu', 'chán', 'khó', 'tệ'])

100%|██████████| 3166/3166 [00:04<00:00, 656.20it/s]


2204/3166 ~69.61465571699304
f1 score: 0.7243333938062507
