## Requirements

In [None]:
# Install the VnCoreNLP Python wrapper (word segmentation / POS tagging) and gensim (Word2Vec).
!pip install py_vncorenlp gensim



## import

In [None]:
from tqdm import tqdm
import math
import py_vncorenlp
from sklearn.metrics import f1_score

## dataset

In [None]:
!mkdir train
!cd train && gdown https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download
!cd train && gdown https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download
!mkdir test
!cd test && gdown https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download
!cd test && gdown https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download

mkdir: cannot create directory ‘train’: File exists
Downloading...
From: https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP
To: /content/train/sents.txt
100% 898k/898k [00:00<00:00, 113MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv
To: /content/train/sentiments.txt
100% 22.9k/22.9k [00:00<00:00, 35.8MB/s]
mkdir: cannot create directory ‘test’: File exists
Downloading...
From: https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n
To: /content/test/sents.txt
100% 248k/248k [00:00<00:00, 80.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO
To: /content/test/sentiments.txt
100% 6.33k/6.33k [00:00<00:00, 10.8MB/s]


In [None]:
# Load the training split as (sentence, sentiment) pairs for the binary experiments:
# samples labelled 1 are skipped, leaving only labels 0 and 2.
# The files hold Vietnamese text, so they are decoded explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open('train/sents.txt', encoding='utf-8') as data, open('train/sentiments.txt', encoding='utf-8') as label:
  traindata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    if sentiment == 1:
      continue
    traindata.append((sentence, sentiment))
print(len(traindata))
traindata[12]

11426


('đang dạy thầy wzjwz208 đi qua nước ngoài giữa chừng , thầy wzjwz209 dạy thay .',
 1)

In [None]:
# Load the test split as (sentence, sentiment) pairs for the binary experiments:
# samples labelled 1 are skipped, leaving only labels 0 and 2.
# The files hold Vietnamese text, so they are decoded explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open('test/sents.txt', encoding='utf-8') as data, open('test/sentiments.txt', encoding='utf-8') as label:
  testdata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    if sentiment == 1:
      continue
    testdata.append((sentence, sentiment))
print(len(testdata))
testdata[79]

3166


('giảng bài xúc tích .', 2)

## Phrase extraction (word segmentation and POS tagging)

In [None]:
# Download the VnCoreNLP model files into the current directory.
py_vncorenlp.download_model(save_dir='./')

VnCoreNLP model folder . already exists! Please load VnCoreNLP from this folder!


In [None]:
# Shared VnCoreNLP pipeline with the word-segmentation ('wseg') and POS-tagging ('pos')
# annotators, loaded from the model folder in the current directory; getPos uses it.
phraseModel = py_vncorenlp.VnCoreNLP(annotators=['wseg', 'pos'], save_dir='./')

In [None]:
def getPos(sentence):
  """Word-segment and POS-tag `sentence` with the shared `phraseModel`.

  `annotate_text` returns one entry per sentence that VnCoreNLP detects in the
  input text. All entries are concatenated, in index order, so that no part of
  the input is dropped (reading only entry 0 truncated multi-sentence inputs and
  raised KeyError when the annotation was empty).

  Args:
    sentence: raw text to annotate.

  Returns:
    (words, tags): two parallel lists holding the segmented word forms and
    their POS tags; both are empty when nothing was annotated.
  """
  annotated = phraseModel.annotate_text(sentence)
  words = []
  tags = []
  for sentIdx in sorted(annotated):
    for word in annotated[sentIdx]:
      words.append(word['wordForm'])
      tags.append(word['posTag'])
  return words, tags

In [None]:
# Sanity check: segment and tag one training sentence and show both parallel lists.
words, tags = getPos(traindata[999][0])
print(words)
print(tags)

['cô', 'dạy', 'có', 'tâm', ',', 'post', 'tài_liệu', 'và', 'bài_tập', 'lên', 'moodle', 'rất', 'đầy_đủ', '.']
['N', 'V', 'V', 'N', 'CH', 'N', 'N', 'Cc', 'N', 'V', 'N', 'R', 'A', 'CH']


In [None]:
def getPhrase(words, tags):
  """Collect adjacent word pairs whose POS-tag pair is one of the wanted patterns.

  Every start position in `words` is examined; the two-item window of `tags`
  beginning there is matched against the patterns N-A, V-A, R-A, R-V and V-R.

  Returns:
    (phrases, ptags): the matching word tuples and their tag tuples, in order
    of appearance.
  """
  wantedPatterns = (('N', 'A'), ('V', 'A'), ('R', 'A'), ('R', 'V'), ('V', 'R'))
  matches = []
  for start in range(len(words)):
    stop = start + 2
    tagPair = tuple(tags[start:stop])
    if tagPair in wantedPatterns:
      matches.append((tuple(words[start:stop]), tagPair))
  phrases = [wordPair for wordPair, _ in matches]
  ptags = [tagPair for _, tagPair in matches]
  return phrases, ptags

In [None]:
# Extract the candidate phrases from the `words` / `tags` lists currently in scope and show them.
phrases, ptags = getPhrase(words, tags)
print(phrases)
print(ptags)

[('rất', 'đầy_đủ')]
[('R', 'A')]


## pmi

In [None]:
class PMIModel:
  """Two-class (0 / 2) sentiment classifier based on a smoothed PMI difference.

  Attributes filled in by the constructor:
    map1: phrase -> share of all counted phrase occurrences; the integer keys
      0 and 2 additionally hold the class shares among the training sentences.
    map2: (phrase, label) -> share of all counted phrase occurrences.
  """

  def __init__(self, traindata):
    """Count phrases over `traindata`, a sized collection of (sentence, label) pairs."""
    phraseCounts = {}
    pairCounts = {}
    positives = 0
    for text, sentiment in tqdm(traindata):
      if sentiment == 2:
        positives += 1
      tokens, posTags = getPos(text)
      found, _ = getPhrase(tokens, posTags)
      for phrase in found:
        phraseCounts[phrase] = phraseCounts.get(phrase, 0) + 1
        pair = (phrase, sentiment)
        pairCounts[pair] = pairCounts.get(pair, 0) + 1
    print(len(phraseCounts))
    # Each phrase occurrence was counted exactly once, so the counts add up to the total.
    total = sum(phraseCounts.values())
    self.map1 = {phrase: count / total for phrase, count in phraseCounts.items()}
    self.map2 = {pair: count / total for pair, count in pairCounts.items()}
    positiveShare = positives / len(traindata)
    self.map1[0] = 1.0 - positiveShare
    self.map1[2] = positiveShare

  def getProb(self, p):
    """Return the stored share for `p` (a phrase or a class key) plus 0.01 smoothing."""
    stored = self.map1.get(p, 0.0)
    return stored + 0.01

  def getProb2(self, p1, p2):
    """Return the stored share for the pair (`p1`, `p2`) plus 0.01 smoothing."""
    pair = (p1, p2)
    stored = self.map2.get(pair, 0.0)
    return stored + 0.01

  def predict(self, sentence):
    """Return 2 when the summed orientation of the sentence's phrases is >= 0, else 0."""
    tokens, posTags = getPos(sentence)
    found, _ = getPhrase(tokens, posTags)
    orientation = 0.0
    for phrase in found:
      marginal = self.getProb(phrase)
      assocPos = self.getProb2(phrase, 2) / (marginal * self.getProb(2))
      assocNeg = self.getProb2(phrase, 0) / (marginal * self.getProb(0))
      orientation += math.log2(assocPos) - math.log2(assocNeg)
    if orientation >= 0:
      return 2
    return 0

  def test(self, dataset):
    """Predict every (sentence, label) pair of `dataset`; print accuracy and weighted F1."""
    expected = []
    predicted = []
    for text, gold in tqdm(dataset):
      predicted.append(self.predict(text))
      expected.append(gold)
    hits = sum(1 for gold, guess in zip(expected, predicted) if guess == gold)
    print(f'{hits}/{len(dataset)} ~{hits / len(dataset) * 100}')
    weightedF1 = f1_score(expected, predicted, average='weighted')
    print(f'f1 score: {weightedF1}')

In [None]:
# Fit the PMI model on the training pairs, then print accuracy and weighted F1 on the test pairs.
pmiModel = PMIModel(traindata)
pmiModel.test(testdata)

100%|██████████| 10968/10968 [00:11<00:00, 984.06it/s] 


6155


100%|██████████| 2999/2999 [00:02<00:00, 1202.22it/s]

2163/2999 ~72.1240413471157
f1 score: 0.7176818017124736





## word2vec

In [None]:
from gensim.models import Word2Vec
import torch
import torch.nn.functional as torchF

class W2VModel:
  """Two-class (0 / 2) sentiment classifier built on Word2Vec phrase embeddings.

  A Word2Vec model is trained on the word-segmented training sentences. Each
  extracted phrase is embedded as the mean of its known word vectors and scored
  by its cosine similarity to 'tốt' minus its cosine similarity to 'kém'.
  """

  def __init__(self, traindata):
    """Segment every sentence of `traindata` ((sentence, label) pairs) and train Word2Vec."""
    self.sentences = []
    for sentence, label in tqdm(traindata):
      words, tags = getPos(sentence)
      self.sentences.append(words)
    self.vectorSize = 100
    self.w2v = Word2Vec(sentences=self.sentences, vector_size=self.vectorSize, window=5, min_count=1, workers=2)
    print()
    print(self.w2v.wv.vectors.shape)

  def embed(self, word):
    """Return the vector of `word` as a float32 tensor, or None if it is out of vocabulary."""
    if word in self.w2v.wv.key_to_index:
      # torch.tensor copies the vector instead of aliasing gensim's internal array
      # (torch.Tensor(ndarray) shares memory, which PyTorch warns about when the
      # source array is read-only).
      return torch.tensor(self.w2v.wv[word], dtype=torch.float32)
    return None

  def embed2(self, phrase):
    """Return the mean vector of the in-vocabulary words of `phrase`.

    When no word of the phrase is known the zero vector is returned. Dividing
    by a zero count would produce a NaN vector, and a single NaN poisons the
    whole sentence score in `predict`; the zero vector instead has cosine
    similarity 0 to everything and so contributes nothing.
    """
    res = torch.zeros(self.vectorSize)
    resLen = 0
    for word in phrase:
      wres = self.embed(word)
      if wres is None:
        continue
      res += wres
      resLen += 1
    if resLen == 0:
      return res
    return res / resLen

  def predict(self, sentence):
    """Return 2 when the summed (positive - negative) similarity is >= 0, else 0."""
    posEp = self.embed('tốt')
    negEp = self.embed('kém')
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    return 2 if so >= 0 else 0

  def test(self, dataset):
    """Predict every (sentence, label) pair of `dataset`; print accuracy and weighted F1."""
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
# Train the Word2Vec model on the training pairs, then print accuracy and weighted F1 on the test pairs.
w2vModel = W2VModel(traindata)
w2vModel.test(testdata)

100%|██████████| 10968/10968 [00:09<00:00, 1147.53it/s]



(3568, 100)


  return torch.Tensor(self.w2v.wv[word])
100%|██████████| 2999/2999 [00:03<00:00, 785.49it/s]

2284/2999 ~76.15871957319106
f1 score: 0.76117136121854





## With the neutral class

In [None]:
# Reload the training split as (sentence, sentiment) pairs, this time keeping every label
# (no sample is skipped). Note that this rebinds `traindata`.
# The files hold Vietnamese text, so they are decoded explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open('train/sents.txt', encoding='utf-8') as data, open('train/sentiments.txt', encoding='utf-8') as label:
  traindata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    traindata.append((sentence, sentiment))
print(len(traindata))
traindata[12]

In [None]:
# Reload the test split as (sentence, sentiment) pairs, this time keeping every label
# (no sample is skipped). Note that this rebinds `testdata`.
# The files hold Vietnamese text, so they are decoded explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open('test/sents.txt', encoding='utf-8') as data, open('test/sentiments.txt', encoding='utf-8') as label:
  testdata = []
  for dataline, labelline in zip(data, label):
    sentence = dataline.strip()
    sentiment = int(labelline.strip())
    testdata.append((sentence, sentiment))
print(len(testdata))
testdata[79]

In [None]:
class PMIModel:
  """Smoothed-PMI sentiment classifier that can also answer with label 1.

  Attributes filled in by the constructor:
    map1: phrase -> share of all counted phrase occurrences; the integer keys
      0 and 2 additionally hold the share of training sentences carrying that
      label.
    map2: (phrase, label) -> share of all counted phrase occurrences.
  """

  def __init__(self, traindata):
    """Count phrases over `traindata`, a sized collection of (sentence, label) pairs."""
    self.map1 = {}
    self.map2 = {}
    posCount = 0
    negCount = 0
    globalCount = 0
    for sentence, label in tqdm(traindata):
      if label == 2:
        posCount += 1
      elif label == 0:
        negCount += 1
      words, tags = getPos(sentence)
      phrases, ptags = getPhrase(words, tags)
      for p in phrases:
        globalCount += 1
        self.map1[p] = self.map1.get(p, 0) + 1
        p2 = (p, label)
        self.map2[p2] = self.map2.get(p2, 0) + 1
    print(len(self.map1))
    # Turn raw counts into shares of all phrase occurrences (only values change,
    # so updating the dicts while iterating over them is safe).
    for key, val in self.map1.items():
      self.map1[key] = val / globalCount
    for key, val in self.map2.items():
      self.map2[key] = val / globalCount
    # Class shares are counted per label. Using `1 - positive share` for label 0
    # would count sentences with any other label (e.g. 1) as negative and skew
    # every score towards the positive class.
    self.map1[0] = negCount / len(traindata)
    self.map1[2] = posCount / len(traindata)

  def getProb(self, p):
    """Return the stored share for `p` (a phrase or a class key) plus 0.01 smoothing."""
    return self.map1.get(p, 0.0) + 0.01

  def getProb2(self, p1, p2):
    """Return the stored share for the pair (`p1`, `p2`) plus 0.01 smoothing."""
    return self.map2.get((p1, p2), 0.0) + 0.01

  def predict(self, sentence):
    """Score `sentence` and return its label.

    The score sums, over the extracted phrases, log2 of the positive
    association minus log2 of the negative association. A score within 1e-8 of
    zero (including a sentence with no phrases) yields 1; otherwise 2 for a
    positive score and 0 for a negative one.
    """
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      sop2 = self.getProb2(p, 2) / (self.getProb(p) * self.getProb(2))
      sop0 = self.getProb2(p, 0) / (self.getProb(p) * self.getProb(0))
      so += math.log2(sop2) - math.log2(sop0)
    if abs(so) <= 1e-8:
      return 1
    return 2 if so >= 0 else 0

  def test(self, dataset):
    """Predict every (sentence, label) pair of `dataset`; print accuracy and weighted F1."""
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
# Fit the PMI model on the current training pairs, then print accuracy and weighted F1 on the test pairs.
pmiModel = PMIModel(traindata)
pmiModel.test(testdata)

100%|██████████| 11426/11426 [00:11<00:00, 978.03it/s] 


6279


100%|██████████| 3166/3166 [00:02<00:00, 1087.24it/s]

1952/3166 ~61.65508528111181
f1 score: 0.6249543992345439





In [None]:
from gensim.models import Word2Vec
import torch
import torch.nn.functional as torchF

class W2VModel:
  """Word2Vec-based sentiment classifier that can also answer with label 1.

  A Word2Vec model is trained on the word-segmented training sentences. Each
  extracted phrase is embedded as the mean of its known word vectors and scored
  by its cosine similarity to 'tốt' minus its cosine similarity to 'kém'.
  """

  def __init__(self, traindata):
    """Segment every sentence of `traindata` ((sentence, label) pairs) and train Word2Vec."""
    self.sentences = []
    for sentence, label in tqdm(traindata):
      words, tags = getPos(sentence)
      self.sentences.append(words)
    self.vectorSize = 100
    self.w2v = Word2Vec(sentences=self.sentences, vector_size=self.vectorSize, window=5, min_count=1, workers=2)
    print()
    print(self.w2v.wv.vectors.shape)

  def embed(self, word):
    """Return the vector of `word` as a float32 tensor, or None if it is out of vocabulary."""
    if word in self.w2v.wv.key_to_index:
      # torch.tensor copies the vector instead of aliasing gensim's internal array
      # (torch.Tensor(ndarray) shares memory, which PyTorch warns about when the
      # source array is read-only).
      return torch.tensor(self.w2v.wv[word], dtype=torch.float32)
    return None

  def embed2(self, phrase):
    """Return the mean vector of the in-vocabulary words of `phrase`.

    When no word of the phrase is known the zero vector is returned. Dividing
    by a zero count would produce a NaN vector, and a single NaN poisons the
    whole sentence score in `predict`; the zero vector instead has cosine
    similarity 0 to everything and so contributes nothing.
    """
    res = torch.zeros(self.vectorSize)
    resLen = 0
    for word in phrase:
      wres = self.embed(word)
      if wres is None:
        continue
      res += wres
      resLen += 1
    if resLen == 0:
      return res
    return res / resLen

  def predict(self, sentence):
    """Score `sentence` and return its label.

    The score sums (positive - negative) cosine similarity over the extracted
    phrases. A score within 1e-8 of zero (including a sentence with no phrases
    or only unknown words) yields 1; otherwise 2 for a positive score and 0 for
    a negative one.
    """
    posEp = self.embed('tốt')
    negEp = self.embed('kém')
    words, tags = getPos(sentence)
    phrases, ptags = getPhrase(words, tags)
    so = 0.0
    for p in phrases:
      ep = self.embed2(p)
      posSim = torchF.cosine_similarity(ep, posEp, dim=0)
      negSim = torchF.cosine_similarity(ep, negEp, dim=0)
      so += posSim - negSim
    if abs(so) <= 1e-8:
      return 1
    return 2 if so >= 0 else 0

  def test(self, dataset):
    """Predict every (sentence, label) pair of `dataset`; print accuracy and weighted F1."""
    hitCount = 0
    yTrue = []
    yPred = []
    for sentence, label in tqdm(dataset):
      predict = self.predict(sentence)
      yTrue.append(label)
      yPred.append(predict)
      if predict == label:
        hitCount += 1
    print(f'{hitCount}/{len(dataset)} ~{hitCount / len(dataset) * 100}')
    f1Score = f1_score(yTrue, yPred, average='weighted')
    print(f'f1 score: {f1Score}')

In [None]:
# Train the Word2Vec model on the current training pairs, then print accuracy and weighted F1 on the test pairs.
w2vModel = W2VModel(traindata)
w2vModel.test(testdata)

100%|██████████| 11426/11426 [00:11<00:00, 955.88it/s] 



(3655, 100)


100%|██████████| 3166/3166 [00:05<00:00, 613.50it/s]

2133/3166 ~67.37207833228048
f1 score: 0.6968596778073534



