In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install datasets
from datasets import load_dataset
dataset_train = load_dataset('imdb', split='train')
dataset_test = load_dataset('imdb', split='test')

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


In [None]:
import spacy

# Load the small English model. Only the parser and NER are switched off:
# `lemm` reads `token.lemma_`, and under spaCy v3 that attribute stays empty
# when the lemmatizer (and the tagger it relies on) is disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Slice each split once and reuse the result for both columns, instead of
# re-materialising the whole split for every column that is read from it.
train_columns = dataset_train[:]
test_columns = dataset_test[:]
x_train, y_train = train_columns['text'], train_columns['label']
x_test, y_test = test_columns['text'], test_columns['label']
len(x_train)

In [None]:
from tqdm import tqdm
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
# Download NLTK's 'punkt' resource (presumably required by `word_tokenize`,
# which `stem` calls — confirm against the NLTK version in use).
nltk.download('punkt')

def stem(l):
  """Return one stemmed, space-joined string per input text.

  Each text is lower-cased and tokenised with `word_tokenize`; tokens that do
  not match `^\\w+$` are dropped, and the remaining ones are reduced with the
  English Snowball stemmer.

  Args:
    l: sized iterable of strings (its length drives the progress bar).

  Returns:
    A list of strings, in the same order as `l`.
  """
  word_pattern = re.compile(r"^\w+$")
  snowball = SnowballStemmer("english")

  def stem_text(text):
    # Tokenise the lower-cased text, keep word-like tokens, stem and re-join.
    tokens = word_tokenize(text.lower())
    return " ".join(snowball.stem(tok) for tok in tokens if word_pattern.match(tok))

  return [stem_text(text) for text in tqdm(l, total=len(l))]

In [None]:
# Stem every review of both splits; each call shows its own progress bar.
stemmed_train = stem(x_train)
stemmed_test = stem(x_test)

In [None]:
def lemm(l):
  """Return one lemmatised, space-joined string per input text.

  Each text is lower-cased and run through the module-level spaCy pipeline
  `nlp`; tokens whose text does not match `^\\w+$` are dropped and the lemma
  of every remaining token is kept.

  Args:
    l: sized iterable of strings (its length drives the progress bar).

  Returns:
    A list of strings, in the same order as `l`.
  """
  word_pattern = re.compile(r"^\w+$")

  def lemmatize_text(text):
    # Filter on the surface form of the token, but emit its lemma.
    doc = nlp(text.lower())
    return ' '.join(tok.lemma_ for tok in doc if word_pattern.match(tok.text))

  return [lemmatize_text(text) for text in tqdm(l, total=len(l))]

In [None]:
# Lemmatise every review of both splits; each call shows its own progress bar.
lemmas_train = lemm(x_train)
lemmas_test = lemm(x_test)

In [None]:
!wget https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
lexicon = pd.read_csv("vader_lexicon.txt", sep="\t", names=['word', 'MEAN-SENTIMENT-RATING', 'a', 'b']).drop(['a', 'b'], axis = 'columns')
d = {}
for w, v in lexicon.iterrows():
    d[v[0]] = v[1]

--2021-10-03 12:44:58--  https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 426786 (417K) [text/plain]
Saving to: ‘vader_lexicon.txt.1’


2021-10-03 12:44:58 (11.9 MB/s) - ‘vader_lexicon.txt.1’ saved [426786/426786]



In [None]:
def occurences_and_vocabulary(x_train, y_train, classes):
  """Count word occurrences per class and collect the vocabulary.

  Documents are split on spaces, dots, commas and double quotes. Empty
  strings produced by consecutive delimiters are ignored, so they are neither
  counted nor added to the vocabulary.

  Args:
    x_train: sequence of document strings.
    y_train: sequence of class labels, aligned with `x_train`.
    classes: iterable of every label that may appear in `y_train`.

  Returns:
    A tuple `(dictionnary, vocabulary)` where `dictionnary[c][word]` is the
    number of occurrences of `word` in documents of class `c`, and
    `vocabulary` is the list of distinct words in first-seen order.

  Raises:
    KeyError: if a label of `y_train` is not listed in `classes`.
  """
  dictionnary = {c: {} for c in classes}
  # A dict is used as an insertion-ordered set: each distinct word is kept
  # once instead of once per occurrence.
  seen = {}
  for doc, c in zip(x_train, y_train):
    class_counts = dictionnary[c]
    for word in re.split(r'[ .,"]', doc):
      if not word:
        continue
      seen[word] = None
      class_counts[word] = class_counts.get(word, 0) + 1
  return dictionnary, list(seen)

# dictionnary, vocabulary = occurences_and_vocabulary(lemmas_train, y_train, [0, 1])

In [None]:
def sum_counts(D):
    """Return the total occurrence count of the first two entries of `D`.

    Args:
        D: mapping whose values are themselves mappings of word -> count.
           Only the first two values, in iteration order, are used.

    Returns:
        A 2-tuple with the sum of counts of the first and of the second value.

    Raises:
        StopIteration: if `D` holds fewer than two entries.
    """
    per_class_counts = iter(D.values())
    totals = [sum(next(per_class_counts).values()) for _ in range(2)]
    return totals[0], totals[1]

In [None]:
def train_naive_bayes(D, C):
    """Fit a multinomial naive Bayes model with add-one (Laplace) smoothing.

    Args:
        D: tuple `(data, target)` of document strings and their class labels.
        C: iterable of class labels to model.

    Returns:
        A tuple `(logprior, loglikelihood, vocabulary)`:
        - `logprior[c]` is log(number of documents of class c / number of
          documents); a class without any document gets -inf.
        - `loglikelihood[c][w]` is
          log((count(w, c) + 1) / (total word count of c + |V|)) and is
          defined for every word `w` of the vocabulary.
        - `vocabulary` is the list of distinct words seen in `data`.

    Raises:
        ZeroDivisionError: if `target` is empty.
    """
    (data, target) = D
    count, vocabulary = occurences_and_vocabulary(data, target, C)
    # Keep each word once (first-seen order) so that |V| is the number of
    # distinct words and the loop below does not revisit repeated tokens.
    vocabulary = list(dict.fromkeys(vocabulary))
    vocabulary_size = len(vocabulary)
    ndoc = len(target)
    logprior = dict()
    loglikelihood = dict()
    for c in C:
        # Number of training documents that actually carry label c.
        nc = sum(1 for label in target if label == c)
        logprior[c] = np.log(nc / ndoc)
        class_counts = count[c]
        # Sum of the counts of every word in class c, plus one pseudo-count
        # per vocabulary word.
        denominator = sum(class_counts.values()) + vocabulary_size
        # Words never seen in class c get the smoothed probability
        # 1 / denominator rather than a log-likelihood of 0 (probability 1).
        loglikelihood[c] = {
            w: np.log((class_counts.get(w, 0) + 1) / denominator)
            for w in vocabulary
        }
    return logprior, loglikelihood, vocabulary


# Fit the classifier on the lemmatised training reviews. The function defined
# in this notebook is `train_naive_bayes`; `train_naive_bayes2` is not defined
# here and would raise NameError on a fresh kernel.
logprior, loglikelihood, vocabulary = train_naive_bayes((lemmas_train, y_train), [0, 1])

0
12500
25000
1
12500
25000


In [None]:
def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_ = [0, 0]
    for c in C:
        sum_[c] = logprior[c]
        for word in testdoc:
            if word in V:
                sum_[c] += loglikelihood[c][word]
    return np.argmax(sum_)

In [None]:
# Score the classifier on the test split. A tqdm progress bar replaces the
# per-document print, which flooded the cell output.
correct = 0
for i in tqdm(range(len(x_test))):
    # Each lemmatised review is a single space-joined string; split it so the
    # classifier walks over words rather than individual characters.
    var = test_naive_bayes(lemmas_test[i].split(), logprior, loglikelihood, [0, 1], vocabulary)
    if var == y_test[i]:
        correct += 1
# Fraction of test reviews whose predicted label matches the reference label.
accuracy = correct / len(x_test)

0
[-5585.247989409187, -5678.753159740923]
1
[-13958.217882890132, -14216.797873996173]
2
[-10021.002107287055, -10182.291490542606]
3
[-4677.760572049008, -4763.57465068901]
4
[-8651.044405629782, -8803.517715673326]
5
[-12409.14639399683, -12640.862393161175]
6


KeyboardInterrupt: ignored

In [None]:
# Display the accuracy computed by the evaluation loop.
accuracy