In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install datasets
from datasets import load_dataset
dataset_train = load_dataset('imdb', split='train')
dataset_test = load_dataset('imdb', split='test')



Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [None]:
import spacy

# loading the small English model
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "ner"])

In [None]:
x_train, y_train, x_test, y_test = dataset_train[:]['text'], dataset_train[:]['label'], dataset_test[:]['text'], dataset_test[:]['label']
len(x_train)

25000

In [None]:
from tqdm import tqdm
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
nltk.download('punkt')

def stem(l):
  res = []
  re_word = re.compile(r"^\w+$")
  stemmer = SnowballStemmer("english")
  for text in tqdm(l, total=len(l)):
    res.append(" ".join([stemmer.stem(word) for word in word_tokenize(text.lower()) if re_word.match(word)]))
  return res

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
stemmed_train = stem(x_train)
stemmed_test = stem(x_test)

100%|██████████| 25000/25000 [02:02<00:00, 203.93it/s]
100%|██████████| 25000/25000 [01:59<00:00, 208.71it/s]


In [None]:
def lemm(l):
  lemmas = []
  re_word = re.compile(r"^\w+$")
  for text in tqdm(l, total=len(l)):
    lemmas.append(' '.join([token.lemma_ for token in nlp(text.lower()) if re_word.match(token.text)]))
  return lemmas

In [None]:
lemmas_train = lemm(x_train)
lemmas_test = lemm(x_test)

100%|██████████| 25000/25000 [00:59<00:00, 419.19it/s]
100%|██████████| 25000/25000 [00:58<00:00, 428.44it/s]


In [None]:
!wget https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
lexicon = pd.read_csv("vader_lexicon.txt", sep="\t", names=['word', 'MEAN-SENTIMENT-RATING', 'a', 'b']).drop(['a', 'b'], axis = 'columns')
d = {}
for w, v in lexicon.iterrows():
    d[v[0]] = v[1]

--2021-10-04 08:00:03--  https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 426786 (417K) [text/plain]
Saving to: ‘vader_lexicon.txt.2’


2021-10-04 08:00:03 (11.2 MB/s) - ‘vader_lexicon.txt.2’ saved [426786/426786]



In [None]:
df_train_lemma = pd.DataFrame(list(zip(lemmas_train, y_train)), columns=['val', 'label'])
df_test_lemma = pd.DataFrame(list(zip(lemmas_test, y_test)), columns=['val', 'label'])

df_train_stem = pd.DataFrame(list(zip(stemmed_train, y_train)), columns=['val', 'label'])
df_test_stem = pd.DataFrame(list(zip(stemmed_test, y_test)), columns=['val', 'label'])

df_train = pd.DataFrame(list(zip(x_train, y_train)), columns=['val', 'label'])
df_test = pd.DataFrame(list(zip(x_test, y_test)), columns=['val', 'label'])
df_train.head(3), df_test.head(3)

(                                                 val  label
 0  Bromwell High is a cartoon comedy. It ran at t...      1
 1  Homelessness (or Houselessness as George Carli...      1
 2  Brilliant over-acting by Lesley Ann Warren. Be...      1,
                                                  val  label
 0  I went and saw this movie last night after bei...      1
 1  Actor turned director Bill Paxton follows up h...      1
 2  As a recreational golfer with some knowledge o...      1)

In [None]:
get_features(df_train)
get_features(df_test)

get_features(df_train_lemma)
get_features(df_test_lemma)

get_features(df_train_stem)
get_features(df_test_stem)
df_train

NameError: ignored

In [None]:
split_df = df_train['val'].str.split("[ .,\"]")
split_df

0        [Bromwell, High, is, a, cartoon, comedy, , It,...
1        [Homelessness, (or, Houselessness, as, George,...
2        [Brilliant, over-acting, by, Lesley, Ann, Warr...
3        [This, is, easily, the, most, underrated, film...
4        [This, is, not, the, typical, Mel, Brooks, fil...
                               ...                        
24995    [Towards, the, end, of, the, movie, , I, felt,...
24996    [This, is, the, kind, of, movie, that, my, ene...
24997    [I, saw, 'Descent', last, night, at, the, Stoc...
24998    [Some, films, that, you, pick, up, for, a, pou...
24999    [This, is, one, of, the, dumbest, films, , I'v...
Name: val, Length: 25000, dtype: object

In [71]:
df_train_1 = df_train_lemma[df_train_lemma['label'] == 1]
split_df = df_train_1['val'].str.split("[ .,\"]")
test = np.hstack(split_df)
unique, counts = np.unique(test, return_counts=True)
counts.sum()

2895864

In [58]:
def occurences_and_vocabulary(x_train, y_train, classes):
  dictionnary = {}
  for c in classes:
    dictionnary[c] = {}
  c = -1
  vocabulary = []
  for i in range(len(y_train)):
    c = y_train[i]
    splitted_doc = re.split("[ .,\"]", x_train[i])
    for word in splitted_doc:
      vocabulary.append(word)
      if word not in dictionnary[c]:
        dictionnary[c][word] = 1
      else:
        dictionnary[c][word] += 1
  vocabulary = list(set(vocabulary))
  return dictionnary, vocabulary

dictionnary, vocabulary = occurences_and_vocabulary(lemmas_train, y_train, [0, 1])
# print(vocabulary)

In [None]:
class NaiveBayes():
    def __init__(self, alpha=1.0):
        self.prior = None
        self.word_counts = None
        self.lk_word = None
        self.alpha = alpha
        
    def fit(self, x, y):
        '''
        Fit the features and the labels
        Calculate prior, word_counts and lk_word
        '''
        x, y = check_X_y(x, y)
        n = x.shape[0]
        
        # calculate the prior - number of text belonging to a particular class (real or fake)
        x_per_class = np.array([x[y == c] for c in np.unique(y)])
        self.prior = np.array([len(x_class) / n for x_class in x_per_class])
        
        # calculate the likelihood for each word 'lk_word'
        self.word_counts = np.array([sub_arr.sum(axis=0) for sub_arr in x_per_class]) + self.alpha
        self.lk_word = self.word_counts / self.word_counts.sum(axis=1).reshape(-1, 1)
        
        return self
    
    def _get_class_numerators(self, x):
        '''
        Calculate for each class, the likelihood that an entire message conditional
        on the message belonging to a particular class (real or fake)
        '''
        n, m = x.shape[0], self.prior.shape[0]
        
        class_numerators = np.zeros(shape=(n, m))
        for i, word in enumerate(x):
            word_exists = word.astype(bool)
            lk_words_present = self.lk_word[:, word_exists] ** word[word_exists]
            lk_message = (lk_words_present).prod(axis=1)
            class_numerators[i] = lk_message * self.prior
        
        return class_numerators
    
    def _normalized_conditional_probs(self, class_numerators):
        '''
        Conditional probabilities = class_numerators / normalize_term
        '''
        # normalize term is the likelihood of an entire message (addition of all words in a row)
        normalize_term = class_numerators.sum(axis=1).reshape(-1,1)
        conditional_probs = class_numerators / normalize_term
        assert(conditional_probs.sum(axis=1) - 1 < 0.001).all(), 'rows should sum to 1'
        
        return conditional_probs
    
    def predict_proba(self, x):
        '''
        Return the probabilities for each class (fake or real)
        '''
        class_numerators = self._get_class_numerators(x)
        conditional_probs = self._normalized_conditional_probs(class_numerators)
        
        return conditional_probs
    

    def predict(self, x):
        '''
        Return the answer with the highest probability (argmax)
        '''
        return self.predict_proba(x).argmax(axis=1)

In [None]:
vocabulary = []
_ = [vocabulary.extend(x.split()) for i,x in enumerate(df_train_lemma['val'])]
vocabulary = np.array(vocabulary)
vocab = np.unique(vocabulary)
print(vocab)
len(vocab)

['-PRON-' '0' '00' ... 'über' 'üvegtigris' '\ufeff1']


58159

In [None]:
X_train

In [None]:
def awdaw(x_train):
  vocabulary = []
  for i in range(len(x_train)):
    splitted_doc = re.split("[ .,\"]", x_train[i])
    for w in splitted_doc:
      if w not in vocabulary:
        vocabulary.append(w)
  return vocabulary

m = awdaw(x_train)

KeyboardInterrupt: ignored

AttributeError: ignored

In [59]:
def sum_counts(D):
    dicts = D.values()
    iterator = iter(dicts)
    sum_pos = sum(next(iterator).values())
    sum_neg = sum(next(iterator).values())
    return sum_pos, sum_neg

(2822436, 2895864)

In [113]:
def train_naive_bayes2(D, C):
    logprior = dict()
    bigdoc = dict()
    (data, target) = D
    count, vocabulary = occurences_and_vocabulary(data, target, C)
    ndoc = len(target)
    loglikelihood = dict()
    sum_pos, sum_neg = sum_counts(count)
    for c in C:
      nc = np.count_nonzero(target)
      logprior[c] = np.log(nc/ndoc)

      for w in vocabulary:
          if not c in loglikelihood:
              loglikelihood[c] = {}
              # somme des counts de w dans V + 1
          if w not in count[c]:
              loglikelihood[c][w] = 0
              continue
          if c == 0:
              loglikelihood[c][w] = np.log((count[c][w] + 1) / (sum_neg + 1))
          else:
              loglikelihood[c][w] = np.log((count[c][w] + 1) / (sum_pos + 1))
    return logprior, loglikelihood, vocabulary


logprior, loglikelihood, vocabulary = train_naive_bayes2((lemmas_train, y_train), [0, 1])

In [81]:
loglikelihood

{0: {'': -11.934355436212643,
  'gunghroo': -14.18564723481914,
  'townfolks': -14.18564723481914,
  'disaffected': -13.780182126710974,
  'castigate': -13.492500054259194,
  'innovator': -13.492500054259194,
  'summarily': -13.087034946151029,
  '130': -14.18564723481914,
  'kurupt': -13.087034946151029,
  'keanu': -11.620697877357602,
  'photochemical': -14.18564723481914,
  'uriah': -13.780182126710974,
  'floozy': -12.932884266323772,
  'kady': -13.492500054259194,
  'understandable': -11.241208255652698,
  'lejanos': -13.492500054259194,
  'ryuhei': -13.269356502944984,
  'romper': -13.492500054259194,
  'cryptically': -14.18564723481914,
  'autobiographical': -12.799352873699249,
  'circa': -11.883062141825093,
  'hoffa': -14.18564723481914,
  'ambrose': -14.18564723481914,
  'cavity': -13.269356502944984,
  'undifferentiated': -14.18564723481914,
  'awwww': -13.780182126710974,
  'welisch': -14.18564723481914,
  'hypothetical': -13.780182126710974,
  'farra': -14.18564723481914,

In [None]:
loglikelihood

{0: {'bromwell': 0,
  'high': -7.7638680990915985,
  'be': -3.11294663627239,
  'a': -3.2521565969755737,
  'cartoon': -8.899867919831328,
  'comedy': -7.4500501630290215,
  'it': -4.292699731180564,
  'run': -7.483510533592703,
  'at': -5.449344545616869,
  'the': -2.874711150314815,
  'same': -7.21829057637357,
  'time': -5.937544708408467,
  'some': -5.842808444742665,
  'other': -6.357550362829989,
  'program': -9.796865448770804,
  'about': -5.748353644717995,
  'school': -8.045176310419187,
  'life': -7.090515205579043,
  'such': -7.019511030458009,
  'teacher': -9.506003723401644,
  'my': -6.192857219862223,
  '35': -11.115441635835744,
  'year': -6.968534743522788,
  'in': -4.180441834747041,
  'teach': -9.611364239059471,
  'profession': -11.451913872456958,
  'lead': -7.599640871434585,
  'me': -6.244798469335391,
  'to': -3.7114668854769786,
  'believe': -7.424184059316841,
  'that': -4.2637529299938155,
  'satire': -10.016829347167635,
  'much': -5.203290895447488,
  'close

In [134]:
def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_ = [0, 0]
    for c in C:
        sum_[c] = logprior[c]
        for word in testdoc:
          try:
            sum_[c] += loglikelihood[c][word]
          except:
            pass
    return np.argmax(sum_)

In [139]:
accuracy = 0
print(len(split_df))
split_df = df_test_lemma['val'].str.split("[ .,\"]")
for i in range(len(split_df)):
    var = test_naive_bayes(split_df[i], logprior, loglikelihood, [0, 1], vocabulary)
    if var == y_test[i]:
        accuracy += 1
accuracy /= len(split_df)

25000


In [140]:
accuracy

0.69472

In [126]:
df_test_lemma

Unnamed: 0,val,label
0,i go and see this movie last night after be co...,1
1,actor turn director bill paxton follow up his ...,1
2,a a recreational golfer with some knowledge of...,1
3,i see this film in a sneak preview and it be d...,1
4,bill paxton have take the true story of the 19...,1
...,...,...
24995,i occasionally let my kid watch this garbage s...,0
24996,when all we have anymore be pretty much realit...,0
24997,the basic genre be a thriller intercut with a ...,0
24998,four thing intrigue me a to this film firstly ...,0


In [92]:
var = test_naive_bayes(split_df[0], logprior, loglikelihood, [0, 1], vocabulary)

i
[-5.016311165647023, 0]
go
[-10.965802738783037, 0]
and
[-14.637205493175115, 0]
see
[-20.190101195943743, 0]
this
[-24.418246289920518, 0]
movie
[-29.044623123397187, 0]
last
[-36.62362017201811, 0]
night
[-44.47865563265875, 0]
after
[-51.13893247054824, 0]
be
[-54.2775622680806, 0]
coax
[-67.36459721423162, 0]
to
[-71.10174726096858, 0]
by
[-76.72561042104361, 0]
a
[-80.00345017927916, 0]
few
[-87.20068423209871, 0]
friend
[-94.83668072468404, 0]
of
[-98.57522334747615, 0]
mine
[-108.5487429844168, 0]
-PRON-
[-113.50547346168682, 0]
admit
[-122.47890602901133, 0]
that
[-126.76834212026512, 0]
i
[-131.0915061053522, 0]
be
[-134.23013590288457, 0]
reluctant
[-145.85083378024217, 0]
to
[-149.58798382697913, 0]
see
[-155.14087952974776, 0]
it
[-159.4592624221883, 0]
because
[-165.83394888569978, 0]
from
[-171.53875575856847, 0]
what
[-177.41325044538593, 0]
i
[-181.73641443047302, 0]
know
[-188.15078371997453, 0]
of
[-191.88932634276662, 0]
ashton
[-204.97636128891764, 0]
kutcher
[-21

In [93]:
df_test_lemma

Unnamed: 0,val,label
0,i go and see this movie last night after be co...,1
1,actor turn director bill paxton follow up his ...,1
2,a a recreational golfer with some knowledge of...,1
3,i see this film in a sneak preview and it be d...,1
4,bill paxton have take the true story of the 19...,1
...,...,...
24995,i occasionally let my kid watch this garbage s...,0
24996,when all we have anymore be pretty much realit...,0
24997,the basic genre be a thriller intercut with a ...,0
24998,four thing intrigue me a to this film firstly ...,0


In [94]:
var

1