# First Part of Project

In [163]:
import numpy as np
from tqdm import tqdm

### Importing IMDB sentiment dataset

In [164]:
import keras
from keras.datasets import imdb

In [165]:
# LOAD IMDB DATA

(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [166]:
print("train_data ", x_train.shape)
print("train_labels ", y_train.shape)
print("_"*100)
print("test_data ", x_test.shape)
print("test_labels ", y_test.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in x_train]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in x_train]))

train_data  (25000,)
train_labels  (25000,)
____________________________________________________________________________________________________
test_data  (25000,)
test_labels  (25000,)
____________________________________________________________________________________________________
Maximum value of a word index 
88586
Maximum length num words of review in train 
2494


In [167]:
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()

In [168]:
# Reverse the word index to obtain a dict mapping indices to words
inverted_word_index = dict((i, word) for (word, i) in word_index.items())
# Decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])

In [169]:
decoded_sequence

"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but wh

### Naive Bayes classifier

In [170]:
def mean(occurence):
    return sum(occurence)/float(len(occurence))

In [171]:
# Find the review that are belonging to c class
def get_review_from_c(dataset, _class):
    L = []
    for i in range(len(dataset)):
        if (y_train[i] == _class):
            L.append(x_train[i])
    return L

In [295]:
def get_reviews_from_c_opti(x_train, y_train, _class, V):
    reviews = x_train[y_train == _class]
    
    for i in range (len(reviews)):
        
        # Decode sequence in the dataset
        decoded_review = " ".join(V[i] for i in reviews[i])
        reviews[i] = decoded_review
        
    return reviews

In [316]:
def get_number_occurence_in_bigdoc(bigdoc, word, V):
    count = 0
    
    for review in bigdoc:
        
        # Binary naive Bayes ?
        if word in review:
            count += review.count(word)
            
    return count

In [317]:
def get_number_word(dataset):
    count = 0
    for review in dataset:
        count += len(review)
        
    return count

In [322]:
def remove_stop_words(dataset):
    new_vocab = dict()
    for value, key in dataset.items():
        if key in ["the", "a", "le", "la", "un", "une"]:
            continue
        else:
            new_vocab[key] = value
    return new_vocab

In [323]:
def train_naive_bayes(dataset, classes):
    V = keras.datasets.imdb.get_word_index()

    logPrior = dict()
    bigdoc = dict()
    
    logLikelihood = np.zeros((len(V), len(classes)))
    
    # Preprocessing 
    V = remove_stop_words(V)
    
    
    
    for _class in classes:
        N_doc = len(dataset)
        review_from_c = get_reviews_from_c_opti(x_train, y_train, _class, V)
        N_c = len(review_from_c)
        logPrior[_class] = np.log(N_c / N_doc)
        
        bigdoc[_class] = review_from_c
        sum_occurence = get_number_word(review_from_c)
        count = 0
        for key, value in tqdm(V.items()):
            
            count_ = get_number_occurence_in_bigdoc(review_from_c, value, V)
            
            logLikelihood[key, _class] = np.log((count_ + 1) / (sum_occurence + 1))
            
            count += 1
            
    return (logPrior, logLikelihood, V)
            

In [324]:
train_naive_bayes(x_train, [0,1])

  0%|          | 191/88584 [00:04<35:52, 41.06it/s]


KeyboardInterrupt: 

In [321]:
def test_naive_bayes(testdoc, logPrior, logLikelihood, classes, vocabulary):
    _sum = dict()
    
    for _class in classes:
        _sum[_class] = logPrior[_class]