# Assignment-03

In [None]:
%matplotlib inline

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt
from random import shuffle

In [2]:
class complete_imdb_vocab:
    """Vocabulary loaded from a one-word-per-line file, plus a stopword set.

    Attributes:
        words: Vocabulary words in file order; a word's position is its index.
        inverted_index_map: Mapping from word to its position in ``words``.
        stopwords: The NLTK English stopword list as a set.
    """

    def __init__(self, filename):
        """Read the vocabulary from ``filename`` (UTF-8) and build the lookups."""
        with open(filename, encoding="utf8") as vocab_file:
            contents = vocab_file.read()
        self.words = contents.splitlines()
        # For a repeated word the later position wins, as with a plain loop.
        self.inverted_index_map = dict(zip(self.words, range(len(self.words))))
        self.stopwords = set(stopwords.words('english'))

    def stopword_dict(self, word):
        """Return True when the string ``word`` is a stopword."""
        is_stopword = word in self.stopwords
        return is_stopword

    def indexed_stpwrds(self, index):
        """Return True when the vocabulary word at ``index`` is a stopword."""
        token = self.words[index]
        return token in self.stopwords

    def word_index(self, item):
        """Return the vocabulary index of ``item``, or -1 when it is unknown."""
        try:
            return self.inverted_index_map[item]
        except KeyError:
            return -1
    
    
    
class Features:
    """A single review: its rating, a binary label, and word-index counts.

    Attributes:
        old_rating: The rating exactly as given to the constructor.
        num_rate: 1 when the rating is above 5, otherwise -1.
        word_frequency: Copy of the ``{word index: count}`` mapping.
    """

    # Matches one "index:count" pair; group 2 is the index, group 3 the count.
    reg_obj = re.compile(r"((\d+):(\d+))")

    def __init__(self, num_rate: int, word_frequency: dict):
        self.old_rating: int = num_rate
        if num_rate > 5:
            label = 1
        else:
            label = -1
        self.num_rate: int = label
        # Copy so later changes to the caller's dict do not leak in.
        self.word_frequency: dict = word_frequency.copy()

    def __contains__(self, item):
        """Return True when word index ``item`` occurs in this review."""
        return item in self.word_frequency

    def __iter__(self):
        """Iterate over the word indices present in this review."""
        return iter(self.word_frequency)

    @classmethod
    def read(cls, line: str):
        """Build a review from a line of the form ``"<rating> <idx>:<n> ..."``.

        The first whitespace-separated field is the integer rating; every
        ``index:count`` pair found in the rest of the line becomes an entry
        of ``word_frequency``. A line holding only a rating yields an empty
        mapping.
        """
        fields = line.split(None, 1)
        rating = int(fields[0])
        payload = fields[1] if len(fields) > 1 else ''

        counts = {
            int(pair.group(2)): int(pair.group(3))
            for pair in cls.reg_obj.finditer(payload)
        }
        return cls(rating, counts)


    
    
class Collection:
    """A list of reviews with ready-made positive and negative partitions.

    Attributes:
        all: Shallow copy of the reviews passed in, in their original order.
        pos_review: Reviews whose original rating is above 5.
        neg_review: Reviews whose original rating is 5 or below.
    """

    def __init__(self, reviews: list):
        self.all = reviews.copy()
        self.pos_review = [r for r in self.all if r.old_rating > 5]
        # Features labels every rating that is not above 5 as -1, so the
        # negative partition uses the same boundary; a rating of exactly 5
        # must not fall out of both partitions.
        self.neg_review = [r for r in self.all if r.old_rating <= 5]

    def __iter__(self):
        """Iterate over every review in the collection."""
        return iter(self.all)

    @classmethod
    def read(cls, filename: str):
        """Parse ``filename`` into a collection, one review per non-blank line."""
        reviews = []
        with open(filename) as f:
            for line in f:
                # Features.read needs at least a rating field, so blank
                # lines (e.g. a trailing newline) are skipped.
                if not line.strip():
                    continue
                reviews.append(Features.read(line))

        return cls(reviews)

    def shuffle(self):
        """Return a new collection with the same reviews in random order."""
        shuffled = self.all.copy()
        # Inside the method body this name resolves to the module-level
        # random.shuffle import, not to this method.
        shuffle(shuffled)
        return Collection(shuffled)

    def total_count(self, index):
        """Return how many reviews contain word ``index``."""
        return sum(index in review for review in self.all)

    def count_positive(self, index):
        """Return how many positive reviews contain word ``index``."""
        return sum(index in review for review in self.pos_review)

    def count_negative(self, index):
        """Return how many negative reviews contain word ``index``."""
        return sum(index in review for review in self.neg_review)

    def copy(self):
        """Return a new collection over the same review objects."""
        return Collection(self.all)

def multi_vald(l: list, levels):
    """Yield ``(train, test)`` splits for k-fold cross-validation.

    The list is cut into exactly ``levels`` contiguous test folds. When the
    length is not a multiple of ``levels`` the leftover items are spread
    over the first folds, so fold sizes differ by at most one. Each yielded
    ``train`` is a new list holding everything outside the test fold; ``l``
    itself is never modified.

    Args:
        l: Items to split.
        levels: Number of folds; must be at least 1 and at most ``len(l)``.

    Raises:
        ValueError: If ``levels`` is below 1 or exceeds ``len(l)``. The
            error is raised when the generator is first advanced.
    """
    if levels < 1:
        raise ValueError(f"levels must be at least 1, got {levels}")
    if len(l) < levels:
        raise ValueError(
            f"cannot split {len(l)} items into {levels} non-empty folds"
        )

    base_size, leftover = divmod(len(l), levels)
    start = 0
    for fold in range(levels):
        # The first `leftover` folds take one extra item each.
        stop = start + base_size + (1 if fold < leftover else 0)
        test = l[start:stop]
        train = l[:start] + l[stop:]
        yield train, test
        start = stop


        
class Inputs:
    """Train / dev / test collections belonging to one cross-validation fold.

    Attributes:
        train, dev, test: Copies of the collections given to the constructor.
        all_train: A collection holding the train reviews followed by the
            dev reviews.
    """

    def __init__(self, train: Collection, dev: Collection, test: Collection):
        self.train: Collection = train.copy()
        self.dev: Collection = dev.copy()
        self.test: Collection = test.copy()
        combined = self.train.all + self.dev.all
        self.all_train: Collection = Collection(combined)

    @classmethod
    def train_input(cls, folder: str, n_splits=5):
        """Yield one ``Inputs`` per fold with a train part and a dev part.

        The labelled training file is read from the working directory,
        shuffled, and split with ``multi_vald``; the test collection of each
        yielded object is empty. ``folder`` is accepted but not used.
        """
        shuffled = Collection.read(rf'trainlabeledBow.feat').shuffle()
        folds = multi_vald(shuffled.all, n_splits)

        for fit_reviews, held_out in folds:
            yield cls(Collection(fit_reviews), Collection(held_out), Collection([]))

    @classmethod
    def fetch(cls, folder: str, n_splits=5):
        """Yield one ``Inputs`` per fold with a train part and a test part.

        The labelled training and test files are read from the working
        directory, pooled into one list, shuffled, and split with
        ``multi_vald``; the dev collection of each yielded object is empty.
        ``folder`` is accepted but not used.
        """
        labelled = Collection.read(rf'trainlabeledBow.feat')
        unseen = Collection.read(rf'testlabeledBow.feat')
        pooled = labelled.all + unseen.all
        shuffle(pooled)

        for fit_reviews, held_out in multi_vald(pooled, n_splits):
            yield cls(Collection(fit_reviews), Collection([]), Collection(held_out))

    def copy(self):
        """Return a new ``Inputs`` built from this one's three collections."""
        return Inputs(train=self.train, dev=self.dev, test=self.test)

def mean(models: list, data: list, smooth: float = 1, min_app: float = 0):
    """Average the accuracy of each model on its own dataset.

    ``models[k]`` is scored on ``data[k]`` through
    ``model.accuracy(dataset, smooth, min_app)``, and the scores are
    averaged over ``len(models)``.

    Raises:
        ZeroDivisionError: If ``models`` is empty.
    """
    running_total = 0
    for fold, model in enumerate(models):
        fold_score = model.accuracy(data[fold], smooth, min_app)
        running_total = running_total + fold_score
    return running_total / len(models)



class NBclass:
    """Naive-Bayes-style sentiment classifier built on document counts.

    For every word index seen in the training reviews the model stores how
    many positive and how many negative reviews contain it. Stopwords, as
    reported by the vocabulary object, are ignored when predicting and when
    ranking words.

    Attributes:
        data: Copy of the training collection.
        new_gen_vocab: Vocabulary used for stopword checks and word lookup.
        words: Set of all word indices seen in the training reviews.
        positive / negative: ``{word index: number of positive / negative
            reviews containing it}``.
    """

    def __init__(self, data: "Collection", new_gen_vocab: "complete_imdb_vocab"):
        self.data: "Collection" = data.copy()
        self.new_gen_vocab = new_gen_vocab

        self.calculate(self.data)

    def calculate(self, reviews: "Collection"):
        """Rebuild ``words``, ``positive`` and ``negative`` from ``reviews``."""
        self.words = set()
        for review in reviews.all:
            self.words.update(review)

        self.positive = {word: 0 for word in self.words}
        self.negative = {word: 0 for word in self.words}

        # Iterating a review yields each of its word indices once, so these
        # are per-document counts rather than term frequencies.
        for review in reviews.pos_review:
            for word in review:
                self.positive[word] += 1
        for review in reviews.neg_review:
            for word in review:
                self.negative[word] += 1

    def prediction(self, review: "Features", smooth: float = 0, min_app: float = 0):
        """Return 1 when ``review`` is scored positive, otherwise -1.

        Both scores start at the number of training reviews in their class
        and are multiplied, for every usable word, by
        ``(class count + smooth) / (total count + 2 * smooth)``.

        Args:
            review: Iterable of word indices.
            smooth: Additive smoothing applied to each class count.
            min_app: Minimum share of training reviews a word must appear
                in to be used.

        Returns:
            1 when the positive score is strictly larger, -1 otherwise
            (ties included).
        """
        pred_pos, pred_neg = len(self.data.pos_review), len(self.data.neg_review)
        min_total = len(self.data.all) * min_app
        for word in review:
            if self.new_gen_vocab.indexed_stpwrds(word):
                continue

            pos_review = self.positive.get(word, 0)
            neg_review = self.negative.get(word, 0)
            total = pos_review + neg_review
            # Words without any labelled occurrence, or below the
            # threshold, carry no evidence.
            if total == 0 or total < min_total:
                continue

            pred_pos *= (pos_review + smooth) / (total + smooth * 2)
            pred_neg *= (neg_review + smooth) / (total + smooth * 2)
            # Once either score reaches zero, decide on the current values.
            if pred_pos == 0 or pred_neg == 0:
                break

        return 1 if pred_pos > pred_neg else -1

    def accuracy(self, reviews: "Collection", smooth: float = 0, min_app: float = 0):
        """Return the share of ``reviews`` whose ``num_rate`` is predicted.

        Raises:
            ZeroDivisionError: If ``reviews`` is empty.
        """
        accurate, total = 0, 0
        for review in reviews:
            if review.num_rate == self.prediction(review, smooth, min_app):
                accurate += 1
            total += 1

        return accurate / total

    def most_freq(self, top_count=10, min_app=0.001):
        """Return the top positive and top negative words as two lists.

        Each non-stopword is scored per class by
        ``class share * class count / total count`` and the ``top_count``
        highest-scoring words of each class are returned as strings.

        Args:
            top_count: Number of words to return per class.
            min_app: Minimum share of training reviews a word must appear
                in to be ranked.

        Returns:
            ``(positive words, negative words)``. Both lists are empty when
            the model has no positive or negative training reviews.
        """
        total_pos, total_neg = len(self.data.pos_review), len(self.data.neg_review)
        labelled = total_pos + total_neg
        if labelled == 0:
            # No class shares can be computed without labelled reviews.
            return [], []
        freq_pos = total_pos / labelled
        freq_neg = total_neg / labelled
        min_total = len(self.data.all) * min_app

        pred_pos, pred_neg = {}, {}
        for word in self.words:
            if self.new_gen_vocab.indexed_stpwrds(word):
                continue

            num_pos = self.positive.get(word, 0)
            num_neg = self.negative.get(word, 0)
            count_total = num_pos + num_neg
            # A zero count would divide by zero below when min_app is 0;
            # prediction() skips such words in the same way.
            if count_total == 0 or count_total < min_total:
                continue

            str_word = self.new_gen_vocab.words[word]
            pred_pos[str_word] = freq_pos * num_pos / count_total
            pred_neg[str_word] = freq_neg * num_neg / count_total

        pos_words = sorted(pred_pos, key=pred_pos.get, reverse=True)
        neg_words = sorted(pred_neg, key=pred_neg.get, reverse=True)

        return pos_words[:top_count], neg_words[:top_count]
    

<B>New vocabulary generated using NLTK to remove all the stopwords.</B>

In [3]:
# Load the vocabulary file (one word per line) together with the NLTK English stopword set.
new_gen_vocab = complete_imdb_vocab(r'imdb.vocab')

In [21]:
# Show the stopword set that the classifier skips when predicting and ranking words.
print(f'Stopwords: {new_gen_vocab.stopwords}')

Stopwords: {'down', 'had', 'how', 'that', 'each', 'on', 'too', 'then', 'were', "haven't", 'aren', 'be', 'd', 'further', 'as', 'nor', 'doing', 'more', "doesn't", 'above', 'are', 'about', 'yourself', "didn't", 'his', 'to', 'these', 'they', 'by', 'needn', 'once', 'an', 'because', 'doesn', 'haven', "shan't", 'at', 'them', 'theirs', 'over', 'until', 'before', "should've", 'which', 'what', 'been', 'you', 'me', 'did', 'y', 'him', 'hadn', "you've", 'she', 'do', 'couldn', 'shouldn', 'and', 'there', 'hers', 'its', 'won', "she's", "that'll", 'weren', 'between', 'ma', 'below', "it's", 'our', 'all', 'most', 'when', 'but', 'own', 'he', "mustn't", 'so', 'their', "couldn't", 'of', "needn't", 'here', 'mightn', 'out', 'through', 'shan', 'has', 'for', 'both', 'don', 'why', 'yourselves', 'now', 'her', 'same', 'your', 'having', 'themselves', 'a', 'herself', 'm', "mightn't", 'into', 'hasn', 'very', 'whom', 'this', 's', "you'll", 'during', 'than', "weren't", 't', 'll', 'such', 'off', "shouldn't", "wasn't", '

<B>Models generated for 5-fold cross-validation</B>

In [22]:
# Number of cross-validation folds.
levels = 5
# One Inputs object per fold; the folder argument is currently ignored by train_input.
dataset = list(Inputs.train_input('aclImdb', levels))
# Train one classifier per fold, on that fold's train part only.
models = [NBclass(x.train, new_gen_vocab) for x in dataset]
print(f'{len(models)} models created for cross-validation')

5 models created for cross-validation


In [7]:
# all_train holds the train and dev reviews of the first fold together.
reviews = dataset[0].all_train
# Vocabulary indices of the probe words (word_index returns -1 for an unknown word).
index_the = new_gen_vocab.word_index('the')
index_magnificent = new_gen_vocab.word_index('magnificent')
index_poor = new_gen_vocab.word_index('poor')

<B>Calculating Probability for "the" ; P[the] = num of documents containing ‘the’ / all documents</B>

In [23]:
# Share of all reviews that contain "the" at least once.
print(f'P["the"] = {reviews.total_count(index_the) / len(reviews.all):0.4f}')

P["the"] = 0.9917


Also calculating the probability of an extremely positive word and an extremely negative word, <i>"magnificent"</i> and <i>"poor"</i> respectively, just to see the results.

In [8]:
# Share of all reviews that contain each probe word at least once.
print(f'P["magnificent"] = {reviews.total_count(index_magnificent) / len(reviews.all):0.4f}')
print(f'P["poor"] = {reviews.total_count(index_poor) / len(reviews.all):0.4f}')

P["magnificent"] = 0.0097
P["poor"] = 0.0635


Calculating <B>P[“the” | Positive]</B> = # of positive documents containing “the” / num of all positive documents.

Calculating <B>P[“the” | Negative]</B> = # of negative documents containing “the” / num of all negative documents.

In [24]:
# Each conditional probability is normalised by the size of its own class:
# positive counts by the number of positive reviews, negative counts by the
# number of negative reviews.
print(f'P["the" | Positive] = {reviews.count_positive(index_the) / len(reviews.pos_review):0.4f}')
print(f'P["the" | Negative] = {reviews.count_negative(index_the) / len(reviews.neg_review):0.4f}')

P["the" | Positive] = 0.9905
P["the" | Negative] = 0.9929


Doing the same for <i>"magnificent"</i> and <i>"poor"</i>

In [10]:
# Each conditional probability is normalised by the size of its own class:
# positive counts by the number of positive reviews, negative counts by the
# number of negative reviews.
print(f'P["magnificent" | Positive] = {reviews.count_positive(index_magnificent) / len(reviews.pos_review):0.4f}')
print(f'P["magnificent" | Negative] = {reviews.count_negative(index_magnificent) / len(reviews.neg_review):0.4f}')

print(f'P["poor" | Positive] = {reviews.count_positive(index_poor) / len(reviews.pos_review):0.4f}')
print(f'P["poor" | Negative] = {reviews.count_negative(index_poor) / len(reviews.neg_review):0.4f}')

P["magnificent" | Positive] = 0.0159
P["magnificent" | Negative] = 0.0035
P["poor" | Positive] = 0.0294
P["poor" | Negative] = 0.0975


<B>Calculating the average accuracy of the models on the development data without any smoothing and with zero minimum appearance.</B>

In [11]:
# Dev part of every fold, in the same order as the models.
dev_data = [x.dev for x in dataset]
# Baseline: no smoothing and no minimum-appearance filter.
accuracy = mean(models, dev_data, smooth=0, min_app=0)
print(f'Average accuracy for dev data = {accuracy:0.4f}')

Average accuracy for dev data = 0.7514


<B>Calculating the average accuracy on development data without any smoothing but minimum 5 appearances of a word.</B>

In [12]:
dev_data = [x.dev for x in dataset]
# min_app is a fraction of each model's training reviews (the threshold is
# len(training reviews) * min_app); 0.00025 equals 5 appearances assuming
# 20,000-review training folds — TODO confirm the fold size.
accuracy = mean(models, dev_data, smooth=0, min_app=0.00025)
print(f'Average accuracy = {accuracy:0.4f}')

Average accuracy = 0.8343


<B>Calculating the average accuracy on development data while experimenting with smoothing hyperparameters in range <i>[0, 1]</i> with step size <i>"0.1"</i></B>

In [13]:
# Sweep the smoothing value over 0.0, 0.1, ..., 1.0 with min_app fixed at 0.00025.
hyp_par = {}
# x / 10 yields the float closest to each tenth, whereas x * 0.1 produces
# keys such as 0.30000000000000004 that cannot be looked up as 0.3.
for i in (x / 10 for x in range(0, 11)):
    hyp_par[i] = mean(models, dev_data, smooth=i, min_app=0.00025)
smoothing_accuracies = pd.DataFrame.from_dict(hyp_par, orient='index', columns=['Accuracy'])
smoothing_accuracies

Unnamed: 0,Accuracy
0.0,0.83432
0.1,0.85536
0.2,0.85596
0.3,0.85624
0.4,0.85636
0.5,0.85644
0.6,0.85648
0.7,0.85676
0.8,0.85696
0.9,0.8574


<i>The accuracy values barely change beyond a smoothing value of <b>"0.1"</b>, so it is clear that <b>additional smoothing does not have a drastic effect on accuracy</b>.</i>

<B>For calculating the final accuracy on our test data, the smoothing value is set to <i>"0.5"</i> and the minimum appearance is set to <i>"5"</i> times.</B>

In [16]:
# Inputs.fetch pools the labelled train and test files, shuffles them and
# yields one train/test split per fold (the folder argument is ignored).
dataset = list(Inputs.fetch('aclImdb', levels))
models = [NBclass(x.train, new_gen_vocab) for x in dataset]
test_data = [x.test for x in dataset]
# NOTE(review): min_app is a fraction of the training fold, and the folds here
# are built from the pooled train + test reviews, so 0.00025 presumably means
# more than 5 appearances in this cell — confirm this is intended.
accuracy = mean(models, test_data, smooth=0.5, min_app=0.00025)
print(f'Final accuracy on test_data = {accuracy:.4f}')

Final accuracy on test_data = 0.8566


<B>Lastly, the TOP 10 Positive and Negative words</B>

In [19]:
# Rank words with the first fold's model, keeping only words that appear in
# at least 0.25% of its training reviews.
pos_words, neg_words = models[0].most_freq(top_count=10, min_app=0.0025)
print(f'Top 10 Positive words:\n{pos_words}\n\n')
print(f'Top 10 Negative words:\n{neg_words}')

Top 10 Positive words:
['flawless', 'superbly', 'captures', 'wonderfully', 'must-see', 'timeless', 'understated', 'perfection', 'loneliness', 'underrated']


Top 10 Negative words:
['stinker', 'ugh', 'incoherent', 'waste', 'unfunny', 'unwatchable', 'wasting', 'atrocious', 'sub-par', 'redeeming']
