<a href="https://colab.research.google.com/github/SamaSamrin/NLP-Bag-of-Words-Implementation-Attempt/blob/main/Final_BOW_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from typing import List


### Enter your ID and name in the variables below. Both must be strings.
### They are printed in the pass/fail report produced by the test harness.
ID = "1191609"
NAME = "Sama Samrin"



class CustomBagOfWords:
    """Bag-of-words vectorizer for a pre-tokenized corpus.

    ``create_vector`` is the entry point: it builds the sorted vocabulary,
    maps every vocabulary word to its column index, and returns one count
    vector per sentence in ``data``.
    """

    def __init__(self, data: List[List[str]]):
        # data is a list of sentences, each a list of words,
        # for example [["it", "consist", "of", "words"]].
        self.data = data
        # Maps each vocabulary word to its index (column) in the vocab.
        self.word_to_index = {}
        # Holds the unique words of the dataset; becomes a sorted list
        # once _build_vocab has run.
        self.vocab = set()

    def _build_vocab(self):
        """Build the vocabulary from ``self.data``.

        The vocabulary is the alphabetically sorted list of unique words in
        the dataset and is stored in ``self.vocab``.
        Note: only words whose length is >= 2 are included.
        """
        unique_words = set()
        for sentence in self.data:
            for token in sentence:
                # A token that still contains spaces contributes each of its
                # space-separated parts; empty parts fail the length filter.
                for word in token.split(" "):
                    if len(word) >= 2:
                        unique_words.add(word)

        self.vocab = unique_words
        #Implement your logic above this line
        #Do not modify the below line
        self.vocab = sorted(list(self.vocab))

    def create_index_map(self):
        """Map every vocabulary word to its index in ``self.vocab``.

        The map is always rebuilt from scratch, so an empty vocabulary yields
        an empty map instead of keeping entries from an earlier run.
        """
        self.word_to_index = {
            word: index for index, word in enumerate(self.vocab)
        }

    def bow(self, input_sent: List[str]) -> np.ndarray:
        """Return the bag-of-words count vector for one tokenized sentence.

        Args:
            input_sent: words of the sentence, e.g. ["is", "this", "a", "sent"].

        Returns:
            A 1-D integer numpy array of length ``len(self.vocab)`` whose
            entry ``k`` is the number of times the word mapped to index ``k``
            occurs in ``input_sent``. Words absent from ``self.word_to_index``
            (out-of-vocabulary words) are ignored.

        Example:
            With a vocabulary of 10 words, the result has shape ``(10,)``;
            ``create_vector`` stacks these rows into the full matrix.
        """
        counts = np.zeros(len(self.vocab), dtype=int)
        for token in input_sent:
            # One dict lookup both tests membership and fetches the column.
            column = self.word_to_index.get(token)
            if column is not None:
                counts[column] += 1
        return counts

    def create_vector(self) -> np.ndarray:
        """Method iterates over the whole dataset and creates the bag of words vector of shape:  len(self.data) x len(self.vocab)"""
        #Do not modify this function
        self._build_vocab()
        self.create_index_map()
        vector = []
        for sent in self.data:
            sent_vector = self.bow(sent)
            vector.append(sent_vector)
        vector = np.array(vector)
        return vector


# Do not modify the code below this line. If it is modified, you will be given a 0 straight away.
class testApproach:
    """Test harness comparing CustomBagOfWords with sklearn's CountVectorizer.

    A small fixed corpus is vectorized by both implementations and the two
    resulting count matrices are compared.
    """

    def __init__(self):
        # Raw sentences are what CountVectorizer receives in testBOW.
        self.original_corpus = ["this is a document", "this is a processed document", "is this a document", "This is not a document"]
        # Lower-cased, whitespace-tokenized sentences are what CustomBagOfWords receives.
        self.corpus = self.process(self.original_corpus)
        self.sklearn_bow = CountVectorizer()
        self.custom_bow = CustomBagOfWords(self.corpus)

    def process(self, corpus):
        """Lower-case every string in ``corpus`` and split it on whitespace.

        Returns a list with one list of tokens per input string.
        """
        corpus = [x.lower() for x in corpus]
        return [x.split() for x in corpus]
    
    def testBOW(self):
        """Compare both vectorizers' outputs and print a pass/fail report.

        The report line includes the module-level ``ID`` and ``NAME``.
        """
        # fit_transform's result is converted to a dense array for comparison.
        sklearn_output = self.sklearn_bow.fit_transform(self.original_corpus).toarray()
        custom_output = self.custom_bow.create_vector()
        
        try:
            is_correct = np.allclose(sklearn_output, custom_output) 
        # NOTE(review): the bare except treats any exception raised by the
        # comparison as a failed test rather than letting it propagate.
        except:
            is_correct = False
        if is_correct is True:
            print(f"ID : {ID} | Name : {NAME} | ALL test cases passed.")
            print("===="*20)
        else:
            print(f"ID : {ID} | Name : {NAME} | ALL test cases Failed.")
            print("===="*20)
    
# Run the comparison test only when this file is executed as a script.
if __name__ == "__main__":
    tester = testApproach()
    tester.testBOW()

ID : 1191609 | Name : Sama Samrin | ALL test cases passed.
