<a href="https://colab.research.google.com/github/SamaSamrin/NLP-TF-IDF/blob/main/NLP_Ass4_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from collections import defaultdict, Counter
import math
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from typing import List

### Enter your name and ID in the variables below. Both must be strings.
ID = "1191609"
NAME = "Sama Samrin"

In [2]:
class CustomTFIDF:
    """TF-IDF vectorizer over a corpus that is already split into words.

    Typical use is ``CustomTFIDF(data).create_vector()``, which builds the
    vocabulary, the word -> column index map and the IDF table, and then
    returns one normalized TF-IDF row per document in ``data``.
    """

    def __init__(self, data: List[List[str]]):
        #data is a list of lists which consists of words. For example = [["it", "consist", "of", "words"]]
        self.data = data
        #Map to store words to indexes in the vocab.
        self.word_to_index = {}
        #Map to store inverse document frequency for each unique word in vocab.
        self.idfs_ = {}
        #Vocab stores all the unique words in the dataset.
        self.vocab = set()

    def _build_vocab(self):
        """Build the vocabulary: the alphabetically sorted unique words of the corpus.

        Only words whose length is >= 2 are kept; single-character words
        (such as "a") are dropped. The result is stored in ``self.vocab``
        as a sorted list.
        """
        words = set()
        for doc in self.data:
            for token in doc:
                # A token that still contains spaces contributes its
                # space-separated parts; for clean single-word tokens the
                # split is a no-op.
                words.update(
                    piece for piece in token.split(" ") if len(piece) >= 2
                )

        self.vocab = words
        #Implement your logic above this line
        #Do not modify the below line
        self.vocab = sorted(list(self.vocab))

    def create_index_map(self):
        """Map every vocabulary word to its position in ``self.vocab``.

        The mapping is stored in ``self.word_to_index`` and gives the column
        that each word occupies in the vectors produced by ``tfidf``.
        """
        self.word_to_index = {
            word: index for index, word in enumerate(self.vocab)
        }

    def calculate_idfs(self):
        """Compute the smoothed inverse document frequency of every vocabulary word.

        Formula:
            idf = 1.0 + math.log((1 + number of documents)
                                 / (1 + number of documents containing the word))

        The "+1" terms in the numerator and denominator prevent a
        ZeroDivisionError for words that occur in no document.
        The result is stored in ``self.idfs_`` keyed by word, in vocabulary order.
        Reference: http://www.tfidf.com/
        """
        n_docs = len(self.data)

        # Document frequency: each word counts once per document, no matter
        # how often it is repeated inside that document.
        doc_freq = Counter()
        for doc in self.data:
            doc_freq.update(set(doc))

        self.idfs_ = {
            word: math.log((n_docs + 1) / (1 + doc_freq[word])) + 1.0
            for word in self.vocab
        }

    def tfidf(self, input_sent: List[str]) -> np.ndarray:
        """Return the (un-normalized) TF-IDF vector of one tokenized sentence.

        Args:
            input_sent: the sentence as a list of words,
                e.g. ``["is", "this", "a", "sent"]``.

        Returns:
            A 1-D float numpy array of length ``len(self.vocab)``. The entry
            at ``self.word_to_index[word]`` is ``TF(word) * IDF(word)``;
            entries for vocabulary words absent from the sentence are 0.
            Words that are not in the vocabulary are ignored.

        Formula:
            TF(t) = (Number of times word t appears in the sentence)
                    / (Total number of words in the sentence).
            IDF is read from ``self.idfs_`` (see ``calculate_idfs``).
        """
        vector = np.zeros(len(self.vocab), dtype=float)
        n_tokens = len(input_sent)

        # Count every distinct word once instead of re-counting it for each
        # of its occurrences.
        for word, count in Counter(input_sent).items():
            index = self.word_to_index.get(word)
            if index is None:
                # Out-of-vocabulary word: it has no column in the vector.
                continue
            tf = count / n_tokens
            vector[index] = tf * self.idfs_[word]

        return vector

    def create_vector(self):
        """Run the whole pipeline and return the normalized TF-IDF matrix.

        Builds the vocabulary, the index map and the IDF table, computes one
        TF-IDF row per document in ``self.data`` and passes the stacked rows
        through sklearn's ``normalize`` (called with its default settings).
        """
        #Do not modify this function
        #hint: if you understand this function, you will understand what steps you will need to implement first.
        self._build_vocab()
        self.create_index_map()
        self.calculate_idfs()
        vector = []
        for sent in self.data:
            sent_vector = self.tfidf(sent)
            vector.append(sent_vector)
        vector = np.array(vector)
        vector = normalize(vector)
        return vector

In [3]:
#Do not modify the code below this line. If it is modified, you will be given a 0 straight away.
class testApproach:
    """Check CustomTFIDF against scikit-learn's TF-IDF on a small fixed corpus."""

    def __init__(self):
        # Raw sentences; sklearn is fitted on these strings directly.
        self.original_corpus = ["this is a document", "this is a processed document", "is this a document", "This is not a document"]
        # Lower-cased, whitespace-tokenized copy that is fed to CustomTFIDF.
        self.corpus = self.process(self.original_corpus)
        self.sklearn_bow = CountVectorizer()
        self.sklearn_transformer = TfidfTransformer()
        self.custom_tfidf = CustomTFIDF(self.corpus)

    def process(self, corpus):
        """Lower-case every sentence and split it on whitespace into a list of words."""
        corpus = [x.lower() for x in corpus]
        return [x.split() for x in corpus]
    
    def testTFIDF(self):
        """Print both TF-IDF matrices and report whether they agree.

        The reference matrix comes from CountVectorizer followed by
        TfidfTransformer; the candidate matrix comes from
        ``CustomTFIDF.create_vector``. The two are compared with
        ``np.allclose`` and a pass/fail line containing ``ID`` and ``NAME``
        is printed.
        """
        sklearn_output = self.sklearn_bow.fit_transform(self.original_corpus)
        sklearn_output = self.sklearn_transformer.fit_transform(sklearn_output).toarray()
        print("Sklearn Output = \n", sklearn_output, "\n")
        custom_output = self.custom_tfidf.create_vector()
        print("Our Output = \n", custom_output, "\n")
        try:
            is_correct = np.allclose(sklearn_output, custom_output) 
        except:
            # Any exception raised by the comparison is treated as a failed test.
            is_correct = False
        if is_correct is True:
            print(f"ID : {ID} | Name : {NAME} | ALL test cases passed.")
            print("===="*20)
        else:
            print(f"ID : {ID} | Name : {NAME} | ALL test cases Failed.")
            print("===="*20)

# Entry point: build the test harness and run the sklearn-vs-custom comparison.
if __name__ == "__main__":
    tester = testApproach()
    tester.testTFIDF()

Sklearn Output = 
 [[0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.         0.74187006 0.38713857]
 [0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.74187006 0.         0.38713857]] 

Our Output = 
 [[0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.         0.74187006 0.38713857]
 [0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.74187006 0.         0.38713857]] 

ID : 1191609 | Name : Sama Samrin | ALL test cases passed.
