<a href="https://colab.research.google.com/github/Rumaizakosar/Eight-Puzzle-Code/blob/main/TFIDF_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
from collections import defaultdict, Counter
import math
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from typing import List

### Enter your Name ID in below variables. They must be strings.
ID = "1191609"
NAME = "Sama Samrin"

In [9]:
class CustomTFIDF:
    def __init__(self, data: List[List[str]]):
        #data is a list of lists which consists of words. For example = [["it", "consist", "of", "words"]]
        self.data = data
        #Map to store words to indexes in the vocab.
        self.word_to_index = {}
        #Map to store inverse document frequency for each unique word in vocab.
        self.idfs_ = {}
        #Vocab stores all the unique words in the dataset.
        self.vocab = set()

    def _build_vocab(self):
        """Method to build vocabulary. Vocabulary is a list of unique words in the dataset and it is alphabetically sorted.
        Hint: Use the self.vocab to store unique words
        Note: Include only words whose length is >= 2
        """
        corpus = self.data
        words = set()
        #print("corpus : ", corpus)

        for i in range(len(corpus)):
          #print("i: ", i)
          for j in range(len(corpus[i])):
            x = corpus[i][j-1].split(" ")
            for k in range(len(x)):
              if x[k] not in words and len(x[k])>=2:
                words.add(x[k])

        #print(words)
        self.vocab = words
        #Implement your logic above this line
        #Do not modify the below line
        self.vocab = sorted(list(self.vocab))

    def create_index_map(self):
        """Method to map every word to its index in the vocabulary
        Hint: Iterate over the vocabulary and store them in word_to_index
        """
        #Implement the logic below this line
        words = self.vocab
        word_to_index = dict.fromkeys(words)
        count = 0

        for i in words:
            word_to_index[i] = count
            count += 1

        self.word_to_index = word_to_index

    def calculate_idfs(self):
        """Method to calculate inverse document frequency. Use the below formula to calculate inverse document frequency
        **for assignment 4, use this formula to calculate idfs (the previous one had the wrong brackets.) :**
        idf = 1.0 + math.log[(1+length of data) / (1+number of documents with the term)]**
        **The formula in the code file is missing a bracket. This formula will ensure you won't run into ZeroDivisionErrors.**
        Reference: http://www.tfidf.com/
        Hint: Iterate over the vocab and check if word occurs in sentence. Count the occurence and store them in self.idfs_ dictionary.
        """

        #implement your logic below this line
        words = self.vocab
        corpus = self.data
        self.idfs_= dict.fromkeys(words, 0)
        #print("idfs map:", self.idfs_)
        #total_num_of_terms_in_doc = len(words)

        #calculating total number of words in the entire corpus
        total_num_of_terms_in_doc = 0
        for i in range(len(corpus)):
          total_num_of_terms_in_doc += len(corpus[i])
        print("Total number of words in the entire corpus =", total_num_of_terms_in_doc)
        #i = 0

        # IDF = how many documents have the word
        number_of_docs_that_contain_the_word = dict.fromkeys(words,0)

        for word in words:
          number_of_docs_with_the_word = 0
          for doc in corpus:
            if word in doc:
              #print(word, "is in", doc)
              number_of_docs_that_contain_the_word[word] += 1
            self.idfs_[word] = math.log(((len(corpus)+1)/(1+number_of_docs_that_contain_the_word[word]))) + 1.0

        #counting idfs

        # total_number_of_documents = len(corpus)
        # for word in words:
        #   #print("word=", word)
        #   print("total_number_of_documents =", total_number_of_documents, "number_of_docs_that_contain_the_word = ", number_of_docs_that_contain_the_word[word])
        #   log_val = math.log(((total_number_of_documents+1)/(1+number_of_docs_that_contain_the_word[word])), 2)
        #   print("log value =", log_val, "for word=", word)
        #   idf_ = 1.0 + log_val
        #   print("idf of", word,"is", idf_)
        #   self.idfs_[word] = idf_

        # for word in words:
        #   for i in range(len(corpus)):
        #     #print("i=", i, ", corpus[i] length = ", len(corpus[i]), ", values = ", corpus[i])
        #     freq = corpus[i].count(word)
        #     #print("*", word, "* appears", freq, "times in", corpus[i])
        #     self.idfs_[word] += freq
        #     #print("frequency of ", word, "in idfs :", idfs[word])
        #   #print("frequency of ", word, "in idfs :", self.idfs_[word])
        #     #for j in range(len(corpus[i])):
        #       #print("i=", i, "j=", j, "corpus[i][j] length = ", len(corpus[i][j]))
        #       #if word == corpus[i][j]:

        print("idfs map:", self.idfs_)

    def tfidf(self, input_sent: List[str]) -> np.ndarray:
        """Method which accepts input_sent of the form : ["is", "this", "a", "sent"].
        Task:
            Implement the tfidf approach for the above input.
            This function should return a vector of counts for the given inputs.
            The output shape of the vector should be 1 x len(self.vocab)
            The output should be a numpy array
            Example:
                length of vocabulary = 10
                input_sent = ["is", "this", "a", "sent"]
                The method should return a vector of shape 1 x 10.
            Formula:
                TF(t) = (Number of times word t appears in a document) / (Total number of words in the document).
                IDF will already be calculated and stored in self.idfs_ variable.
        """

        #Implement your logic below this line
        #input_sent = ["is", "this", "a", "sent"]
        words = self.vocab
        vector_shape = (1, len(words))
        #print("vector shape =", vector_shape)
        vector = np.zeros(vector_shape, dtype=float)
        #print(vector)

        for word in input_sent:
          #print("word =", word)
          if word not in words:
            input_index = np.where(input_sent == word)
            #print("OOV error with the word: *", word)
          elif word in self.word_to_index:
            #print("idfs[word] for", word, "is", idfs[word])
            #tf = self.idfs_[word]/len(input_sent)
            count = input_sent.count(word)
            tf = count/len(input_sent)
            #print("Term Frequency for", word, "is", tf)
            #input_index = np.where(input_sent == word)
            #print(i, "found at input index =", input_index)
            indx = self.word_to_index[word] #getting index of i (word of input_sent) from our index map
            #print("the word is mapped at indx =", indx)
            #print("vector shape = ", vector.shape)
            #print("inside if", "vector[0][indx] =", vector[0][indx])
            vector[0][indx] = tf * self.idfs_[word]
            #print("count=", count, "TF=", tf, "IDF=", self.idfs_[word], "vector value=", vector[0][indx])

        print(vector)
        return vector[0]

    def create_vector(self):
        #Do not modify this function
        #hint: if you understand this function, you will understand what steps you will need to implement first.
        self._build_vocab()
        self.create_index_map()
        self.calculate_idfs()
        vector = []
        for sent in self.data:
            sent_vector = self.tfidf(sent)
            vector.append(sent_vector)
        vector = np.array(vector)
        vector = normalize(vector)
        return vector

**My Attempts**

In [10]:
corpus = [["this", "is", "a", "document"], ["this", "is", "a", "processed", "document"], ["is", "this", "a", "document"], ["this", "is", "not", "a", "document"]]


**Building vocab**

In [11]:
#building vocab
words = set()
        #print("corpus : ", corpus)

for i in range(len(corpus)):
  #print("i: ", i)
  for j in range(len(corpus[i])):
    x = corpus[i][j-1].split(" ")
    for k in range(len(x)):
      if x[k] not in words and len(x[k])>=2:
        words.add(x[k])
print(words)


{'not', 'document', 'is', 'processed', 'this'}


**Word to index**

In [12]:
#word to index
word_to_index = dict.fromkeys(words)
count = 0

for i in words:
    word_to_index[i] = count
    count += 1

print(word_to_index)

{'not': 0, 'document': 1, 'is': 2, 'processed': 3, 'this': 4}


**IDF calculation**

In [13]:
idfs = dict.fromkeys(words, 0)
print("idfs map:", idfs)
total_num_of_terms_in_doc = len(words)

idfs map: {'not': 0, 'document': 0, 'is': 0, 'processed': 0, 'this': 0}


In [14]:
# IDF = how many documents have the word

number_of_docs_that_contain_the_word = dict.fromkeys(words,0)

for word in words:
  number_of_docs_with_the_word = 0
  for doc in corpus:
    if word in doc:
     #print(word, "is in", doc)
     number_of_docs_that_contain_the_word[word] += 1

print(number_of_docs_that_contain_the_word)

{'not': 1, 'document': 4, 'is': 4, 'processed': 1, 'this': 4}


In [15]:
#counting idfs

total_number_of_documents = len(corpus)
for word in words:
  #print("word=", word)
  print("total_number_of_documents =", total_number_of_documents, "number_of_docs_that_contain_the_word = ", number_of_docs_that_contain_the_word[word])
  log_val = math.log(((total_number_of_documents)/(1+number_of_docs_that_contain_the_word[word])), 10)
  print("log value =", log_val, "for word=", word)
  idf_ = 1.0 + log_val
  print("idf of", word,"is", idf_)
  idfs[word] = idf_

print(idfs)

total_number_of_documents = 4 number_of_docs_that_contain_the_word =  1
log value = 0.30102999566398114 for word= not
idf of not is 1.3010299956639813
total_number_of_documents = 4 number_of_docs_that_contain_the_word =  4
log value = -0.09691001300805638 for word= document
idf of document is 0.9030899869919436
total_number_of_documents = 4 number_of_docs_that_contain_the_word =  4
log value = -0.09691001300805638 for word= is
idf of is is 0.9030899869919436
total_number_of_documents = 4 number_of_docs_that_contain_the_word =  1
log value = 0.30102999566398114 for word= processed
idf of processed is 1.3010299956639813
total_number_of_documents = 4 number_of_docs_that_contain_the_word =  4
log value = -0.09691001300805638 for word= this
idf of this is 0.9030899869919436
{'not': 1.3010299956639813, 'document': 0.9030899869919436, 'is': 0.9030899869919436, 'processed': 1.3010299956639813, 'this': 0.9030899869919436}


In [16]:
for word in words:
  occurrence_of_word_in_whole_doc = 0
  for i in range(len(corpus)):
    #print("i=", i, ", corpus[i] length = ", len(corpus[i]), ", values = ", corpus[i])
    freq = corpus[i].count(word)
    #print("*", word, "* appears", freq, "times in", corpus[i])
    occurrence_of_word_in_whole_doc += freq
    #print("frequency of ", word, "in idfs :", idfs[word])
  print("occurrence of ", word, "in entire doc :", occurrence_of_word_in_whole_doc)
    #for j in range(len(corpus[i])):
      #print("i=", i, "j=", j, "corpus[i][j] length = ", len(corpus[i][j]))
      #if word == corpus[i][j]:

print(idfs)

occurrence of  not in entire doc : 1
occurrence of  document in entire doc : 4
occurrence of  is in entire doc : 4
occurrence of  processed in entire doc : 1
occurrence of  this in entire doc : 4
{'not': 1.3010299956639813, 'document': 0.9030899869919436, 'is': 0.9030899869919436, 'processed': 1.3010299956639813, 'this': 0.9030899869919436}


In [18]:
import math

# Sample corpus
corpus = [
    "this is a sample text",
    "another sample text for TF-IDF",
    "text mining and information retrieval",
]

# Tokenize and get the unique words
words = set(word for doc in corpus for word in doc.split())

# Initialize IDFs dictionary
idfs = dict.fromkeys(words, 0)

# Calculate IDF for each word
for word in words:
    for doc in corpus:
        if word in doc.split():
            idfs[word] += 1

total_docs = len(corpus)
for word in idfs:
    idfs[word] = 1.0 + math.log((1 + total_docs) / (1 + idfs[word]))

print("IDFs:", idfs)


IDFs: {'for': 1.6931471805599454, 'text': 1.0, 'another': 1.6931471805599454, 'retrieval': 1.6931471805599454, 'information': 1.6931471805599454, 'is': 1.6931471805599454, 'and': 1.6931471805599454, 'sample': 1.2876820724517808, 'a': 1.6931471805599454, 'mining': 1.6931471805599454, 'this': 1.6931471805599454, 'TF-IDF': 1.6931471805599454}


**TF-IDF calculation**

In [19]:
"""Method which accepts input_sent of the form : ["is", "this", "a", "sent"].
        Task:
            Implement the tfidf approach for the above input.
            This function should return a vector of counts for the given inputs.
            The output shape of the vector should be 1 x len(self.vocab)
            The output should be a numpy array
            Example:
                length of vocabulary = 10
                input_sent = ["is", "this", "a", "sent"]
                The method should return a vector of shape 1 x 10.
            Formula:
                TF(t) = (Number of times word t appears in a document) / (Total number of words in the document).
                IDF will already be calculated and stored in self.idfs_ variable.
        """

'Method which accepts input_sent of the form : ["is", "this", "a", "sent"].\n        Task:\n            Implement the tfidf approach for the above input. \n            This function should return a vector of counts for the given inputs.\n            The output shape of the vector should be 1 x len(self.vocab)\n            The output should be a numpy array\n            Example:\n                length of vocabulary = 10\n                input_sent = ["is", "this", "a", "sent"]\n                The method should return a vector of shape 1 x 10.\n            Formula:\n                TF(t) = (Number of times word t appears in a document) / (Total number of words in the document).\n                IDF will already be calculated and stored in self.idfs_ variable.\n        '

In [20]:
input_sent = ["is", "this", "a", "sent"]

vector_shape = (1, len(words))
print("vector shape =", vector_shape)
vector = np.zeros(vector_shape, dtype=float)
print(vector)

for word in input_sent:
  print("word =", word)
  if word not in words:
    input_index = np.where(input_sent == word)
    print("OOV error with the word: *", word, "* at ", input_index[0])
  elif word in word_to_index:
    print("idfs[word] for", word, "is", idfs[word])
    # TF(t) = (Number of times word t appears in a document) / (Total number of words in the document).
    tf = input_sent.count(word) / len(input_sent)
    #tf = idfs[word]/len(input_sent)
    print("Term Frequency for", word, "is", tf)
    #input_index = np.where(input_sent == word)
    #print(i, "found at input index =", input_index)
    indx = word_to_index[word] #getting index of i (word of input_sent) from our index map
    #print("the word is mapped at indx =", indx)
    #print("vector shape = ", vector.shape)
    #print("inside if", "vector[0][indx] =", vector[0][indx])
    vector[0][indx] = tf * idfs[word]

print(vector)
#vector = vector[0]

vector shape = (1, 12)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
word = is
idfs[word] for is is 1.6931471805599454
Term Frequency for is is 0.25
word = this
idfs[word] for this is 1.6931471805599454
Term Frequency for this is 0.25
word = a
word = sent
OOV error with the word: * sent * at  []
[[0.        0.        0.4232868 0.        0.4232868 0.        0.
  0.        0.        0.        0.        0.       ]]


  input_index = np.where(input_sent == word)


**Excess**

In [21]:


# for word in words:
#   count = 0
#  #index for 1st array of 2D corpus
#   print("i=", i)
#   while i<len(corpus):
#     j = 0
#     #print("i=", i, "j=", j, "corpus[i] length = ", len(corpus[i]))
#     while j<len(corpus[i]):
#       #print(corpus[i][j])
#       if corpus[i][j] in words:

#       j+=1
#     i+=1

x = math.log(1+len(corpus))
y = 1 + x
idf = 1.0 + x/y

# idf = 1.0 + math.log[(1+length of data) / (1+number of documents with the term)]
#**The formula in the code file is missing a bracket. This formula will ensure you won't run into ZeroDivisionErrors.**
#Reference: http://www.tfidf.com/
#Hint: Iterate over the vocab and check if word occurs in sentence. Count the occurence and store them in self.idfs_ dictionary.'''

**SkLearn Output**

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import math

# Sample corpus as list of strings
corpus = ["this is a document",
          "this is a processed document",
          "is this a document",
          "this is not a document"]

# Custom TF-IDF implementation
class CustomTFIDF:
    def __init__(self, corpus):
        self.corpus = [doc.split() for doc in corpus]
        self.words = set(word for doc in self.corpus for word in doc)
        self.idfs = self.calculate_idfs()

    def calculate_idfs(self):
        idfs = dict.fromkeys(self.words, 0)
        total_docs = len(self.corpus)
        for word in self.words:
            doc_count = sum(1 for doc in self.corpus if word in doc)
            idfs[word] = 1.0 + math.log((1 + total_docs) / (1 + doc_count))
        return idfs

custom_tfidf = CustomTFIDF(corpus)
print("Custom IDFs:", custom_tfidf.idfs)

# sklearn implementation
vectorizer = CountVectorizer()
sklearn_counts = vectorizer.fit_transform(corpus)
tfidf_transformer = TfidfTransformer()
sklearn_tfidf = tfidf_transformer.fit_transform(sklearn_counts).toarray()

print("sklearn TF-IDF:\n", sklearn_tfidf)


Custom IDFs: {'not': 1.916290731874155, 'document': 1.0, 'is': 1.0, 'processed': 1.916290731874155, 'a': 1.0, 'this': 1.0}
sklearn TF-IDF:
 [[0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.         0.74187006 0.38713857]
 [0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.74187006 0.         0.38713857]]


In [25]:
import math
import numpy as np

class CustomTFIDF:
    def __init__(self, corpus):
        # Convert list of lists to list of strings if needed
        if isinstance(corpus[0], list):
            self.corpus = [' '.join(doc) for doc in corpus]
        else:
            self.corpus = corpus

        self.words = set(word for doc in self.corpus for word in doc.split())
        self.idfs = self.calculate_idfs()

    def calculate_idfs(self):
        idfs = dict.fromkeys(self.words, 0)
        total_docs = len(self.corpus)
        for word in self.words:
            doc_count = sum(1 for doc in self.corpus if word in doc.split())
            idfs[word] = 1.0 + math.log((1 + total_docs) / (1 + doc_count))
        return idfs

    def create_vector(self):
        tfidf_vectors = []
        for doc in self.corpus:
            tfidf_vector = []
            word_list = doc.split()
            word_count = len(word_list)
            for word in self.words:
                tf = word_list.count(word) / word_count
                idf = self.idfs[word]
                tfidf_vector.append(tf * idf)
            tfidf_vectors.append(tfidf_vector)
        return np.array(tfidf_vectors)

# Sample corpus
corpus = [["this", "is", "a", "document"],
          ["this", "is", "a", "processed", "document"],
          ["is", "this", "a", "document"],
          ["this", "is", "not", "a", "document"]]

custom_tfidf = CustomTFIDF(corpus)
custom_output = custom_tfidf.create_vector()
print("Custom TF-IDF Output:\n", custom_output)


Custom TF-IDF Output:
 [[0.         0.25       0.25       0.         0.25       0.25      ]
 [0.         0.2        0.2        0.38325815 0.2        0.2       ]
 [0.         0.25       0.25       0.         0.25       0.25      ]
 [0.38325815 0.2        0.2        0.         0.2        0.2       ]]


**Main Test**

In [26]:
#Do not modify code below this. If this modified you will be given straight away 0.
class testApproach:
    def __init__(self):
        self.original_corpus = ["this is a document", "this is a processed document", "is this a document", "This is not a document"]
        self.corpus = self.process(self.original_corpus)
        self.sklearn_bow = CountVectorizer()
        self.sklearn_transformer = TfidfTransformer()
        self.custom_tfidf = CustomTFIDF(self.corpus)

    def process(self, corpus):
        corpus = [x.lower() for x in corpus]
        return [x.split() for x in corpus]

    def testTFIDF(self):
        sklearn_output = self.sklearn_bow.fit_transform(self.original_corpus)
        sklearn_output = self.sklearn_transformer.fit_transform(sklearn_output).toarray()
        print("Sklearn Output = \n", sklearn_output, "\n")
        custom_output = self.custom_tfidf.create_vector()
        print("Our Output = \n", custom_output, "\n")
        try:
            is_correct = np.allclose(sklearn_output, custom_output)
        except:
            is_correct = False
        if is_correct is True:
            print(f"ID : {ID} | Name : {NAME} | ALL test cases passed.")
            print("===="*20)
        else:
            print(f"ID : {ID} | Name : {NAME} | ALL test cases Failed.")
            print("===="*20)

if __name__ == "__main__":
    tester = testApproach()
    tester.testTFIDF()

Sklearn Output = 
 [[0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.         0.74187006 0.38713857]
 [0.57735027 0.57735027 0.         0.         0.57735027]
 [0.38713857 0.38713857 0.74187006 0.         0.38713857]] 

Our Output = 
 [[0.         0.25       0.25       0.         0.25       0.25      ]
 [0.         0.2        0.2        0.38325815 0.2        0.2       ]
 [0.         0.25       0.25       0.         0.25       0.25      ]
 [0.38325815 0.2        0.2        0.         0.2        0.2       ]] 

ID : 1191609 | Name : Sama Samrin | ALL test cases Failed.
