In [18]:
class BoW:
    def __init__(self):
        self.vocabulary = {} # a dictionary with format {words:word_position}
        self.vocab_size = 0

    def __repr__(self):
            import pandas as pd
            columns = [word for word in self.vocabulary]
            df = pd.DataFrame(self.transform(documents), columns=columns)
            return df.to_string(index=False)
    
    def fit(self, documents):
        """
        Fit the BoW model on a list of documents.

        Args:
        documents (list): A list of text documents.

        Returns:
        None
        """
        # Your code here: Build the vocabulary from the documents
        # iterate through documents, then through words (remember the function we used last course)
        # if the word is not part of the vocabulary, add it to the dictionary and update the vocabulary size
        # do not return anything
        for doc in documents:
            filtered_words = [word for word in doc.split() if word.isalnum()]
            for word in filtered_words:
                if word not in self.vocabulary:
                    self.vocabulary[word] = self.vocab_size
                    self.vocab_size += 1
         
            

    def transform(self, documents):
        """
        Transform a list of documents into BoW representations.

        Args:
        documents (list): A list of text documents.

        Returns:
        bow_matrix (list): A list of BoW representations for each document.
        """
        bow_matrix = []

        # Your code here: Transform the documents into BoW representations
        # for all documents, initialize an array bow_vector of zeros of the size of the vocabulary
        # iterate through every word in the document
        # if the word is in the vocabulary, increment bow_vector by 1 at the position that matches the word in the vocabulary
        # add the vector to bow_matrix
        
        for doc in documents:
            bow_vector = [0] * self.vocab_size
            filtered_words = [word for word in doc.split() if word.isalnum()]
            for word in filtered_words:
                if word in self.vocabulary:
                    bow_vector[self.vocabulary[word]] += 1
            bow_matrix.append(bow_vector)
            

        return bow_matrix  

# Example usage:
documents = [
             "this is the first document", 
             "this document is the second document", 
             "and this is the third one",
             ]

bow_model = BoW()
bow_model.fit(documents)
bow_matrix = bow_model.transform(documents)
print(bow_model)


 this  is  the  first  document  second  and  third  one
    1   1    1      1         1       0    0      0    0
    1   1    1      0         2       1    0      0    0
    1   1    1      0         0       0    1      1    1


In [24]:
import math
import pandas as pd

class TFIDF:
    def __init__(self):
        # Initialize empty variables to store the vocabulary and document frequency
        self.vocabulary = {}
        self.doc_freq = {}
        self.total_docs = 0

    def __repr__(self):
        # Create a DataFrame to represent the TF-IDF matrix with column names
        columns = [word for word in self.vocabulary]
        df = pd.DataFrame(self.transform(documents), columns=columns)
        return df.to_string(index=False)

    def fit(self, documents):
        """
        Fit the TF-IDF model on a list of documents.

        Args:
        documents (list): A list of text documents.

        Returns:
        None
        """
        # Step 1: Build the vocabulary and document frequency
        # for each document
        # get a list of unique words
        # iterate through all words, if a word is in the vocabulary, increment its frequency 
        # otherwise add it to the vocabulary and then increment its frequency
        # do not return anything
        for doc in documents:
            filtered_words = [word for word in doc.split() if word.isalnum()]
            for word in filtered_words:
                if word in self.vocabulary:
                    self.doc_freq[word] += 1
                else:
                    self.vocabulary[word] = len(self.vocabulary)
                    self.doc_freq[word] = 1
            self.total_docs += 1


    def transform(self, documents):
        """
        Transform a list of documents into TF-IDF representations.

        Args:
        documents (list): A list of text documents.

        Returns:
        tfidf_matrix (list): A list of TF-IDF representations for each document.
        """
        # Step 2: Initialize an empty list for tfidf_matrix
        tfidf_matrix = []

        # Your code here: Transform the documents into TF-IDF representations
        # for all docs
        # initialize the tfidf vector
        # create an empty word frequency dictionary
        # fill the tfidf vector, similar to bag of words
            # for all words and frequencies 
            # compute the tf and idf values and multiply them
            # append the tfidf vector to the matrix

        tfidf_matrix = []

        # Create a vocabulary list from the vocabulary dictionary
        vocabulary_list = list(self.vocabulary.keys())

        for doc in documents:
            tfidf_vector = [0] * len(self.vocabulary)
            word_freq = {}
            filtered_words = [word for word in doc.split() if word.isalnum()]
            for word in filtered_words:
                if word in word_freq:
                    word_freq[word] += 1
                else:
                    word_freq[word] = 1
            for word, freq in word_freq.items():
                if word in self.vocabulary:
                    tf = freq / len(filtered_words)
                    idf = math.log(self.total_docs / self.doc_freq[word])
                    tfidf_vector[self.vocabulary[word]] = tf * idf
            tfidf_matrix.append(tfidf_vector)
        return tfidf_matrix

# Example usage:
documents = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
]

tfidf_model = TFIDF()
tfidf_model.fit(documents)
tfidf_matrix = tfidf_model.transform(documents)
print(tfidf_model)

 this  is  the    first  document   second      and    third      one
  0.0 0.0  0.0 0.219722       0.0 0.000000 0.000000 0.000000 0.000000
  0.0 0.0  0.0 0.000000       0.0 0.183102 0.000000 0.000000 0.000000
  0.0 0.0  0.0 0.000000       0.0 0.000000 0.183102 0.183102 0.183102
