In [5]:
class BoW:
    def __init__(self):
        # Initialize an empty vocabulary
        self.vocabulary = {}
        self.vocab_size = 0

    def __repr__(self):
            # Create a DataFrame to represent the BoW matrix with column names
            import pandas as pd
            columns = [word for word in self.vocabulary]
            df = pd.DataFrame(self.transform(documents), columns=columns)
            return df.to_string(index=False)
    
    def fit(self, documents):
        """
        Fit the BoW model on a list of documents.

        Args:
        documents (list): A list of text documents.

        Returns:
        None
        """
        # Step 1: Initialize an empty vocabulary here
        # Your code here: Build the vocabulary from the documents
        for doc in documents:
            for word in doc.split():
                if word not in self.vocabulary:
                    self.vocabulary[word] = self.vocab_size
                    self.vocab_size += 1
            

    def transform(self, documents):
        """
        Transform a list of documents into BoW representations.

        Args:
        documents (list): A list of text documents.

        Returns:
        bow_matrix (list): A list of BoW representations for each document.
        """
        # Step 2: Initialize an empty list for bow_matrix here
        bow_matrix = []

        # Your code here: Transform the documents into BoW representations
        for doc in documents:
            bow_vector = [0] * self.vocab_size
            for word in doc.split():
                if word in self.vocabulary:
                    bow_vector[self.vocabulary[word]] += 1
            bow_matrix.append(bow_vector)

        return bow_matrix  # Step 3: Return the BoW matrix

# Example usage:
documents = [
             "this is the first document", 
             "this document is the second document", 
             "and this is the third one",
             ]

bow_model = BoW()
bow_model.fit(documents)
bow_matrix = bow_model.transform(documents)
print(bow_model)


 this  is  the  first  document  second  and  third  one
    1   1    1      1         1       0    0      0    0
    1   1    1      0         2       1    0      0    0
    1   1    1      0         0       0    1      1    1


In [66]:
import math
import pandas as pd

class TFIDF:
    def __init__(self):
        # Initialize empty variables to store the vocabulary and document frequency
        self.vocabulary = {}
        self.doc_freq = {}
        self.total_docs = 0
        self.documents = []  # Store the documents

    def __repr__(self):
        # Create a DataFrame to represent the TF-IDF matrix with column names
        pd.options.display.float_format = '{:.6f}'.format  # Set the float format
        columns = [word for word in self.vocabulary]
        df = pd.DataFrame(self.transform(self.documents), columns=columns)
        return df.to_string(index=False)

    def fit(self, documents):
        """
        Fit the TF-IDF model on a list of documents.

        Args:
        documents (list): A list of text documents.

        Returns:
        None
        """
        self.documents = documents  # Store the documents
        self.total_docs = len(documents)
        # Step 1: Build the vocabulary and document frequency
        for doc in documents:
            words = set(doc.split())
            for word in words:
                if word in self.vocabulary:
                    self.doc_freq[word] += 1
                else:
                    self.vocabulary[word] = len(self.vocabulary)
                    self.doc_freq[word] = 1

    def transform(self, documents):
        """
        Transform a list of documents into TF-IDF representations.

        Args:
        documents (list): A list of text documents.

        Returns:
        tfidf_matrix (list): A list of TF-IDF representations for each document.
        """
        # Step 2: Initialize an empty list for tfidf_matrix
        tfidf_matrix = []

        # Your code here: Transform the documents into TF-IDF representations
        for doc in documents:
            tfidf_vector = [0.0] * len(self.vocabulary)
            word_freq = {}
            words = doc.split()
            for word in words:
                if word in word_freq:
                    word_freq[word] += 1
                else:
                    word_freq[word] = 1

            for word, freq in word_freq.items():
                if word in self.vocabulary:
                    tf = freq / len(words)
                    idf = math.log(self.total_docs / (1e-3 + self.doc_freq[word]))
                    tfidf_vector[self.vocabulary[word]] = tf * idf
            
            tfidf_matrix.append(tfidf_vector)

        return tfidf_matrix  # Step 3: Return the TF-IDF matrix

# Example usage:
documents = [
    "this is the first document",
    "this document is the second ",
    "and this is the third",
]

tfidf_model = TFIDF()
tfidf_model.fit(documents)
tfidf_matrix = tfidf_model.transform(documents)
print(tfidf_model)


   first      this        is       the  document   second      and    third
0.219523 -0.000067 -0.000067 -0.000067  0.080993 0.000000 0.000000 0.000000
0.000000 -0.000067 -0.000067 -0.000067  0.080993 0.219523 0.000000 0.000000
0.000000 -0.000067 -0.000067 -0.000067  0.000000 0.000000 0.219523 0.219523
