# TF-IDF from Scratch

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Toy corpus: each string is treated as one document.
text_data = [
    'Good Movie',
    'Bad Movie',
    'Good Bad Movie',
]

In [5]:
# Tokenize every document into a list of words by splitting on single spaces.
word_data = [sentence.split(' ') for sentence in text_data]

# Build the vocabulary as the sorted list of unique words across all documents.
# Sorting matters: iterating a bare set of strings can yield a different order
# from one interpreter run to the next (string hash randomization), which would
# shuffle the columns of every table built from `vocab`.
vocab = sorted({word for sentence in word_data for word in sentence})

In [6]:


# Term frequency (TF): for each document, the fraction of its tokens that are
# a given vocabulary word. One row per document, one column per vocab entry.
tf_data = [
    [tokens.count(term) / len(tokens) for term in vocab]
    for tokens in word_data
]

# Inverse document frequency (IDF): natural log of
# (number of documents / number of documents containing the term).
# A term that occurs in every document therefore gets an IDF of 0.
n_documents = len(text_data)
idf_data = []
for term in vocab:
    doc_freq = sum(1 for tokens in word_data if term in tokens)
    idf_data.append(np.log(n_documents / doc_freq))

# TF-IDF: scale each document's TF row element-wise by the per-term IDF.
tfidf_data = [
    [tf_value * idf_value for tf_value, idf_value in zip(tf_row, idf_data)]
    for tf_row in tf_data
]

# Tabulate the scores (columns follow `vocab`) next to the original sentences.
df = pd.DataFrame(tfidf_data, columns=vocab)
df['full_sent'] = text_data

# Show the resulting table.
print(df)

       Good       Bad  Movie       full_sent
0  0.202733  0.000000    0.0      Good Movie
1  0.000000  0.202733    0.0       Bad Movie
2  0.135155  0.135155    0.0  Good Bad Movie


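# Via Sklearn

Note: the values below differ from the from-scratch table because `TfidfVectorizer`, with its default settings, lowercases the text, uses raw term counts for TF, applies a smoothed IDF of `ln((1 + N) / (1 + df)) + 1`, and then L2-normalizes each row. For example, for "good movie": idf(good) = ln(4/3) + 1 ≈ 1.288 and idf(movie) = ln(4/4) + 1 = 1, and dividing by the row norm ≈ 1.630 gives 0.790 and 0.613.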

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Corpus for the scikit-learn version (already lowercase).
text_data = [
    "good movie",
    "bad movie",
    "good bad movie",
]

# Build the vectorizer and learn the vocabulary / IDF weights from the corpus.
# `fit` returns the fitted vectorizer itself, so it can be bound directly.
vec = TfidfVectorizer().fit(text_data)

# Turn the same corpus into a TF-IDF matrix and densify it for display.
tfidf_matrix = vec.transform(text_data).toarray()
feature_names = vec.get_feature_names_out()

# One row per document, one column per learned feature name.
df = pd.DataFrame(tfidf_matrix, columns=feature_names)

# Keep the original sentence beside its scores.
df['full_sent'] = text_data

# Show the resulting table.
print(df)

        bad      good     movie       full_sent
0  0.000000  0.789807  0.613356      good movie
1  0.789807  0.000000  0.613356       bad movie
2  0.619805  0.619805  0.481334  good bad movie
