In [None]:
import numpy as np
import math
import string
import pandas as pd
from collections import Counter

# Sample corpus: each string is treated as one document.
documents = [
    "Virat Kohli plays a stunning cover drive for four runs.",
    "Rohit Sharma pulls it over midwicket, that's a huge six!",
    "Jasprit Bumrah bowls a perfect yorker at 145 km/h.",
    "What a brilliant spell by Bumrah, three wickets already!",
    "MS Dhoni finishes in style with a helicopter shot for six runs.",
    "The team needs 10 runs off the last over, it's a thriller!",
    "A fantastic diving catch at the boundary saves crucial runs.",
    "The crowd erupts as the batsman smashes another six!"
]

# Lowercase words that are discarded during tokenization.
stopwords = {
    "a", "an", "the", "and", "or", "for", "with", "to", "by", "in", "on",
    "at", "it's", "is", "as", "of", "this", "that",
}


def tokenize(doc):
    """Split *doc* on whitespace and return its normalized, filtered tokens.

    Each whitespace-separated word has leading/trailing punctuation removed
    and is lowercased. The stopword test is applied to this normalized form,
    so a stopword with attached punctuation (e.g. "The,") is filtered too.
    Words that consist only of punctuation normalize to an empty string and
    are dropped.
    """
    tokens = []
    for raw_word in doc.split():
        token = raw_word.strip(string.punctuation).lower()
        # Skip empty leftovers (pure punctuation) and stopwords.
        if token and token not in stopwords:
            tokens.append(token)
    return tokens


# One token list per entry of `documents`, in the same order.
tokenized_docs = [tokenize(doc) for doc in documents]


def compute_tf(doc):
    """Return a mapping of each token in *doc* to its term frequency.

    The term frequency is the number of times the token occurs in *doc*
    divided by the total number of tokens in *doc*. An empty *doc* yields
    an empty mapping.
    """
    doc_length = len(doc)
    frequencies = {}
    for term, occurrences in Counter(doc).items():
        frequencies[term] = occurrences / doc_length
    return frequencies


def compute_idf(docs):
    """Return a mapping of every word in *docs* to its inverse document frequency.

    *docs* is a sequence of tokenized documents (each an iterable of
    hashable tokens). For a word that occurs in ``df`` of the ``N``
    documents the score is ``log(N / (1 + df))``.

    Because of the ``1 +`` in the denominator, a word present in ``N - 1``
    documents scores 0 and a word present in every document scores below 0.
    An empty *docs* yields an empty mapping.
    """
    num_docs = len(docs)

    # Document frequency in a single pass: converting each document to a set
    # counts a word once per document regardless of repetitions, and avoids
    # rescanning every document for every vocabulary word.
    doc_freq = Counter()
    for doc in docs:
        doc_freq.update(set(doc))

    return {
        word: math.log(num_docs / (1 + containing_docs))
        for word, containing_docs in doc_freq.items()
    }


def compute_tfidf(docs):
    """Return one ``{word: tf * idf}`` mapping per document in *docs*.

    The IDF table is computed once over all of *docs* via ``compute_idf``;
    each document's term frequencies come from ``compute_tf``. The result
    list is in the same order as *docs*.
    """
    idf_by_word = compute_idf(docs)

    def weigh(doc):
        # Scale each term frequency of this document by the corpus-wide IDF.
        return {
            term: freq * idf_by_word[term]
            for term, freq in compute_tf(doc).items()
        }

    return [weigh(doc) for doc in docs]

# Score every tokenized document against the whole corpus.
tfidf_values = compute_tfidf(tokenized_docs)

# Build the table from the per-document mappings, then replace the missing
# entries (words a document does not contain) with 0.
df_tfidf = pd.DataFrame(tfidf_values)
df_tfidf = df_tfidf.fillna(0)
df_tfidf

Unnamed: 0,virat,kohli,plays,stunning,cover,drive,four,runs,rohit,sharma,...,diving,catch,boundary,saves,crucial,crowd,erupts,batsman,smashes,another
0,0.173287,0.173287,0.173287,0.173287,0.173287,0.173287,0.173287,0.05875,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154033,0.154033,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05875,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05875,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067143,0.0,0.0,...,0.198042,0.198042,0.198042,0.198042,0.198042,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.231049,0.231049,0.231049,0.231049,0.231049
