###*60009220131 Sayantan Mukherjee D2-2*

In [1]:
import nltk
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import math

nltk.download('gutenberg')
shakespeare_corpus = [gutenberg.raw(file_id) for file_id in gutenberg.fileids() if 'shakespeare' in file_id.lower()]


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [6]:
nltk.download('shakespeare')
nltk.download('punkt_tab')
nltk.download('gutenberg')

[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [13]:
shakespeare_files = [file_id for file_id in gutenberg.fileids() if 'shakespeare' in file_id.lower()]
shakespeare_texts = [gutenberg.raw(file_id) for file_id in shakespeare_files]

In [14]:
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize into words
    words = word_tokenize(text)
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    filtered_words = [
        word for word in words
        if word not in stop_words and word not in string.punctuation
    ]
    return filtered_words

In [15]:
preprocessed_corpus = [preprocess(text) for text in shakespeare_texts]

In [16]:
word_counts = Counter()
for doc in preprocessed_corpus:
    word_counts.update(doc)

# Step 2: Extract Top 10 Words by Frequency
top_10_words = [word for word, _ in word_counts.most_common(10)]
print("Top 10 words By Occurence in Shakespeare context:")
print(top_10_words)

Top 10 words By Occurence in Shakespeare context:
["'d", 'haue', 'ham', 'thou', "'s", 'shall', 'lord', 'come', 'enter', 'king']


In [28]:
def compute_tf(doc):
    """
    Compute Term Frequency (TF) for a single document.
    TF(t, d) = (Number of times term t appears in document d) / (Total number of terms in document d)
    """
    total_words = len(doc)
    tf = {word: doc.count(word) / total_words for word in set(doc)}
    return tf

In [27]:
def compute_idf(corpus):
    """
    Compute Inverse Document Frequency (IDF) for the entire corpus.
    IDF(t) = log_e(Total number of documents / (1 + Number of documents containing term t))
    """
    num_docs = len(corpus)
    idf = {}
    for word in set(word for doc in corpus for word in doc):  # All unique words in the corpus
        doc_count = sum(1 for doc in corpus if word in doc)
        idf[word] = math.log(num_docs / (1 + doc_count))  # Add 1 to avoid division by zero
    return idf

In [26]:
def compute_tfidf(tf, idf):
    """
    Compute TF-IDF for a single document using precomputed TF and IDF.
    TF-IDF(t, d) = TF(t, d) * IDF(t)
    """
    tfidf = {word: tf_val * idf.get(word, 0) for word, tf_val in tf.items()}
    return tfidf

In [31]:
# Main Execution
if __name__ == "__main__":
    # Compute IDF for the entire corpus
    idf = compute_idf(preprocessed_corpus)
    print("\n--- Inverse Document Frequency (IDF) ---")
    for word, score in list(idf.items())[:10]:  # Show top 10 IDF scores
        print(f"{word}: {score:.4f}")
    print("\nTerm Frequency (TF):")
    for word, score in list(tf.items())[:10]:  # Show top 10 TF scores
      print(f"{word}: {score:.4f}")

    # Compute TF-IDF for each document
    tfidf_scores_per_doc = []
    for doc in preprocessed_corpus:
        tf = compute_tf(doc)
        tfidf = compute_tfidf(tf, idf)
        tfidf_scores_per_doc.append(tfidf)
        aggregated_tfidf = Counter()
    for tfidf in tfidf_scores_per_doc:
        aggregated_tfidf.update(tfidf)

    # Extract Top 10 Words by Aggregated TF-IDF Scores
    top_10_words = [word for word, _ in aggregated_tfidf.most_common(10)]

    # Display Results
    print("\n-----Top 10 Words by Aggregated TF-IDF Scores Across All Documents-----")
    for word in top_10_words:
        print(f"{word}: {aggregated_tfidf[word]:.4f}")


--- Inverse Document Frequency (IDF) ---
serue: -0.2877
image: 0.0000
thirtie: 0.0000
link: 0.4055
wife: -0.2877
haply: 0.4055
constrained: 0.4055
decius: 0.4055
expectansie: 0.4055
was't: -0.2877

Term Frequency (TF):
serue: 0.0002
glory: 0.0001
image: 0.0002
non-pareill: 0.0001
wife: 0.0033
houses: 0.0001
constrained: 0.0001
fitnesse: 0.0001
craues: 0.0001
layes: 0.0001

-----Top 10 Words by Aggregated TF-IDF Scores Across All Documents-----
ham: 0.0085
bru: 0.0055
macb: 0.0053
cassi: 0.0038
cassius: 0.0030
antony: 0.0027
hamlet: 0.0025
macbeth: 0.0024
hor: 0.0024
macd: 0.0023


###Summary:

In this experiment, we implemented the TF-IDF (Term Frequency-Inverse Document Frequency) model to analyze the importance of words in Shakespeare's works from the Gutenberg corpus.


We implemented three core functions to compute TF, IDF, and TF-IDF:

##Term Frequency (TF) :

###Formula:

TF(t,d)=
(Number of times term t appears in document d) / (Total number of terms in document d )
​

Purpose: Measures how frequently a word appears in a specific document.


##Inverse Document Frequency (IDF) :

###Formula:
IDF(t)=log( Total number of documents/
1 + Number of documents containing term t)

Purpose: Measures how rare or common a word is across the entire corpus.

##TF-IDF :

###Formula:
TF-IDF(t,d)=TF(t,d)×IDF(t)

Purpose: Combines TF and IDF to highlight words that are both frequent in a document and rare across the corpus.

###*Key Observations*
Words like "hamlet" had high TF-IDF scores because they were frequent in specific documents but rare across the corpus.

Common words like "king" and "queen" had lower IDF scores due to their widespread presence in multiple documents.