# Week 7
Choose any corpus freely available on the internet. For each document in the corpus, count how many times each stop word occurs and find out which stop words occur most frequently. Further, calculate the term frequency (tf) and inverse document frequency (idf). The motivation behind this is to find out how important a document is to a given query. For example, if the query is “The brown crow”, “the” is less important, while “brown” and “crow” are relatively more important. Since “the” is a more common word, its tf will be high. Hence we multiply tf by idf, which reflects how common the word is across documents, to reduce its weight.

In [9]:
import nltk
import string
from nltk.corpus import stopwords, gutenberg
from collections import Counter
import math

# Fetch the NLTK data packages this notebook relies on.
# Project Gutenberg sample corpus, read below via `gutenberg.sents`.
nltk.download('gutenberg')
# English stop-word list, read below via `stopwords.words('english')`.
nltk.download('stopwords')
# Punkt tokenizer models -- presumably needed for the sentence splitting
# behind `gutenberg.sents`; confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\bansa\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bansa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bansa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Read "Emma" from the Gutenberg corpus as a sequence of tokenised sentences.
texts = gutenberg.sents('austen-emma.txt')

# Flatten the sentences into a single list of lowercased tokens.
flat_text = [
    token
    for tokens in texts
    for token in map(str.lower, tokens)
]
print(f"First 50 words: {flat_text[:50]}")

First 50 words: ['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty', '-', 'one', 'years', 'in', 'the']


In [11]:
# Lookup sets used when filtering tokens: NLTK's English stop words and the
# individual characters of string.punctuation.
stop_words = {*stopwords.words('english')}
punctuation = {*string.punctuation}

In [None]:
# Keep only content words: drop stop words and any token made up entirely of
# punctuation characters. The tokenizer emits runs of punctuation such as
# '--' or '."' as single tokens, so testing `word not in punctuation` alone
# would let those multi-character tokens through and count them as words.
filtered_text = [
    word for word in flat_text
    if word not in stop_words
    and not all(char in punctuation for char in word)
]
# Raw occurrence count of every remaining word.
word_frequency = Counter(filtered_text)

In [None]:
def compute_tf(corpus):
    """Return the relative frequency of every term in ``corpus``.

    Each term's occurrence count is divided by the total number of tokens
    in ``corpus``. The result is a ``Counter`` keyed by term; an empty
    corpus yields an empty ``Counter``.
    """
    occurrences = Counter(corpus)
    token_total = len(corpus)
    return Counter(
        {term: hits / token_total for term, hits in occurrences.items()}
    )


def compute_idf(corpus, documents):
    """Compute an inverse document frequency for every distinct term in ``corpus``.

    The score of a term is ``log(N / (1 + df))`` where ``N`` is
    ``len(documents)`` and ``df`` is the number of documents that contain
    the term. Because of the ``+ 1`` in the denominator, a term that occurs
    in every document receives a negative score.

    Args:
        corpus: Iterable of hashable terms; duplicates are ignored.
        documents: Sized collection of documents. Each document is an
            iterable of hashable tokens (list, tuple, set, ...). A ``str``
            document is tested with substring containment.

    Returns:
        A ``dict`` mapping each distinct term of ``corpus`` to its score.

    Raises:
        ValueError: Raised by ``math.log`` when ``documents`` is empty while
            ``corpus`` is not (logarithm of zero).
    """
    vocabulary = set(corpus)
    if not vocabulary:
        return {}

    total_docs = len(documents)
    # Build each document's membership set once. Testing `word in doc`
    # against a list rescans the whole document for every vocabulary term.
    # Strings and existing sets are used as they are.
    doc_vocabularies = [
        doc if isinstance(doc, (str, set, frozenset)) else set(doc)
        for doc in documents
    ]

    idf_dict = {}
    for word in vocabulary:
        doc_count = sum(1 for vocab in doc_vocabularies if word in vocab)
        idf_dict[word] = math.log(total_docs / (1 + doc_count))
    return idf_dict

In [None]:
# Term frequencies are computed for "Emma" alone.
tf = compute_tf(filtered_text)

# IDF only makes sense over a collection of documents: with a single document
# every term has the same document frequency, so every IDF value is identical.
# Treat each file of the Gutenberg corpus as one document, stored as a set of
# lowercased tokens so that membership tests in compute_idf are cheap.
documents = [
    {token.lower() for token in gutenberg.words(fileid)}
    for fileid in gutenberg.fileids()
]

idf = compute_idf(filtered_text, documents)

In [None]:
# Count every token of the unfiltered text in one pass, then keep only the
# stop words. Calling flat_text.count() once per stop word would rescan the
# whole text each time.
token_counts = Counter(flat_text)
stop_word_counts = Counter({word: count
                            for word, count in token_counts.items()
                            if word in stop_words})

print("\nStop word counts in the original text (before filtering):")
for stop_word, count in stop_word_counts.items():
    print(f"{stop_word}: {count}")

most_common_stopwords = stop_word_counts.most_common(10)
# These counts come from the unfiltered text; the filtered text contains no
# stop words at all, so the heading must not claim otherwise.
print("\nMost common stop words in the original text:")
for stop_word, count in most_common_stopwords:
    print(f"{stop_word}: {count}")

print("\nFirst 10 words in the text with their raw frequency, TF, and IDF:")

# dict.fromkeys de-duplicates while keeping first-appearance order, so these
# really are the first 10 distinct words. Slicing a set would give an
# arbitrary selection.
unique_words = list(dict.fromkeys(filtered_text))[:10]

for word in unique_words:
    print(f"\nWord: {word}")
    print(f"Raw Frequency: {word_frequency[word]}")
    print(f"Term Frequency (TF): {tf.get(word, 0)}")
    print(f"Inverse Document Frequency (IDF): {idf.get(word, 0)}")


Stop word counts in the original text (before filtering):
a: 3129
during: 17
to: 5239
he: 1806
then: 169
been: 759
can: 284
ours: 7
no: 742
such: 489
itself: 15
after: 161
further: 1
there: 549
don: 16
that: 1806
i: 3178
but: 1441
whom: 73
doing: 45
what: 536
re: 2
yours: 5
ourselves: 17
himself: 146
if: 485
s: 935
and: 4896
at: 1031
between: 73
own: 301
our: 97
themselves: 40
who: 294
not: 2140
should: 369
have: 1320
being: 358
with: 1217
very: 1202
some: 262
herself: 279
has: 250
before: 250
were: 600
was: 2398
about: 249
had: 1624
the: 5201
down: 70
each: 46
your: 364
when: 363
on: 692
all: 845
too: 254
now: 309
am: 425
by: 571
d: 12
for: 1347
against: 46
it: 2528
than: 415
here: 154
m: 2
be: 1975
ma: 12
an: 464
both: 85
we: 349
nor: 64
does: 130
off: 99
where: 87
only: 341
is: 1240
shan: 1
under: 63
him: 759
out: 212
o: 8
those: 95
just: 165
them: 432
her: 2469
theirs: 1
same: 102
once: 83
do: 640
in: 2188
more: 467
will: 570
my: 728
myself: 103
hers: 20
which: 556
from: 546
few: 