# Importing Libraries

In [1]:
from pyspark import SparkContext
import string
import shutil
from pathlib import Path
from typing import Set

# Helper Functions

In [2]:
# Translation table that deletes every ASCII punctuation character. Built once
# here so clean_word() does not rebuild it for every single word it processes.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_word(word: str) -> str:
    """
    Remove punctuation from `word` and convert it to lowercase.

    Only the ASCII characters listed in `string.punctuation` are stripped;
    any other character (e.g. typographic quotes) is kept. A word made up
    solely of punctuation therefore comes back as an empty string.
    """
    return word.translate(_PUNCTUATION_TABLE).lower()


def save_output(rdd, output_path: str) -> None:
    """
    Saves RDD to output path, removing whatever already exists there first.

    `saveAsTextFile` refuses to write to a path that already exists, so any
    previous result is cleared before saving. The check and the deletion use
    `pathlib`/`shutil`, i.e. they operate on the local filesystem.

    Args:
        rdd: Object exposing `saveAsTextFile(path)`.
        output_path: Destination path for the saved output.
    """
    path = Path(output_path)
    # A symlink must be unlinked rather than walked: shutil.rmtree() raises
    # when it is handed a symbolic link, even one that points at a directory.
    if path.is_dir() and not path.is_symlink():
        print(f"[INFO] Output path '{output_path}' exists. Deleting it first.")
        shutil.rmtree(path)
    elif path.exists() or path.is_symlink():
        # Regular file or (possibly dangling) symlink left at the target path;
        # it would make the save fail just like a stale directory would.
        print(f"[INFO] Output path '{output_path}' exists. Deleting it first.")
        path.unlink()
    rdd.saveAsTextFile(output_path)
    print(f"[INFO] Saved RDD to '{output_path}'")


def print_top_n(rdd, n: int = 25) -> None:
    """
    Prints the first `n` (default is 25) elements of the RDD as `word: count`.

    The elements are whatever `rdd.take(n)` returns, so they are the "top"
    entries only when the RDD has already been sorted by the caller. Each
    element must unpack into a (word, count) pair.
    """
    for entry in rdd.take(n):
        word, count = entry
        print(f"{word}: {count}")

# Word Count

## Initialize `SparkContext`

In [3]:
# Spark context shared by all cells below: master "local", app name "WordCount".
sc = SparkContext(master="local", appName="WordCount")

## Define custom stop words

In [4]:
# Lowercase words that the extended word counts filter out after cleaning.
stop_words: Set[str] = {
    # articles, conjunctions and prepositions
    "the", "and", "a", "an", "to", "in", "is", "it", "of", "that",
    "this", "on", "was", "with", "as", "for", "but", "by", "be", "at",
    "are", "or",
    # pronouns and possessives
    "he", "she", "i", "you", "they", "we", "his", "her", "their", "my",
    "me", "your",
    # auxiliary and modal verbs
    "has", "have", "had", "will", "would", "can", "could", "should",
    "do", "does", "did",
}

## Read input files

In [5]:
# Load each book as its own RDD; the pipelines below treat every element as
# one line of text (they call line.split() on it).
book1 = sc.textFile("book1.txt")
book2 = sc.textFile("book2.txt")
# Combined corpus of both books. book1 stays available on its own because the
# book1-only task further down starts from it directly.
full_text = book1.union(book2)

## Task 1.1 - Basic Word Count

In [6]:
# Count every whitespace-separated token exactly as written: no lowercasing,
# no punctuation stripping and no stop-word filtering at this stage.
basic_counts = (
    full_text
    .flatMap(lambda text_line: text_line.split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
save_output(basic_counts, output_path="output_1.txt")

[INFO] Output path 'output_1.txt' exists. Deleting it first.
[INFO] Saved RDD to 'output_1.txt'


## Task 1.2 - Extended Word Count

In [7]:
# Extended count over both books: normalise each token with clean_word, drop
# empty results and stop words, then order by frequency (highest first).
extended_counts = (
    full_text
    .flatMap(lambda text_line: text_line.split())
    .map(clean_word)
    .filter(lambda token: token and token not in stop_words)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)
save_output(extended_counts, output_path="output_1_extended.txt")

[INFO] Output path 'output_1_extended.txt' exists. Deleting it first.
[INFO] Saved RDD to 'output_1_extended.txt'


## Task 1.4 - Top 25 from book1.txt only (extended)

In [8]:
# Same extended pipeline as for the combined corpus, restricted to book1:
# clean each token, drop empties and stop words, count, sort by count desc.
book1_extended = (
    book1
    .flatMap(lambda text_line: text_line.split())
    .map(clean_word)
    .filter(lambda token: token and token not in stop_words)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# The RDD is already sorted by descending count, so its first 25 elements
# are the 25 most frequent words.
print("\nTop 25 words from book1.txt (extended):")
print_top_n(book1_extended, n=25)


Top 25 words from book1.txt (extended):
not: 1505
mr: 791
him: 725
all: 640
elizabeth: 599
so: 593
were: 566
which: 565
been: 534
from: 519
very: 492
no: 478
what: 452
them: 420
said: 405
such: 398
when: 370
darcy: 358
mrs: 349
there: 347
if: 341
more: 335
much: 330
must: 323
am: 322


## Stop `SparkContext`

In [9]:
# Shut down the SparkContext created at the top of the notebook; `sc` must not
# be used by any cell after this point.
sc.stop()