<a href="https://colab.research.google.com/github/SharveshSp04/sxs210399-sharvesh-subapalaniraj/blob/main/q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Q1: Word Count Analysis - AUTO EXECUTION
import re
from pyspark.sql import SparkSession

def preprocess_text(text):
    """Normalise a piece of text for word counting.

    The text is lower-cased, every character that is not an ASCII letter,
    an ASCII digit or whitespace (as matched by ``\\s``) is deleted, and
    leading/trailing whitespace is trimmed from the result.
    """
    unwanted_chars = re.compile(r'[^a-zA-Z0-9\s]')
    lowered = text.lower()
    return unwanted_chars.sub('', lowered).strip()

def create_mock_q1_data():
    """Write a small sample corpus to ``q1_dataset.txt`` for Q1 testing.

    The file is created in (or overwritten in) the current working
    directory, and a confirmation message is printed afterwards.
    """
    sample_corpus = """
    frankenstein monster life frankenstein monster
    life is good the monster in frankenstein
    life of monster frankenstein story
    big data spark hadoop spark data
    hello world hello spark big data
    frankenstein monster life data spark
    """
    # Persist the sample so Spark can read it like a real dataset
    with open("q1_dataset.txt", "w") as dataset_file:
        dataset_file.write(sample_corpus)
    print("Created mock Q1 dataset")

def _save_rdd_as_text(rdd, output_dir):
    """Write *rdd* as a single part file under *output_dir*, replacing old output."""
    import shutil

    # saveAsTextFile fails when the target directory already exists, which
    # made every re-run of this cell crash. Clearing the local directory
    # left behind by a previous run keeps the cell re-runnable.
    shutil.rmtree(output_dir, ignore_errors=True)
    rdd.coalesce(1).saveAsTextFile(output_dir)


def q1_solution():
    """Run the Question 1 word-count pipeline and print its results.

    Reads ``q1_dataset.txt`` (generating mock data when the file is missing
    or empty), then:

    A. counts the occurrences of every word,
    B. extracts the counts of "frankenstein", "monster" and "life",
    C. finds the 20 most frequent words.

    Each part is saved as text under ``q1a_word_counts_output``,
    ``q1b_specific_words_output`` and ``q1c_top_20_words_output``
    respectively, and all three results are printed.

    Returns:
        A tuple ``(word_counts, specific_counts, top_20_words)`` where the
        first two are RDDs of ``(word, count)`` pairs and the last is a list
        of ``(word, count)`` pairs ordered by descending count (ties are
        broken alphabetically so the ranking is reproducible).

    Raises:
        Exception: whatever Spark raised while reading ``q1_dataset.txt`` if
            that file already exists locally; it is re-raised rather than
            overwriting the existing dataset with mock data.
    """
    import os

    print("\n" + "="*50)
    print("EXECUTING QUESTION 1")
    print("="*50)

    # Initialize SparkSession and SparkContext
    spark = SparkSession.builder.appName("WordCount").getOrCreate()
    sc = spark.sparkContext

    # Create mock data if the real dataset doesn't exist or is empty.
    # textFile is lazy, so count() is what actually surfaces a missing file.
    try:
        text_rdd = sc.textFile("q1_dataset.txt")
        needs_mock_data = text_rdd.count() == 0
    except Exception:
        # Only fall back to mock data when the dataset is genuinely absent;
        # an unrelated failure must not clobber an existing dataset file.
        if os.path.exists("q1_dataset.txt"):
            raise
        needs_mock_data = True

    if needs_mock_data:
        create_mock_q1_data()
        text_rdd = sc.textFile("q1_dataset.txt")

    # A. Count occurrences for each word
    words_rdd = text_rdd.flatMap(lambda line: preprocess_text(line).split())
    # Cached because several actions below (save, filter, top-20, collect)
    # reuse the same aggregated counts.
    word_counts = (
        words_rdd.map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
        .cache()
    )

    # Save output for part A
    _save_rdd_as_text(word_counts, "q1a_word_counts_output")

    # B. Find specific words count
    # Filtering the aggregated counts gives the same pairs as re-counting the
    # raw words, without a second pass and shuffle over the whole corpus.
    target_words_set = frozenset(("frankenstein", "monster", "life"))
    specific_counts = word_counts.filter(lambda pair: pair[0] in target_words_set)

    # Save output for part B
    _save_rdd_as_text(specific_counts, "q1b_specific_words_output")

    # C. Find top 20 words with highest occurrences
    # takeOrdered avoids fully sorting the RDD; the word is a secondary key
    # so equal counts always come out in the same order.
    top_20_words = word_counts.takeOrdered(20, key=lambda pair: (-pair[1], pair[0]))

    # Convert to RDD and save
    _save_rdd_as_text(sc.parallelize(top_20_words), "q1c_top_20_words_output")

    # Print results for verification
    print("=== Q1 Results ===")
    print("\nA. All word counts:")
    for word, count in word_counts.collect():
        print(f"{word}: {count}")

    print("\nB. Specific words count:")
    for word, count in specific_counts.collect():
        print(f"{word}: {count}")

    print("\nC. Top 20 words:")
    for i, (word, count) in enumerate(top_20_words, 1):
        print(f"{i:2d}. {word}: {count}")

    return word_counts, specific_counts, top_20_words

# AUTO EXECUTE Q1
# Runs the whole Q1 pipeline as soon as this cell/module is executed and keeps
# the three values returned by q1_solution() (all word counts, the counts of
# the specific target words, and the top-20 list) in module-level names.
q1_word_counts, q1_specific_counts, q1_top_20 = q1_solution()


EXECUTING QUESTION 1
Created mock Q1 dataset
=== Q1 Results ===

A. All word counts:
monster: 5
good: 1
of: 1
story: 1
big: 2
hadoop: 1
hello: 2
world: 1
frankenstein: 5
life: 4
is: 1
the: 1
in: 1
data: 4
spark: 4

B. Specific words count:
monster: 5
frankenstein: 5
life: 4

C. Top 20 words:
 1. monster: 5
 2. frankenstein: 5
 3. life: 4
 4. data: 4
 5. spark: 4
 6. big: 2
 7. hello: 2
 8. good: 1
 9. of: 1
10. story: 1
11. hadoop: 1
12. world: 1
13. is: 1
14. the: 1
15. in: 1
