# Spark Text Exercises
This notebook demonstrates Exercises 1-6. See `spark_text_lab.py` for a runnable script.

In [1]:
# Install requirements in your environment (run once)
# %pip install -r requirements.txt
import nltk

# Recent NLTK releases (3.8.2 and later) load the Punkt tokenizer from the
# 'punkt_tab' package, while older releases use the pickled 'punkt' package.
# Fetch both so tokenization works regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\landm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
# RDD word count example (Exercise 1)
from pyspark.sql import SparkSession

# Obtain the session used by every later cell (local master, all cores).
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('SparkTextLabNotebook')
    .getOrCreate()
)

# Helper import deliberately stays after session creation, as in the original order.
from spark_text_lab import rdd_word_counts, compare_stopwords

counts = rdd_word_counts(spark, 'sample_corpus.txt')

# `total` and `filtered` are unpacked here but not displayed by this cell.
total, top_before, top_after, filtered = compare_stopwords(counts)

for label, ranking in (('Top before:', top_before), ('Top after:', top_after)):
    print(label, ranking)

In [None]:
# DataFrame examples (Exercise 2)
from spark_text_lab import (
    df_from_counts,
    weighted_avg_word_length,
    ten_longest,
    filter_count_ge,
)

# Requires `spark` and `counts` from the Exercise 1 cell.
df, total_count = df_from_counts(spark, counts)

avg_word_length = weighted_avg_word_length(df, total_count)
print('Weighted avg length:', avg_word_length)

longest_words = ten_longest(df)
print('10 longest:', longest_words)

# The threshold 2 matches the 'count>=2' label printed below.
filtered_df, share = filter_count_ge(df, 2)
print('Share for count>=2:', share)

## MongoDB example (Exercise 3)
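Update `MONGO_URI` below to your MongoDB connection string. The cell demonstrates inserting the word counts and running a sample query; those lines are commented out, so uncomment them once a MongoDB server is reachable.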

In [None]:
import os

# Connection string: read from the MONGO_URI environment variable when it is
# set, otherwise fall back to the local default. Prefer the environment
# variable for URIs that embed credentials so they are never saved in the notebook.
MONGO_URI = os.environ.get('MONGO_URI', 'mongodb://localhost:27017')  # update as needed
from spark_text_lab import mongo_store_counts, query_mongo_by_length
# Uncomment to run if MongoDB is available
# coll = mongo_store_counts(counts, MONGO_URI)
# print('Inserted to:', coll.full_name)
# print('Words length>=7:', query_mongo_by_length(MONGO_URI, 7)[:20])

In [None]:
# Bigrams (Exercise 4)
from spark_text_lab import top_bigrams

# Same corpus file as the earlier cells; 20 is passed as the third argument.
bigram_results = top_bigrams(spark, 'sample_corpus.txt', 20)
print('Top bigrams:', bigram_results)

In [None]:
# TF-IDF (Exercise 5)
from spark_text_lab import compute_tfidf

tfidf_source = 'sample_corpus.txt'
res = compute_tfidf(spark, tfidf_source)

# Preview three rows with truncate=False.
res.show(3, truncate=False)

In [None]:
# Per-file and global counts (Exercise 6)
# Use a folder path containing text files, e.g., './texts'
# NOTE(review): `per_file_and_global_counts` is not imported in any cell shown
# here, so the lines below would raise NameError if uncommented as-is.
# Presumably it lives in spark_text_lab like the other helpers — verify, then
# uncomment the import together with the example.
# from spark_text_lab import per_file_and_global_counts
# per_file, global_counts_rdd = per_file_and_global_counts(spark, './texts')
# print('Per-file sample:', per_file[:2])
# print('Global top:', global_counts_rdd.takeOrdered(20, key=lambda x:-x[1]))