In [90]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [91]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [92]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [93]:
import findspark
findspark.init()

In [112]:
from pyspark import SparkConf, SparkContext
import json
import re
from datetime import datetime

In [95]:
if 'sc' in globals():
    sc.stop()

In [96]:
# Initialize Spark
conf = SparkConf().setAppName("WordFrequency")
sc = SparkContext(conf=conf)

In [97]:
# Task 1: Read the data from the 'data' directory using textFile
input_data = sc.textFile("/content/drive/MyDrive/data")

In [98]:
# Task 2: Take only the "text" part of each article, convert it to lowercase, and count word frequencies
word_counts = (
    input_data
    .map(lambda line: json.loads(line))  # Parse each line as a JSON object
    .map(lambda json_obj: json_obj.get("text", ""))  # Extract the "text" part of the JSON object
    .flatMap(lambda text: text.lower().split())  # Convert text to lowercase and split into words
    .map(lambda word: (word, 1))  # Map each word to (word, 1) for counting
    .reduceByKey(lambda a, b: a + b)  # Count word frequencies
)

In [99]:
# Task 3: Remove words with a frequency less than 10
filtered_word_counts = word_counts.filter(lambda x: x[1] >= 10)

In [102]:
# Task 4-1: Total size of the output data
total_size = filtered_word_counts.map(lambda x: x[1]).sum()

In [103]:
# Task 4-2: Frequency of specific words
word_frequencies = filtered_word_counts.filter(lambda x: x[0] in ['congress', 'london', 'washington', 'football']).collectAsMap()


In [104]:
# Task 4-3: Word with maximum frequency for each month
month_max_word = (
    input_data
    .map(lambda line: line.split('|'))
    .filter(lambda parts: len(parts) == 3)
    .map(lambda parts: (parts[0][:7], parts[2].lower()))
    .flatMap(lambda date_text: date_text[1].split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .reduceByKey(lambda a, b: max(a, b))
    .collectAsMap()
)


In [108]:
# Task 4-4: List of words appeared on '2012-09-01' but not on '2012-08-01'
september_words = (
    input_data
    .filter(lambda line: line.startswith('2012-09-01'))
    .flatMap(lambda line: line.split("|")[2].lower().split())
    .distinct()
)

august_words = (
    input_data
    .filter(lambda line: line.startswith('2012-08-01'))
    .flatMap(lambda line: line.split("|")[2].lower().split())
    .distinct()
)

unique_september_words = september_words.subtract(august_words).collect()

In [109]:
# Task 4-5: Frequency of the word 'monsoon' for all months
monsoon_frequency = (
    input_data
    .filter(lambda line: 'monsoon' in line.lower())
    .count()
)


In [118]:
# Task 4-5: Frequency of the word 'monsoon' for all months
# Function to extract the month from a date string
def extract_month(date_string):
    date_obj = datetime.strptime(date_string, "%Y-%m-%d")
    return date_obj.strftime("%B")  # Format as full month name

monsoon_frequency_by_month = (
    input_data
    .filter(lambda line: 'monsoon' in line.lower())
    .map(lambda line: (extract_month(re.search(r'"date": "(\d{4}-\d{2}-\d{2})"', line).group(1)), 1))
    .reduceByKey(lambda a, b: a + b)
    .sortByKey()  # Sort the results by month
)

# Collect the results into a list of tuples
monsoon_frequency_by_month = monsoon_frequency_by_month.collect()

# Print the results in order
for month, count in monsoon_frequency_by_month:
    print(f"{month}, Monsoon Count: {count}")

April, Monsoon Count: 141
August, Monsoon Count: 477
December, Monsoon Count: 96
February, Monsoon Count: 78
January, Monsoon Count: 71
July, Monsoon Count: 716
June, Monsoon Count: 559
March, Monsoon Count: 100
May, Monsoon Count: 286
November, Monsoon Count: 144
October, Monsoon Count: 207
September, Monsoon Count: 341


In [119]:
# Print the results
print("Task 4-1: Total size of the output data:", total_size)
print("Task 4-2: Frequency of specific words:", word_frequencies)
print("Task 4-3: Word with maximum frequency for each month:", month_max_word)
print("Task 4-4: List of words appeared on '2012-09-01' but not on '2012-08-01':", unique_september_words)



Task 4-1: Total size of the output data: 47049002
Task 4-2: Frequency of specific words: {'congress': 28093, 'washington': 546, 'london': 2389, 'football': 1387}
Task 4-3: Word with maximum frequency for each month: {'organize': 1, 'also': 45, 'adding:': 1, 'less': 5, 'onto': 2, 'play-offs.\\"i': 1, 'bollinger': 2, '&amp;': 8, 'venue': 5, 'discipline': 1, 'marchant': 2, 'cheer': 3, 'bhatia.mathews': 1, '72': 1, '(daag)': 1, 'state': 7, 'away,': 1, 'ministers': 1, 'chamber': 1, 'beckoning': 1, 'costly': 1, 'permissions': 1, 'lankans': 1, 'its': 6, 'modern': 2, 'now': 22, 'least': 7, "chandila's": 2, 'so': 22, 'cheaply.mayank': 1, 'immaculate': 1, 'resurrected': 1, 'passage.': 1, 'visiting': 1, 'setbacks,': 1, 'accounted': 6, 'cases:': 1, 'casesuttar': 1, 'stand-in': 3, 'riders': 2, '\\"appanna': 1, 'fl': 1, 'whole': 2, 'separate': 2, 'happening.': 1, '97%,': 1, 'hammering': 1, 'presence': 2, 'girls,youth': 1, 'expertwaste-to-energy': 1, 'metamorphosed': 1, 'compared': 1, 'six': 47, 'sat