# Part 1: RDDs

In [None]:
import json
import os
import re

from pyspark.sql import SparkSession

In [None]:
# Reuse the active SparkSession if there is one, otherwise start one named "ChiSquaredRDD"
session_builder = SparkSession.builder.appName("ChiSquaredRDD")
spark = session_builder.getOrCreate()
# The SparkContext is the entry point for the RDD API used in the rest of this part
sc = spark.sparkContext

In [None]:
# Extract the matriculation number ("e" followed by 8 digits) from the absolute path of the
# current working directory. os.getcwd() has no trailing slash, so the number may be the last
# path component: accept either "/" or the end of the path right after it.
matriculation_match = re.search(r"/(e\d{8})(?:/|$)", os.getcwd())
if matriculation_match is None:
    raise RuntimeError(
        "Could not find a matriculation number (e########) in the working directory: %s" % os.getcwd())
matriculation_number = matriculation_match.group(1)

# Put the stopwords file in the HDFS home directory for the current user (e.g. /user/e12345678/stopwords.txt)
# Only do this if it does not already exist there: a non-zero exit status of the test command
# is treated as "file missing"
if os.system("hdfs dfs -test -e /user/%s/stopwords.txt" % matriculation_number):
    # Fail here with a clear message instead of later when the missing file is read
    if os.system("hdfs dfs -put stopwords.txt /user/%s/stopwords.txt" % matriculation_number):
        raise RuntimeError("Uploading stopwords.txt to /user/%s/ in HDFS failed" % matriculation_number)

# Load the review file as an RDD with one element per line of the file
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
input_rdd = sc.textFile(review_path)

# Load the stopword file (one entry per line) into a set on the driver for fast membership tests
stopwords_path = "hdfs:///user/%s/stopwords.txt" % matriculation_number
stopwords = set(sc.textFile(stopwords_path).collect())

In [None]:
# Decode every input line as JSON and keep only the (category, reviewText) pair of each review
category_review_rdd = input_rdd \
    .map(json.loads) \
    .map(lambda review: (review['category'], review['reviewText']))

In [None]:
# Compute the total number of documents (reviews) in the input;
# this is the overall count used as N when computing the chi-squared values
review_count = category_review_rdd.count()

In [None]:
# Count how many reviews belong to each category and bring the result
# to the driver as a dictionary: category -> number of reviews
category_ones_rdd = category_review_rdd.map(lambda review: (review[0], 1))
category_counts = category_ones_rdd \
    .reduceByKey(lambda left, right: left + right) \
    .collectAsMap()

In [None]:
# Terms are the pieces that remain after splitting the review text on every run of
# characters other than ASCII letters and "<", ">", "^", "|"
pattern = re.compile(r"[^a-zA-Z<>^|]+")


def _term_category_pairs(pair):
    """Yield ((term, category), 1) once for every distinct usable term of one review."""
    category, review_text = pair
    # A set keeps each lower-cased term only once per review
    distinct_terms = {token.lower() for token in pattern.split(review_text)}
    for term in distinct_terms:
        # Skip stopwords and terms shorter than two characters
        # (the length check also removes empty strings produced by the split)
        if len(term) >= 2 and term not in stopwords:
            yield (term, category), 1


# Compute, for each (term, category) pair, the number of reviews of that category containing the term
term_category_occurrences_rdd = category_review_rdd \
    .flatMap(_term_category_pairs) \
    .reduceByKey(lambda left, right: left + right)

In [None]:
# Drop the category from the key and add up the per-category counts of each term:
# term -> count of the term across all reviews
per_term_counts_rdd = term_category_occurrences_rdd.map(lambda entry: (entry[0][0], entry[1]))
term_occurrences_rdd = per_term_counts_rdd.reduceByKey(lambda left, right: left + right)

In [None]:
# Attach the overall count of each term to each of its per-category counts
# (term, category) -> (term_occurrences, term_category_occurrences)

# Re-key by term alone so the records can be joined with the per-term totals
term_keyed_rdd = term_category_occurrences_rdd \
    .map(lambda entry: (entry[0][0], (entry[0][1], entry[1])))

# After the join a record is (term, ((category, term_category_occurrences), term_occurrences))
combined_rdd = term_keyed_rdd \
    .join(term_occurrences_rdd) \
    .map(lambda joined: ((joined[0], joined[1][0][0]), (joined[1][1], joined[1][0][1])))

In [None]:
# Compute the chi-squared value for a given term and category
def calculate_chi_square(category, term_occurrences, term_category_occurrences,
                         category_count=None, total_count=None):
    """Return the chi-squared value of a term for a category.

    The value is computed from the 2x2 contingency table
        a: documents of the category that contain the term
        b: documents of other categories that contain the term
        c: documents of the category that do not contain the term
        d: documents of other categories that do not contain the term

    :param category: category the term is scored for
    :param term_occurrences: number of documents containing the term
    :param term_category_occurrences: number of documents of the category containing the term
    :param category_count: number of documents in the category; defaults to
        ``category_counts[category]`` from the module scope
    :param total_count: total number of documents; defaults to the module-level ``review_count``
    :return: the chi-squared value as a float; 0.0 when a row or column total of the table
        is zero, where the formula would otherwise divide by zero
    """
    if category_count is None:
        category_count = category_counts[category]
    if total_count is None:
        total_count = review_count

    a = term_category_occurrences
    b = term_occurrences - a
    c = category_count - a
    d = total_count - a - b - c

    denominator = (a + b) * (a + c) * (b + d) * (c + d)
    if denominator == 0:
        # A zero marginal (e.g. the term occurs in every document, or there is only one
        # category) also makes the numerator zero, so there is no association to report
        return 0.0
    return total_count * (a * d - b * c) ** 2 / denominator

In [None]:
# Score every (term, category) pair and gather the scored terms of each category
# (category) -> [(term, chi-square)]
def _score_term(record):
    """Map ((term, category), (term_occurrences, term_category_occurrences)) to (category, (term, chi-square))."""
    (term, category), (term_occurrences, term_category_occurrences) = record
    chi_square = calculate_chi_square(category, term_occurrences, term_category_occurrences)
    return category, (term, chi_square)


term_category_chi_squared_rdd = combined_rdd.map(_score_term).groupByKey()

In [None]:
# Keep, for each category, the 75 terms with the highest chi-square value (highest first)
# and order the records by category in ascending order
# (category) -> [(term, chi-square)]
def _top_terms(scored_terms):
    """Return the 75 highest-scoring (term, chi-square) pairs, best first."""
    ranked = list(scored_terms)
    ranked.sort(key=lambda scored: scored[1], reverse=True)
    return ranked[:75]


chi_square_rdd = term_category_chi_squared_rdd \
    .mapValues(_top_terms) \
    .sortByKey()

In [None]:
# Gather every distinct term that appears in the top 75 of at least one category
# and return them to the driver sorted in alphabetical order
tokens = sorted(
    chi_square_rdd
    .flatMap(lambda entry: [term for term, _ in entry[1]])
    .distinct()
    .collect()
)

In [None]:
# Write the result to "chi_squared.txt" in the local file system:
# one line per category in the format "<category> term1:chi_squared1 term2:chi_squared2 ... "
# (every term:value entry is followed by a single space), then one final line
# holding all selected tokens separated by spaces
with open("chi_squared.txt", "w") as output_file:
    for category, top_terms in chi_square_rdd.collect():
        scored_terms = "".join("%s:%f " % (term, score) for term, score in top_terms)
        output_file.write("<%s> " % category + scored_terms + "\n")
    output_file.write(" ".join(tokens) + "\n")

In [None]:
# Shut down the SparkContext now that the results have been written to the local file
sc.stop()