# Part 1: RDDs

In [1]:
import json
import re

from pyspark import SparkConf
from pyspark.sql import SparkSession

In [2]:
# Create a custom Spark config to maximize performance.
# NOTE: the default number of partitions for shuffles (e.g. reduceByKey, groupByKey)
# is controlled by "spark.default.parallelism"; "spark.parallelism" is not a Spark
# property and would be silently ignored.
conf = (
    SparkConf()
    .set("spark.driver.memory", "4g")
    .set("spark.executor.memory", "7392m")
    .set("spark.default.parallelism", "4")
)
# Reuse an existing session if one is already running, otherwise create one.
spark = (
    SparkSession.builder
    .appName("ChiSquaredRDD")
    .config(conf=conf)
    .getOrCreate()
)

# The RDD API is accessed through the SparkContext behind the session.
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/06/01 18:28:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/01 18:28:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/01 18:28:57 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/06/01 18:28:57 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/06/01 18:28:57 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/06/01 18:28:5

## Load reviews as RDD

In [3]:
%%time

# Alternative input (presumably the full dataset, judging by the path; confirm before switching):
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
# Each RDD element is one line of the input file; later cells parse every line with json.loads.
input_rdd = sc.textFile(review_path)

CPU times: user 2.09 ms, sys: 978 µs, total: 3.06 ms
Wall time: 564 ms


## Obtain stopwords

In [4]:
%%time

# Load stopwords into (local) memory (Note: file contains duplicates, so convert to set).
# The context manager guarantees the file handle is closed once the content is read.
with open("stopwords.txt") as stopwords_file:
    stopwords = set(stopwords_file.read().splitlines())

CPU times: user 1.86 ms, sys: 0 ns, total: 1.86 ms
Wall time: 1.72 ms


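Note: a local variable like this one is captured by any function (closure) that references it inside an RDD transformation, so it is serialized together with that function and shipped to the executors automatically. For large objects, an explicit broadcast variable (`sc.broadcast(...)`) is preferable.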

## Compute the total number of documents and number of documents per category

For this step, we can just count the number of documents by category and then sum up the number of documents per category to get the total number of documents.

We create an RDD for the category tag of each review (parsing the input JSON string and extracting the `category` attribute) and then compute the counts by calling `countByValue()` on it:

In [5]:
%%time

# One element per review: the value of its `category` attribute.
category_rdd = input_rdd.map(
    lambda raw_line: json.loads(raw_line)['category']
)

CPU times: user 835 µs, sys: 48 µs, total: 883 µs
Wall time: 24.5 ms


In [6]:
%%time

# countByValue() is an action: it runs the Spark job and returns the number of
# reviews per category to the driver as a dict-like object (displayed below).
category_counts = category_rdd.countByValue()
category_counts

[Stage 0:>                                                          (0 + 2) / 2]

CPU times: user 8.34 ms, sys: 4.84 ms, total: 13.2 ms
Wall time: 3.55 s


                                                                                

defaultdict(int,
            {'Patio_Lawn_and_Garde': 994,
             'Apps_for_Android': 2638,
             'Book': 22507,
             'Toys_and_Game': 2253,
             'Office_Product': 1243,
             'Digital_Music': 836,
             'Sports_and_Outdoor': 3269,
             'Automotive': 1374,
             'Beauty': 2023,
             'Musical_Instrument': 500,
             'CDs_and_Vinyl': 3749,
             'Kindle_Store': 3205,
             'Clothing_Shoes_and_Jewelry': 5749,
             'Electronic': 7825,
             'Home_and_Kitche': 4254,
             'Cell_Phones_and_Accessorie': 3447,
             'Pet_Supplie': 1235,
             'Movies_and_TV': 4607,
             'Baby': 916,
             'Tools_and_Home_Improvement': 1926,
             'Grocery_and_Gourmet_Food': 1297,
             'Health_and_Personal_Care': 2982})

This dict is minuscule (one key-value pair for each category) and will easily fit into memory on the data-nodes.

In [7]:
%%time

# Total number of reviews = sum of the per-category counts (plain Python on the driver).
review_count = sum(category_counts.values())
review_count

CPU times: user 22 µs, sys: 0 ns, total: 22 µs
Wall time: 25.5 µs


78829

## Obtain the number of occurrences of each term by category

First, define an RDD for extracting `category` and `reviewText` from each review JSON string:

In [8]:
%%time

# Parse every JSON line and keep only the (category, reviewText) pair of each review.
category_review_rdd = (
    input_rdd
    .map(json.loads)
    .map(lambda review: (review['category'], review['reviewText']))
)

CPU times: user 324 µs, sys: 355 µs, total: 679 µs
Wall time: 579 µs


Then, tokenize the review texts and remove stopwords to obtain the terms. For each unique term appearing in each document, output a tuple of the form $$((term, category), 1)$$ where $term$ is the respective term and $category$ is the category the document is associated with

In [9]:
%%time

# Delimiter pattern used for tokenizing: any run of characters other than
# ASCII letters and the symbols < > ^ | separates two tokens.
pattern = re.compile(r"[^a-zA-Z<>^|]+")


def map_review_data(pair):
    """Turn one (category, review_text) pair into ((term, category), 1) records.

    The text is split on ``pattern`` and lower-cased; every distinct token is
    emitted at most once per review. Tokens shorter than two characters and
    tokens contained in the module-level ``stopwords`` collection are dropped.

    Returns a list of ``((term, category), 1)`` tuples (possibly empty).
    """
    category, review_text = pair
    # De-duplicate first so that each term counts once per document.
    unique_tokens = {piece.lower() for piece in pattern.split(review_text)}
    emitted = []
    for candidate in unique_tokens:
        if len(candidate) < 2 or candidate in stopwords:
            continue
        emitted.append(((candidate, category), 1))
    return emitted


def remap(pair):
    """Re-key a ((term, category), count) record by term.

    Returns ``(term, (category, count))``.
    """
    (term, cat), count = pair
    return term, (cat, count)


# Sum the per-document 1s for every (term, category) key, then regroup by term:
# term -> iterable of (category, count)
term_cat_occ_rdd = (
    category_review_rdd
    .flatMap(map_review_data)
    .reduceByKey(lambda left, right: left + right)
    .map(remap)
    .groupByKey()
)

CPU times: user 12.6 ms, sys: 671 µs, total: 13.3 ms
Wall time: 70.1 ms


## Calculate Chi-square

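We now have all the data needed to compute the $\chi^2$ metric for each term $t$ and category $c$:

$$\chi^2_{tc} = \frac{N\,(AD - BC)^2}{(A + B)(A + C)(B + D)(C + D)}$$

where $N$ is the total number of documents, $A$ is the number of documents in $c$ that contain $t$, $B$ the number of documents not in $c$ that contain $t$, $C$ the number of documents in $c$ without $t$, and $D$ the number of documents not in $c$ without $t$.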

In [10]:
def calculate_chi_square(pair):
    """Compute the chi-squared score of one term for every category it occurs in.

    Args:
        pair: ``(term, term_counts_for_categories)`` where the second element is
            an iterable of ``(category, document_count)`` pairs for that term.

    Returns:
        A list of ``(category, (term, chi_squared))`` tuples, one per category in
        which the term occurs.

    Reads the module-level ``category_counts`` (documents per category) and
    ``review_count`` (total number of documents).
    """
    term, term_counts_for_categories = pair

    # Use to retrieve number of documents containing term for a particular category
    doc_count_for_cat = dict(term_counts_for_categories)

    # total number of documents containing the term
    total_doc_count_for_term = sum(doc_count_for_cat.values())

    term_and_cat_chi_squared = []

    for category, count in doc_count_for_cat.items():
        # number of documents in c which contain t
        a = count
        # number of documents not in c which contain t
        b = total_doc_count_for_term - a
        # number of documents in c without t
        c = category_counts[category] - a
        # number of documents not in c without t
        d = review_count - a - b - c

        denominator = (a + b) * (a + c) * (b + d) * (c + d)
        if denominator == 0:
            # Degenerate contingency table (e.g. the term occurs in every document,
            # or there is only one category): the term carries no information about
            # the category, so score it 0 instead of dividing by zero.
            chi_squared = 0.0
        else:
            chi_squared = review_count * (a * d - b * c) ** 2 / denominator

        term_and_cat_chi_squared.append((category, (term, chi_squared)))
    return term_and_cat_chi_squared

In [11]:
%%time

# Compute the chi-squared value for each unique term and category pair,
# then regroup by category:
# category -> iterable of (term, chi-square)
term_cat_chi_squared_rdd = (
    term_cat_occ_rdd
    .flatMap(calculate_chi_square)
    .groupByKey()
)

CPU times: user 5 ms, sys: 1.11 ms, total: 6.11 ms
Wall time: 25.8 ms


## Extract the top 75 terms for each category (sorted by $\chi^2$)

In [12]:
%%time

# Top-K query per category
topK = 75  # how many terms to keep for each category


def get_top_terms(pair):
    """Keep the ``topK`` highest-scoring terms of one category.

    ``pair`` is ``(category, terms)`` with ``terms`` an iterable of
    ``(term, chi_squared)`` tuples. Returns ``(category, top_terms)`` where
    ``top_terms`` is a list ordered by descending chi-squared value.
    """
    category, terms = pair
    ranked = list(terms)
    ranked.sort(key=lambda term_score: term_score[1], reverse=True)
    return category, ranked[:topK]


# Reduce every category to its top terms, order the categories by name and
# bring the (small) result to the driver.
results = (
    term_cat_chi_squared_rdd
    .map(get_top_terms)
    .sortByKey()
    .collect()
)

                                                                                

CPU times: user 26.3 ms, sys: 4.15 ms, total: 30.4 ms
Wall time: 7.65 s


## Obtain all unique tokens from Top 75 for every category (ordered alphabetically)

In [13]:
%%time

# Union of the top terms of all categories, de-duplicated and sorted alphabetically.
tokens = sorted({
    term
    for _category, top_terms_for_cat in results
    for term, _chi_squared in top_terms_for_cat
})

CPU times: user 765 µs, sys: 0 ns, total: 765 µs
Wall time: 779 µs


## Write results to local file

In [14]:
# Format of each output line: "<category> term1:chi_squared1 term2:chi_squared2 ... term75:chi_squared75"
# finally, append the list of tokens to the end of the file
# The encoding is set explicitly so the file does not depend on the platform default.
with open("output_rdd.txt", "w", encoding="utf-8") as file:
    for category, top_terms in results:
        # Every "term:score" entry is followed by a single space, exactly as before.
        scored_terms = "".join("%s:%f " % (term, chi_square) for term, chi_square in top_terms)
        file.write("<%s> %s\n" % (category, scored_terms))
    file.write(" ".join(tokens) + "\n")

In [15]:
sc.stop()  # stop the Spark context to free up the resources held by this application