# Part 1: RDDs

Set up Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Create a custom Spark config to maximize performance.
# NOTE: the property controlling the default number of partitions is
# "spark.default.parallelism"; "spark.parallelism" is not a documented
# Spark property.
conf = (
    SparkConf()
    .set("spark.driver.memory", "4g")
    .set("spark.executor.memory", "7392m")
    .set("spark.default.parallelism", "4")
)
spark = (
    SparkSession.builder
    .appName("ChiSquaredRDD")
    .config(conf=conf)
    .getOrCreate()
)

# SparkContext of the session; used for all RDD operations in this notebook.
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/06/01 12:24:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/01 12:24:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/01 12:24:44 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/06/01 12:24:44 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/06/01 12:24:44 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/06/01 12:24:4

Note: the individual steps of the chi-square computation 'pipeline' are benchmarked with the `%%time` cell magic (no separate helper function is defined).

## Load reviews as RDD

In [2]:
%%time

# Alternative inputs are kept commented out so the data set can be switched quickly.
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
# review_path = "hdfs:///user/e11809642/reviews/reduced_devset.json"
# RDD over the input file; each element is later parsed as one JSON string per review.
input_rdd = sc.textFile(review_path)

CPU times: user 1.44 ms, sys: 1.39 ms, total: 2.83 ms
Wall time: 654 ms


## Obtain stopwords

Load stopwords into (local) memory (Note: file contains duplicates, so convert to set)

In [3]:
%%time

# Relative path: the stopword list is read with open() from the driver's working directory.
stopwords_path = 'stopwords.txt'

def load_unique_lines(filename):
    """Return the set of distinct, non-empty lines of a text file.

    Every line is stripped of leading/trailing whitespace (including the
    newline) before it is added, so entries that differ only in surrounding
    whitespace collapse into one. Lines that are empty after stripping are
    skipped, so the result never contains the empty string.

    Args:
        filename: Path of the text file to read.

    Returns:
        set[str]: The unique stripped lines.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding keeps the result independent of the machine's locale.
    with open(filename, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return {line for line in stripped_lines if line}


# Set of stopwords held in driver memory; used for membership tests during tokenization.
stopwords = load_unique_lines(stopwords_path) 

CPU times: user 0 ns, sys: 700 µs, total: 700 µs
Wall time: 603 µs


Note: local variables like this one will be automatically broadcast to all data nodes if accessed in any RDD transformation

## Compute total number of documents and number of documents per category

For this step, we can just count the number of documents by category and then sum up the number of documents per category to get the total number of documents.

We create an RDD for the category tag of each review (parsing the input JSON string and extracting the `category` attribute) and then compute the counts by calling `countByValue()` on it:

In [4]:
import json
# One category label per review: parse each JSON line and keep only its 'category' field.
category_rdd = input_rdd.map(lambda line: json.loads(line)['category'])

In [5]:
%%time

# Count how often each category label occurs; the result is a dict-like
# mapping category -> number of documents, available locally on the driver.
category_counts = category_rdd.countByValue()
category_counts



CPU times: user 8.99 ms, sys: 6.15 ms, total: 15.1 ms
Wall time: 3.72 s


                                                                                

defaultdict(int,
            {'Patio_Lawn_and_Garde': 994,
             'Apps_for_Android': 2638,
             'Book': 22507,
             'Toys_and_Game': 2253,
             'Office_Product': 1243,
             'Digital_Music': 836,
             'Sports_and_Outdoor': 3269,
             'Automotive': 1374,
             'Beauty': 2023,
             'Musical_Instrument': 500,
             'CDs_and_Vinyl': 3749,
             'Kindle_Store': 3205,
             'Clothing_Shoes_and_Jewelry': 5749,
             'Electronic': 7825,
             'Home_and_Kitche': 4254,
             'Cell_Phones_and_Accessorie': 3447,
             'Pet_Supplie': 1235,
             'Movies_and_TV': 4607,
             'Baby': 916,
             'Tools_and_Home_Improvement': 1926,
             'Grocery_and_Gourmet_Food': 1297,
             'Health_and_Personal_Care': 2982})

This dict is really small (one key-value pair for each category) and will easily fit into memory on the datanodes.

In [6]:
# Total number of documents = sum of the per-category document counts.
review_count = sum(category_counts.values())
review_count

78829

Hmm, so is this alternative approach faster or was that just a coincidence?

## Obtain number of occurrences of each term by category

First, define an RDD for extracting `category` and `reviewText` from each review JSON string:

In [7]:
%%time

# (category, reviewText) pair for every review, parsed from its JSON line.
category_review_rdd = input_rdd.map(json.loads).map(
    lambda review: (review['category'], review['reviewText'])
)

CPU times: user 971 µs, sys: 0 ns, total: 971 µs
Wall time: 844 µs


Then, tokenize the review texts and remove stopwords to obtain the terms. For each unique term appearing in each document, output a tuple of the form $$((\mathit{term}, \mathit{category}), 1)$$ where $\mathit{term}$ is the respective term and $\mathit{category}$ is the category the document is associated with.

In [8]:
%%time

# define pattern for splitting/tokenizing
import re
# A delimiter is any run of characters other than ASCII letters and '<', '>', '^', '|';
# everything between two delimiters becomes one token.
pattern = re.compile(r"[^a-zA-Z<>^|]+")

def map_review_data(pair):
    """Turn one (category, review_text) pair into ((term, category), 1) records.

    The text is split with the notebook-level ``pattern``, tokens are
    lower-cased and de-duplicated, and tokens that are shorter than two
    characters or contained in ``stopwords`` are dropped. Because of the
    de-duplication every term is emitted at most once per document.
    """
    category, review_text = pair
    # lower-case first, then collapse duplicates: each term counts once per document
    unique_tokens = {raw.lower() for raw in pattern.split(review_text)}
    records = []
    for term in unique_tokens:
        if len(term) < 2 or term in stopwords:
            continue
        records.append(((term, category), 1))
    return records

def remap(pair):
    """Re-key a ((term, category), count) record as (term, (category, count))."""
    (term, category), count = pair
    return (term, (category, count))

# term -> iterable of (category, number of documents of that category containing the term)
term_cat_occ_rdd = (
    category_review_rdd
    .flatMap(map_review_data)
    .reduceByKey(lambda left, right: left + right)  # document count per (term, category)
    .map(remap)  # re-key by term only; can I avoid having to do this somehow?
    .groupByKey()
)

CPU times: user 11.4 ms, sys: 2.27 ms, total: 13.7 ms
Wall time: 68 ms


## Calculate Chi-square

We now have all the data to compute the $\chi^2$ metric

In [9]:
def calculate_chi_square(pair, doc_counts_by_category=None, total_docs=None):
    """Compute the chi-square score of one term for every category it occurs in.

    Args:
        pair: ``(term, counts)`` where ``counts`` is an iterable of
            ``(category, n)`` tuples and ``n`` is the number of documents of
            that category which contain the term.
        doc_counts_by_category: Mapping category -> total number of documents
            in that category. Defaults to the notebook-level
            ``category_counts`` when omitted.
        total_docs: Total number of documents. Defaults to the notebook-level
            ``review_count`` when omitted.

    Returns:
        A list of ``(category, (term, chi_square))`` tuples, one per category
        in ``counts``. The score is ``0.0`` when the denominator of the
        chi-square formula is zero (e.g. the term occurs in every document);
        for non-negative counts the numerator is zero as well in that case,
        so the term carries no information about the category.
    """
    term, term_counts_for_categories = pair
    if doc_counts_by_category is None:
        doc_counts_by_category = category_counts
    if total_docs is None:
        total_docs = review_count

    # number of documents containing the term, per category
    doc_count_for_cat = dict(term_counts_for_categories)
    # total number of documents containing the term
    total_doc_count_for_term = sum(doc_count_for_cat.values())

    term_and_cat_chi_squared = []

    for category, count in doc_count_for_cat.items():
        a = count  # number of documents in c which contain t
        b = total_doc_count_for_term - a  # number of documents not in c which contain t
        c = doc_counts_by_category[category] - a  # number of documents in c without t
        d = total_docs - a - b - c  # number of documents not in c without t
        denominator = (a + b) * (a + c) * (b + d) * (c + d)
        if denominator:
            chi_square = total_docs * (a * d - b * c) ** 2 / denominator
        else:
            # a zero marginal would otherwise raise ZeroDivisionError
            chi_square = 0.0
        term_and_cat_chi_squared.append((category, (term, chi_square)))
    return term_and_cat_chi_squared

In [10]:
# quick test: score a made-up term against the first three known categories
cat1, cat2, cat3 = list(category_counts)[:3]
pair = ('term', [(cat1, 10), (cat2, 5), (cat3, 3)])
calculate_chi_square(pair)

[('Patio_Lawn_and_Garde', ('term', 426.28068327509436)),
 ('Apps_for_Android', ('term', 33.22447905348442)),
 ('Book', ('term', 1.246657147592458))]

In [11]:
%%time

# Compute the chi-squared value for each unique term and category pair,
# then group the scores by category:
# category -> iterable of (term, chi-square)
term_cat_chi_squared_rdd = (term_cat_occ_rdd
    .flatMap(calculate_chi_square)
    .groupByKey()
)

CPU times: user 5.29 ms, sys: 1.29 ms, total: 6.58 ms
Wall time: 22.6 ms


## Extract top 75 terms for each category (sorted by $\chi^2$)

In [12]:
%%time

# Perform top K query for each category
topK = 75  # Number of terms to retrieve per category

def get_top_terms(pair, k=None):
    """Select the highest-scoring terms of one category.

    Args:
        pair: ``(category, terms)`` where ``terms`` is an iterable of
            ``(term, chi_square)`` tuples.
        k: Maximum number of terms to keep. Defaults to the notebook-level
            ``topK`` setting when omitted.

    Returns:
        ``(category, top_terms)`` where ``top_terms`` is a list of at most
        ``k`` ``(term, chi_square)`` tuples in descending chi-square order.
    """
    import heapq  # local import keeps the function self-contained

    category, terms = pair
    if k is None:
        k = topK
    # nlargest is documented as equivalent to sorted(..., reverse=True)[:k]
    # but does not need to fully sort the category's whole vocabulary.
    top_terms = heapq.nlargest(k, terms, key=lambda x: x[1])
    return category, top_terms

# Pick the top terms per category, order the categories by name and fetch everything to the driver
results = (
    term_cat_chi_squared_rdd
    .map(get_top_terms)
    .sortByKey()
    .collect()
)

                                                                                

CPU times: user 26.9 ms, sys: 8.57 ms, total: 35.4 ms
Wall time: 7.94 s


In [13]:
# sanity check: there should be exactly one result entry per category
len(results) == len(category_counts)

True

In [14]:
results[0] # check if structure of output is as expected

('Apps_for_Android',
 [('games', 3081.1493374842926),
  ('play', 2158.3694068201294),
  ('graphics', 1505.5108977351497),
  ('kindle', 1470.820942569012),
  ('addictive', 1311.905562727777),
  ('challenging', 1038.1284558527927),
  ('coins', 1002.6647889526382),
  ('addicting', 990.8441134974868),
  ('fire', 956.1470053110605),
  ('levels', 825.3813282736016),
  ('playing', 692.9340396014182),
  ('ads', 642.3969794099202),
  ('puzzles', 596.7716753070063),
  ('apps', 548.7810653104153),
  ('free', 500.9884786241356),
  ('bingo', 409.2358492981346),
  ('mahjong', 322.00891943980963),
  ('download', 303.8649278202287),
  ('faotd', 288.8577201586641),
  ('facebook', 282.51705437029005),
  ('downloaded', 262.77022492215735),
  ('hints', 242.61029019440056),
  ('solitaire', 211.6429957838186),
  ('android', 211.58105849598613),
  ('puzzle', 198.85582217352504),
  ('gameplay', 198.5123356770461),
  ('freezes', 189.67737127837006),
  ('unlock', 185.7521008338788),
  ('played', 180.39650447458

Looks legit!?

## Obtain all unique tokens from Top 75 for every category (ordered alphabetically)

In [15]:
%%time

# alphabetically ordered, de-duplicated union of all per-category top terms
tokens = sorted({term for _, top_terms in results for term, _ in top_terms})

CPU times: user 766 µs, sys: 0 ns, total: 766 µs
Wall time: 786 µs


## Write results to local file

In [16]:
# Each category becomes one line "<category> term1:chi_squared1 term2:chi_squared2 ... "
# (every term:score entry is followed by a single space); the last line of the
# file is the space-separated list of all tokens.
with open(f"output_rdd{'_full_dataset' if review_count > 7000000 else ''}.txt", "w") as file:
    for category, top_terms in results:
        scored_terms = "".join(
            "%s:%f" % (term, score) + " " for term, score in top_terms
        )
        file.write("<%s>" % category + " " + scored_terms + "\n")
    file.write(" ".join(tokens) + "\n")

In [17]:
sc.stop() # stop Spark context to free up resources