# Part 1: RDDs

Set up Spark

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook and keep a handle to
# its SparkContext, which is used for the RDD operations below.
spark = (
    SparkSession.builder
    .appName("ChiSquaredRDD")
    .getOrCreate()
)
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/05/27 10:58:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/27 10:58:32 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


Note: each step of the chi-square computation 'pipeline' is benchmarked with the `%%time` cell magic (no separate helper function is defined).

## Load reviews as RDD

In [2]:
%%time

# HDFS location of the review data used for this run.
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
# Alternative inputs, kept so the run can be switched by (un)commenting a line:
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
# review_path = "hdfs:///user/e11809642/reviews/reduced_devset.json"
# Read the file as an RDD of text lines.
input_rdd = sc.textFile(review_path)

CPU times: user 751 µs, sys: 2.32 ms, total: 3.07 ms
Wall time: 604 ms


## Obtain stopwords

Load stopwords into (local) memory (Note: file contains duplicates, so convert to set)

In [3]:
%%time

# Relative path of the stopword list (resolved against the current working directory).
stopwords_path = 'stopwords.txt'

def load_unique_lines(filename):
    """Return the set of distinct lines found in ``filename``.

    Every line is stripped of leading/trailing whitespace (including the
    trailing newline) before it is stored, so repeated lines collapse into a
    single entry. A blank line contributes the empty string.
    """
    with open(filename, 'r') as handle:
        return {raw_line.strip() for raw_line in handle}


# Stopwords as a set; used for membership tests when filtering tokens.
stopwords = load_unique_lines(stopwords_path) 

CPU times: user 1.07 ms, sys: 0 ns, total: 1.07 ms
Wall time: 962 µs


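Note: local variables like this one are serialized together with the function (closure) and shipped to the executors automatically when they are referenced in an RDD transformation. For larger objects, an explicit broadcast variable (`sc.broadcast(...)`) avoids re-sending the data with every task.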

## Extract the category + review text

In [4]:
%%time

import json
# Parse every input line as JSON and keep only the (category, reviewText) pair.
category_review_rdd = (
    input_rdd
    .map(json.loads)
    .map(lambda review: (review['category'], review['reviewText']))
)

CPU times: user 1.38 ms, sys: 569 µs, total: 1.95 ms
Wall time: 31 ms


## Compute total number of documents and number of documents per category

We could do something like this:

In [6]:
#%%time

#review_count = category_review_rdd.count()



CPU times: user 690 ms, sys: 121 ms, total: 811 ms
Wall time: 8min


                                                                                

But actually, we can also just count the number of documents by category and then sum up the number of documents per category to get the total number of documents!

Just call `keys()` on the RDD for the review texts and then collect the counts with `countByValue()`:

In [5]:
%%time

# Count the reviews per category; countByValue() returns the counts to the
# driver as a dict keyed by category.
category_counts = category_review_rdd.keys().countByValue()
category_counts



CPU times: user 676 ms, sys: 117 ms, total: 792 ms
Wall time: 7min 40s


                                                                                

defaultdict(int,
            {'Patio_Lawn_and_Garde': 993490,
             'Apps_for_Android': 2638173,
             'Book': 22507155,
             'Toys_and_Game': 2252771,
             'Office_Product': 1243186,
             'Digital_Music': 836006,
             'Sports_and_Outdoor': 3268695,
             'Automotive': 1373768,
             'Beauty': 2023070,
             'Musical_Instrument': 500176,
             'CDs_and_Vinyl': 3749004,
             'Kindle_Store': 3205467,
             'Clothing_Shoes_and_Jewelry': 5748920,
             'Electronic': 7824482,
             'Home_and_Kitche': 4253926,
             'Cell_Phones_and_Accessorie': 3447249,
             'Pet_Supplie': 1235316,
             'Movies_and_TV': 4607047,
             'Baby': 915446,
             'Tools_and_Home_Improvement': 1926047,
             'Grocery_and_Gourmet_Food': 1297156,
             'Health_and_Personal_Care': 2982326})

This dict is really small (one key-value pair for each category) and will easily fit into memory on the datanodes.

In [7]:
# Total number of reviews = sum of the per-category counts
# (avoids running a separate count() job over the whole data set).
review_count = sum(category_counts.values())
review_count

78828876

### Testing alternative approach for obtaining category counts

In [18]:
# Alternative: extract only the category from each JSON line in a single map.
category_rdd = input_rdd.map(
    lambda line: json.loads(line)['category']
)

In [20]:
%%time

# Same per-category count as category_counts, computed from category_rdd
# so that the wall time of the two approaches can be compared.
category_counts_2 = category_rdd.countByValue()
category_counts_2



CPU times: user 600 ms, sys: 97.7 ms, total: 698 ms
Wall time: 6min 50s


                                                                                

defaultdict(int,
            {'Patio_Lawn_and_Garde': 993490,
             'Apps_for_Android': 2638173,
             'Book': 22507155,
             'Toys_and_Game': 2252771,
             'Office_Product': 1243186,
             'Digital_Music': 836006,
             'Sports_and_Outdoor': 3268695,
             'Automotive': 1373768,
             'Beauty': 2023070,
             'Musical_Instrument': 500176,
             'CDs_and_Vinyl': 3749004,
             'Kindle_Store': 3205467,
             'Clothing_Shoes_and_Jewelry': 5748920,
             'Electronic': 7824482,
             'Home_and_Kitche': 4253926,
             'Cell_Phones_and_Accessorie': 3447249,
             'Pet_Supplie': 1235316,
             'Movies_and_TV': 4607047,
             'Baby': 915446,
             'Tools_and_Home_Improvement': 1926047,
             'Grocery_and_Gourmet_Food': 1297156,
             'Health_and_Personal_Care': 2982326})

Hmm, so is this alternative approach faster or was that just a coincidence?

## Obtain number of occurrences of each term by category

In [21]:
%%time

# define pattern for splitting/tokenizing
import re
# A delimiter is any run of one or more characters that are NOT ASCII letters
# or one of the characters < > ^ | ; tokens therefore consist only of those.
pattern = re.compile(r"[^a-zA-Z<>^|]+")

def map_review_data(pair):
    """Emit one ``((term, category), 1)`` record per distinct term of a review.

    ``pair`` is ``(category, review_text)``. The text is split with the
    module-level ``pattern``, lower-cased and de-duplicated, so each term is
    counted at most once per document. Terms shorter than two characters and
    terms contained in the module-level ``stopwords`` are dropped.
    """
    category, review_text = pair
    unique_tokens = {piece.lower() for piece in pattern.split(review_text)}

    emitted = []
    for candidate in unique_tokens:
        if len(candidate) >= 2 and candidate not in stopwords:
            emitted.append(((candidate, category), 1))
    return emitted

def remap(pair):
    """Re-key ``((term, category), count)`` as ``(term, (category, count))``."""
    (term, category), count = pair
    return term, (category, count)

# term -> iterable of (category, number of documents of that category containing the term)
term_cat_occ_rdd = (
    category_review_rdd
    .flatMap(map_review_data)                       # ((term, category), 1) per document
    .reduceByKey(lambda left, right: left + right)  # document count per (term, category)
    .map(remap)                                     # re-key by term only
    .groupByKey()
)

CPU times: user 17.6 ms, sys: 3.99 ms, total: 21.5 ms
Wall time: 83 ms


In [None]:
# DON'T run this when using full dataset - will take A LONG TIME (even though only 3 values are printed)
# term_cat_occ_rdd.take(3)

[Stage 3:===>                                                    (31 + 2) / 435]

## Calculate Chi-square

In [22]:
def calculate_chi_square(pair):
    """Compute the chi-square statistic of one term for each category it occurs in.

    Args:
        pair: ``(term, term_counts_for_categories)``, where the second element
            is an iterable of ``(category, count)`` pairs and ``count`` is the
            number of documents of that category which contain the term.

    Returns:
        A list of ``(category, (term, chi_square))`` tuples, one per category
        present in the input.

    Reads the module-level ``category_counts`` (documents per category) and
    ``review_count`` (total number of documents).
    """
    term, term_counts_for_categories = pair

    # no. of documents containing the term, per category
    doc_count_for_cat = dict(term_counts_for_categories)
    # total number of documents containing the term
    total_doc_count_for_term = sum(doc_count_for_cat.values())

    term_and_cat_chi_squared = []

    for category, count in doc_count_for_cat.items():
        a = count  # number of documents in c which contain t
        b = total_doc_count_for_term - a  # number of documents not in c which contain t
        total_doc_count_for_cat = category_counts[category]  # total no. of documents for current category
        c = total_doc_count_for_cat - a  # number of documents in c without t
        d = review_count - a - b - c  # number of documents not in c without t

        denominator = (a + b) * (a + c) * (b + d) * (c + d)
        if denominator == 0:
            # A zero marginal (e.g. the term occurs in every document, or there
            # is only a single category) leaves the statistic undefined; report
            # 0.0 instead of raising ZeroDivisionError inside the Spark job.
            chi_square = 0.0
        else:
            chi_square = review_count * (a * d - b * c) ** 2 / denominator

        term_and_cat_chi_squared.append((category, (term, chi_square)))
    return term_and_cat_chi_squared

In [23]:
# quick test: made-up document counts for the first three known categories
cat1, cat2, cat3 = list(category_counts)[:3]
pair = ('term', [(cat1, 10), (cat2, 5), (cat3, 3)])
calculate_chi_square(pair)

[('Patio_Lawn_and_Garde', ('term', 426.4088951575854)),
 ('Apps_for_Android', ('term', 33.21413580372739)),
 ('Book', ('term', 1.246417097859693))]

In [24]:
%%time

# Compute the chi-squared value for each unique term and category pair and
# collect them per category:
# category -> iterable of (term, chi-square)
term_cat_chi_squared_rdd = (
    term_cat_occ_rdd
    .flatMap(calculate_chi_square)
    .groupByKey()
)

CPU times: user 6.49 ms, sys: 1.38 ms, total: 7.87 ms
Wall time: 22.8 ms


In [25]:
%%time

# Perform top K query for each category
# (topK is read by get_top_terms when the terms of a category are ranked)
topK = 75  # Number of terms to retrieve per category

def get_top_terms(pair, k=None):
    """Return the highest-scoring terms of one category.

    Args:
        pair: ``(category, terms)`` where ``terms`` is an iterable of
            ``(term, chi_square)`` tuples.
        k: number of terms to keep; defaults to the module-level ``topK``.

    Returns:
        ``(category, top_terms)`` with ``top_terms`` a list of at most ``k``
        ``(term, chi_square)`` tuples in descending order of chi-square.
    """
    import heapq  # local import so the function is self-contained when shipped to executors

    category, terms = pair
    limit = topK if k is None else k
    # nlargest is documented as equivalent to sorted(..., reverse=True)[:limit]
    # but does not materialize and sort a category's full term list.
    top_terms = heapq.nlargest(limit, terms, key=lambda term_score: term_score[1])
    return category, top_terms

# Rank the terms of every category, sort the records by category key and
# collect them on the driver.
results = (
    term_cat_chi_squared_rdd
    .map(get_top_terms)
    .sortByKey()
    .collect()
)



CPU times: user 1.92 s, sys: 470 ms, total: 2.39 s
Wall time: 1h 19min 58s


                                                                                

In [26]:
# Sanity check: number of (category, top terms) records collected.
len(results)

22

In [27]:
# Number of categories seen in the input; should equal len(results).
len(category_counts)

22

In [28]:
# Inspect the first record: (category, [(term, chi_square), ...]).
results[0]

('Apps_for_Android',
 [('games', 2537037.1055596066),
  ('play', 2247113.8091989527),
  ('graphics', 1713265.4482352666),
  ('kindle', 1623429.5418573343),
  ('addicting', 1242808.2184094565),
  ('addictive', 1035936.0441975455),
  ('challenging', 992713.5393625504),
  ('coins', 955141.6796828125),
  ('fire', 873237.7713834605),
  ('playing', 858860.1408184121),
  ('puzzles', 685415.5497323661),
  ('apps', 581592.8839890066),
  ('levels', 524737.622313832),
  ('free', 501001.8215785628),
  ('download', 468874.6800539256),
  ('downloaded', 401933.9544030182),
  ('ads', 391326.5745358066),
  ('bingo', 326777.76631461567),
  ('gameplay', 292273.0536111694),
  ('flappy', 278066.6912266814),
  ('uninstalled', 268092.42706600856),
  ('facebook', 244862.80021059277),
  ('mahjong', 224833.17233451147),
  ('solitaire', 208979.84293331063),
  ('puzzle', 202466.74910689172),
  ('played', 195723.39605319453),
  ('faotd', 192946.7262648941),
  ('waster', 191535.25318696845),
  ('deleted', 177813.00

Looks legit!?

## Obtain unique tokens (ordered alphabetically)

In [29]:
%%time

# The keys of term_cat_occ_rdd are already unique because the RDD is the
# output of groupByKey() (one record per term), so no distinct() pass is
# needed before sorting; dropping it saves a shuffle.
tokens = term_cat_occ_rdd.keys().sortBy(lambda term: term).collect()

                                                                                

CPU times: user 1.08 s, sys: 642 ms, total: 1.72 s
Wall time: 1min 26s


## Write results to local file

In [30]:
# format of each output line: "<category> term1:chi_squared1 term2:chi_squared2 ... term75:chi_squared75"
# finally, append the list of tokens to the end of the file
# The file is opened in "w" mode so that re-running this cell replaces the
# previous output instead of appending a second copy of all results to it.
with open(f"output_rdd{'_full_dataset' if review_count > 7000000 else ''}.txt", "w") as file:
    for category, top_terms in results:
        # every entry (including the last one) is followed by a single space
        entries = "".join("%s:%f " % (token, chi_square) for token, chi_square in top_terms)
        file.write("<%s> " % category + entries + "\n")
    file.write(" ".join(tokens) + "\n")

In [31]:
# Stop the SparkContext now that all results have been collected and written.
sc.stop()
