# Part 1: RDDs

Set up Spark

In [45]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("ChiSquaredRDD")
    .getOrCreate()
)
# The RDD API used throughout this part is reached via the SparkContext.
sc = spark.sparkContext

## Load reviews as RDD

In [3]:
# Alternative inputs are kept commented out; enable exactly one review_path.
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
# review_path = "hdfs:///user/e11809642/reviews/reduced_devset.json"
# Each RDD element is one line of the input file (parsed as JSON further below).
input_rdd = sc.textFile(review_path)

## Obtain stopwords

Load stopwords into (local) memory (Note: file contains duplicates, so convert to set)

In [4]:
# Relative path: opened with plain open(), i.e. on the local (driver) filesystem.
stopwords_path = 'stopwords.txt'

def load_unique_lines(filename):
    """Read a text file and return its distinct lines as a set.

    Leading/trailing whitespace (including the newline) is stripped from every
    line before it is stored, so repeated lines collapse into a single entry.

    Args:
        filename: Path of the text file to read.

    Returns:
        A set containing each stripped line once.
    """
    with open(filename, 'r') as handle:
        return {raw_line.strip() for raw_line in handle}


# Kept as a set so the membership test during tokenization is a hash lookup.
stopwords = load_unique_lines(stopwords_path) 

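Note: local variables like this one are captured in the closure of any RDD transformation that uses them and are shipped to the executors automatically with each task. For large read-only data, an explicit broadcast variable (`sc.broadcast`) avoids serializing it repeatedly.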

## Parse JSON strings, extract the category + review text

In [5]:
import json
# Parse every JSON line and keep only the (category, review text) pair.
category_review_rdd = (
    input_rdd
    .map(json.loads)
    .map(lambda review: (review['category'], review['reviewText']))
)

## Compute total number of documents

In [6]:
# count() is an action: it runs the read + parse job and returns the total to the driver.
review_count = category_review_rdd.count()

                                                                                

## Compute number of documents per category

Define the RDD with the required transformations (nothing is computed yet, because transformations are lazy).

In [7]:
# Emit (category, 1) for every review, then sum the ones per category.
category_counts_rdd = (
    category_review_rdd
    .map(lambda category_and_text: (category_and_text[0], 1))
    .reduceByKey(lambda left, right: left + right)
)

Next, collect the number of documents per category (values of the RDD above) into a local dictionary. This dict is really small (one key-value pair for each category) and will easily fit into memory on the datanodes.

In [8]:
# Bring the (category, document count) pairs to the driver as a plain dict.
category_counts = category_counts_rdd.collectAsMap()

                                                                                

In [9]:
# Spot check: number of documents recorded for one category.
category_counts['Musical_Instrument']

500

## Obtain number of occurrences of each term by category

In [72]:
# Tokenizer pattern: a delimiter is any run of characters that are neither ASCII
# letters nor one of < > ^ |, so digits, whitespace and punctuation all split.
import re
pattern = re.compile(r"[^a-zA-Z<>^|]+")

def map_review_data(pair):
    """Emit one ((term, category), 1) record per distinct term of a review.

    The review text is split with the module-level ``pattern``, lower-cased and
    de-duplicated, so a term counts at most once per review. Terms found in the
    module-level ``stopwords`` or shorter than two characters are dropped.

    Args:
        pair: ``(category, review_text)`` tuple.

    Returns:
        A list of ``((term, category), 1)`` tuples in no particular order.
    """
    category, review_text = pair
    # Lower-case before de-duplicating so "Cat" and "cat" are the same term.
    unique_tokens = {raw.lower() for raw in pattern.split(review_text)}
    records = []
    for term in unique_tokens:
        if len(term) < 2 or term in stopwords:
            continue
        records.append(((term, category), 1))
    return records

def remap(pair):
    """Re-key a ((term, category), count) record as (term, (category, count))."""
    (term, category), occurrences = pair
    return term, (category, occurrences)

# Pipeline: count the reviews per (term, category), then gather all
# (category, count) pairs of a term under that term.
term_cat_occ_rdd = (
    category_review_rdd
    .flatMap(map_review_data)
    .reduceByKey(lambda left, right: left + right)
    # The re-keying step is needed: the summation above must be keyed by
    # (term, category), while the grouping below must be keyed by term only.
    .map(remap)
    .groupByKey()
)

In [73]:
# Peek at a few records; the grouped values are shown as ResultIterable objects.
term_cat_occ_rdd.take(3)

                                                                                

[('insight', <pyspark.resultiterable.ResultIterable at 0x7f03f35bd9d0>),
 ('things', <pyspark.resultiterable.ResultIterable at 0x7f03f35bde20>),
 ('raichlen', <pyspark.resultiterable.ResultIterable at 0x7f03f35bd250>)]

## Compute the number of occurrences of each term across all reviews

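Not strictly required: no later cell in this part uses `term_occ_rdd`, because `calculate_chi_square` derives the per-term totals itself.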

In [12]:
# Total number of reviews containing each term, over all categories.
# term_cat_occ_rdd holds (term, iterable of (category, count)) records, so the
# per-category counts of a term are summed inside its own record. (Indexing
# pair[0][0] would pick the first character of the term, and adding the grouped
# iterables together is not supported.)
term_occ_rdd = term_cat_occ_rdd.mapValues(
    lambda cat_counts: sum(count for _, count in cat_counts)
)

## Calculate Chi-square

In [82]:
def calculate_chi_square(pair):
    """Compute the chi-square score of one term for every category it occurs in.

    Reads the module-level ``category_counts`` (category -> number of
    documents) and ``review_count`` (total number of documents).

    Args:
        pair: ``(term, term_counts_for_categories)`` where the second element
            is an iterable of ``(category, count)`` tuples and ``count`` is the
            number of documents of that category which contain the term.

    Returns:
        A list of ``(category, (term, chi_square))`` tuples, one per category
        in the input. The score is ``0.0`` when the chi-square denominator is
        zero (e.g. the term occurs in every document, or all documents belong
        to one category), since the statistic is undefined there and the term
        cannot discriminate between categories.
    """
    term, term_counts_for_categories = pair

    # category -> number of documents of that category containing the term
    doc_count_for_cat = dict(term_counts_for_categories)
    # total number of documents containing the term
    total_doc_count_for_term = sum(doc_count_for_cat.values())

    term_and_cat_chi_squared = []
    for category, count in doc_count_for_cat.items():
        a = count  # documents in the category that contain the term
        b = total_doc_count_for_term - a  # documents outside the category that contain the term
        c = category_counts[category] - a  # documents in the category without the term
        d = review_count - a - b - c  # documents outside the category without the term

        denominator = (a + b) * (a + c) * (b + d) * (c + d)
        if denominator == 0:
            # A marginal total is zero: avoid ZeroDivisionError on the executor.
            chi_square = 0.0
        else:
            chi_square = review_count * (a * d - b * c) ** 2 / denominator

        term_and_cat_chi_squared.append((category, (term, chi_square)))
    return term_and_cat_chi_squared

In [83]:
# Smoke test on hand-made counts for the first three known categories.
cat1, cat2, cat3 = list(category_counts)[:3]
pair = ('term', [(cat1, 10), (cat2, 5), (cat3, 3)])
calculate_chi_square(pair)

[('Apps_for_Android', ('term', 151.725015831567)),
 ('Book', ('term', 0.00528584930912985)),
 ('Toys_and_Game', ('term', 12.364825215808038))]

In [96]:
# Score every (term, category) combination, then collect the scores by category.
# Resulting records: (category, iterable of (term, chi_square)).
term_cat_chi_squared_rdd = (
    term_cat_occ_rdd
    .flatMap(calculate_chi_square)
    .groupByKey()
)

In [97]:
# Peek at a few records: one (category, grouped scores) pair per category.
term_cat_chi_squared_rdd.take(3)

                                                                                

[('Apps_for_Android',
  <pyspark.resultiterable.ResultIterable at 0x7f03f3654f70>),
 ('Book', <pyspark.resultiterable.ResultIterable at 0x7f03f36465b0>),
 ('Toys_and_Game', <pyspark.resultiterable.ResultIterable at 0x7f03f5a5f490>)]

In [114]:
# Top-K query per category: keep only the K terms with the highest chi-square score.
topK = 75  # Number of terms to retrieve per category

def get_top_terms(pair):
    """Keep the ``topK`` highest-scoring terms of one category.

    Args:
        pair: ``(category, terms)`` where ``terms`` is an iterable of
            ``(term, chi_square)`` tuples.

    Returns:
        ``(category, top_terms)`` with ``top_terms`` a list of at most ``topK``
        ``(term, chi_square)`` tuples in descending score order.
    """
    category, terms = pair
    ranked = list(terms)
    ranked.sort(key=lambda term_score: term_score[1], reverse=True)
    return category, ranked[:topK]

# Rank the terms within each category, order the categories alphabetically
# and fetch the (small) final result to the driver.
results = (
    term_cat_chi_squared_rdd
    .map(get_top_terms)
    .sortByKey()
    .collect()
)

In [115]:
# Sanity check: there should be one result entry per category.
len(results)

22

In [116]:
# Number of categories, for comparison with len(results).
len(category_counts)

22

In [117]:
# Inspect the ranked terms of the alphabetically first category.
results[0]

('Apps_for_Android',
 [('games', 3081.1493374842926),
  ('play', 2158.3694068201294),
  ('graphics', 1505.5108977351497),
  ('kindle', 1470.820942569012),
  ('addictive', 1311.905562727777),
  ('challenging', 1038.1284558527927),
  ('coins', 1002.6647889526382),
  ('addicting', 990.8441134974868),
  ('fire', 956.1470053110605),
  ('levels', 825.3813282736016),
  ('playing', 692.9340396014182),
  ('ads', 642.3969794099202),
  ('puzzles', 596.7716753070063),
  ('apps', 548.7810653104153),
  ('free', 500.9884786241356),
  ('bingo', 409.2358492981346),
  ('mahjong', 322.00891943980963),
  ('download', 303.8649278202287),
  ('faotd', 288.8577201586641),
  ('facebook', 282.51705437029005),
  ('downloaded', 262.77022492215735),
  ('hints', 242.61029019440056),
  ('solitaire', 211.6429957838186),
  ('android', 211.58105849598613),
  ('puzzle', 198.85582217352504),
  ('gameplay', 198.5123356770461),
  ('freezes', 189.67737127837006),
  ('unlock', 185.7521008338788),
  ('played', 180.39650447458

Looks legit!?

## Obtain unique tokens (ordered alphabetically)

In [120]:
# term_cat_occ_rdd is the output of groupByKey(), so every term already occurs
# exactly once as a key; an extra distinct() would only add another shuffle.
tokens = term_cat_occ_rdd.keys().sortBy(lambda term: term).collect()

## Write results to local file

In [121]:
# Format of each category line: "<category> term1:chi_squared1 term2:chi_squared2 ... term75:chi_squared75"
# The final line of the file holds all unique tokens, separated by spaces.
# The file is opened in "w" mode so that re-running this cell overwrites the
# previous output instead of appending a second copy of the results to it.
with open("chi_square.txt", "w") as file:
    for category, top_terms in results:
        file.write("<%s>" % category + " ")
        for token, chi_square in top_terms:
            file.write("%s:%f" % (token, chi_square) + " ")
        file.write("\n")
    file.write(" ".join(tokens) + "\n")

In [122]:
# Shut down the SparkContext; RDDs defined above cannot be evaluated afterwards.
sc.stop()
