In [1]:
import os
import emoji
import contractions
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, FloatType, LongType, BooleanType
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, LemmatizerModel
from pyspark.ml.feature import StopWordsRemover

# --- CONFIGURATION -----------------------------------------------------------
# One Spark session for the whole notebook: 8g heap for driver and executors,
# 2g executor overhead, 2g off-heap memory, Arrow enabled, and the Spark NLP
# package resolved through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("Amazon_Reviews_Dual_ETL")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:6.3.0")
    .getOrCreate()
)

# Raise the shuffle partition count to 2000 so each shuffle task works on a
# smaller slice of data (intended to prevent disk spill and speed up the job).
spark.conf.set("spark.sql.shuffle.partitions", "2000")

# Input glob (every *.json under BASE_DIR) and the Parquet output location.
BASE_DIR = '/home/ubuntu/data/aaucr/'
json_path = os.path.join(BASE_DIR, '*.json')
output_path = 'reviews_final_parquet'

In [2]:
# Slang / abbreviation -> expansion table used by `replace_logic`.
#
# Keys MUST be lower-case: the lookup lower-cases each token before consulting
# this table, so a key containing capitals can never match.
# Keys may contain punctuation ("ft.", "w/", "tl;dr"); they match only when the
# whole whitespace-delimited token equals the key.
abbreviations = {
    "$": " dollar ",
    "€": " euro ",
    "4ao": "for adults only",
    "a.m": "before midday",
    "a3": "anytime anywhere anyplace",
    "aamof": "as a matter of fact",
    "acct": "account",
    "adih": "another day in hell",
    "afaic": "as far as i am concerned",
    "afaict": "as far as i can tell",
    "afaik": "as far as i know",
    "afair": "as far as i remember",
    "afk": "away from keyboard",
    "app": "application",
    "approx": "approximately",
    "apps": "applications",
    "asap": "as soon as possible",
    "asl": "age, sex, location",
    "atk": "at the keyboard",
    "ave.": "avenue",
    "aymm": "are you my mother",
    "ayor": "at your own risk",
    "b&b": "bed and breakfast",
    "b+b": "bed and breakfast",
    "b.c": "before christ",
    "b2b": "business to business",
    "b2c": "business to customer",
    "b4": "before",
    "b4n": "bye for now",
    "b@u": "back at you",
    "bae": "before anyone else",
    "bak": "back at keyboard",
    "bbbg": "bye bye be good",
    "bbc": "british broadcasting corporation",
    "bbias": "be back in a second",
    "bbl": "be back later",
    "bbs": "be back soon",
    "be4": "before",
    "bfn": "bye for now",
    "blvd": "boulevard",
    "bout": "about",
    "brb": "be right back",
    "bros": "brothers",
    "brt": "be right there",
    "bsaaw": "big smile and a wink",
    "btw": "by the way",
    "bwl": "bursting with laughter",
    "c/o": "care of",
    "cet": "central european time",
    "cf": "compare",
    "cia": "central intelligence agency",
    "csl": "can not stop laughing",
    "cu": "see you",
    "cul8r": "see you later",
    "cv": "curriculum vitae",
    "cwot": "complete waste of time",
    "cya": "see you",
    "cyt": "see you tomorrow",
    "dae": "does anyone else",
    "dbmib": "do not bother me i am busy",
    "diy": "do it yourself",
    "dm": "direct message",
    "dwh": "during work hours",
    "e123": "easy as one two three",
    "eet": "eastern european time",
    "eg": "example",
    "embm": "early morning business meeting",
    "encl": "enclosed",
    "encl.": "enclosed",
    "etc": "and so on",
    "faq": "frequently asked questions",
    "fawc": "for anyone who cares",
    "fb": "facebook",
    "fc": "fingers crossed",
    "fig": "figure",
    "fimh": "forever in my heart",
    "ft.": "feet",
    "ft": "featuring",
    "ftl": "for the loss",
    "ftw": "for the win",
    "fwiw": "for what it is worth",
    "fyi": "for your information",
    "g9": "genius",
    "gahoy": "get a hold of yourself",
    "gal": "get a life",
    "gcse": "general certificate of secondary education",
    "gfn": "gone for now",
    "gg": "good game",
    "gl": "good luck",
    "glhf": "good luck have fun",
    "gmt": "greenwich mean time",
    "gmta": "great minds think alike",
    "gn": "good night",
    "g.o.a.t": "greatest of all time",
    "goat": "greatest of all time",
    "goi": "get over it",
    "gps": "global positioning system",
    "gr8": "great",
    "gratz": "congratulations",
    "gyal": "girl",
    "h&c": "hot and cold",
    "hp": "horsepower",
    "hr": "hour",
    "hrh": "his royal highness",
    "ht": "height",
    "ibrb": "i will be right back",
    "ic": "i see",
    "icq": "i seek you",
    "icymi": "in case you missed it",
    "idc": "i do not care",
    "idgadf": "i do not give a damn fuck",
    "idgaf": "i do not give a fuck",
    "idk": "i do not know",
    "ie": "that is",
    "i.e": "that is",
    "ifyp": "i feel your pain",
    "ig": "instagram",  # was "IG": unreachable because lookups are lower-cased
    "iirc": "if i remember correctly",
    "ilu": "i love you",
    "ily": "i love you",
    "imho": "in my humble opinion",
    "imo": "in my opinion",
    "imu": "i miss you",
    "iow": "in other words",
    "irl": "in real life",
    "j4f": "just for fun",
    "jic": "just in case",
    "jk": "just kidding",
    "jsyk": "just so you know",
    "l8r": "later",
    "lb": "pound",
    "lbs": "pounds",
    "ldr": "long distance relationship",
    "lmao": "laugh my ass off",
    "lmfao": "laugh my fucking ass off",
    "lol": "laughing out loud",
    "ltd": "limited",
    "ltns": "long time no see",
    "m8": "mate",
    "mf": "motherfucker",
    "mfs": "motherfuckers",
    "mfw": "my face when",
    "mofo": "motherfucker",
    "mph": "miles per hour",
    "mr": "mister",
    "mrw": "my reaction when",
    "ms": "miss",
    "mte": "my thoughts exactly",
    "nagi": "not a good idea",
    "nbc": "national broadcasting company",
    "nbd": "not big deal",
    "nfs": "not for sale",
    "ngl": "not going to lie",
    "nhs": "national health service",
    "nrn": "no reply necessary",
    "nsfl": "not safe for life",
    "nsfw": "not safe for work",
    "nth": "nice to have",
    "nvr": "never",
    "nyc": "new york city",
    "oc": "original content",
    "og": "original",
    "ohp": "overhead projector",
    "oic": "oh i see",
    "omdb": "over my dead body",
    "omg": "oh my god",
    "omw": "on my way",
    "p.a": "per annum",
    "p.m": "after midday",
    "pm": "prime minister",
    "poc": "people of color",
    "pov": "point of view",
    "pp": "pages",
    "ppl": "people",
    "prw": "parents are watching",
    "ps": "postscript",
    "pt": "point",
    "ptb": "please text back",
    "pto": "please turn over",
    "qpsa": "what happens",
    "ratchet": "rude",
    "rbtl": "read between the lines",
    "rlrt": "real life retweet",
    "rofl": "rolling on the floor laughing",
    "roflol": "rolling on the floor laughing out loud",
    "rotflmao": "rolling on the floor laughing my ass off",
    "rt": "retweet",
    "ruok": "are you ok",
    "sfw": "safe for work",
    "sk8": "skate",
    "smh": "shake my head",
    "sq": "square",
    "srsly": "seriously",
    "ssdd": "same stuff different day",
    "tbh": "to be honest",
    "tbs": "tablespoonful",   # spelling fixed (was "tablespooful")
    "tbsp": "tablespoonful",  # spelling fixed (was "tablespooful")
    "tfw": "that feeling when",
    "thks": "thank you",
    "tho": "though",
    "thx": "thank you",
    "tia": "thanks in advance",
    "til": "today i learned",
    "tl;dr": "too long i did not read",
    "tldr": "too long i did not read",
    "tmb": "tweet me back",
    "tntl": "trying not to laugh",
    "ttyl": "talk to you later",
    "u": "you",
    "u2": "you too",
    "u4e": "yours for ever",
    "utc": "coordinated universal time",
    "w/": "with",
    "w/o": "without",
    "w8": "wait",
    "wassup": "what is up",
    "wb": "welcome back",
    "wtf": "what the fuck",
    "gtfo": "get the fuck out",
    "wtg": "way to go",
    "wtpa": "where the party at",
    "wuf": "where are you from",
    "wuzup": "what is up",
    "wywh": "wish you were here",
    "yd": "yard",
    "ygtr": "you got that right",
    "ynk": "you never know",
    "zzz": "sleeping bored and tired"
}

In [None]:
# Broadcast the abbreviation table so the Python UDFs can read it through
# `broadcast_abbr.value` instead of capturing the dict in their closure.
broadcast_abbr = spark.sparkContext.broadcast(abbreviations)

# Spark ML's default English stop words, held as a set for membership tests,
# and broadcast the same way.
stop_words_list = {*StopWordsRemover.loadDefaultStopWords("english")}
broadcast_stops = spark.sparkContext.broadcast(stop_words_list)

def replace_logic(text):
    """Expand emojis, abbreviations and contractions in one review string.

    Steps, in order:
      1. ``emoji.demojize`` rewrites emoji characters as text names.
      2. Every whitespace-separated token is looked up, lower-cased, in the
         broadcast abbreviation table. An exact match wins (so keys that
         contain punctuation, e.g. ``"ft."`` or ``"w/"``, keep working).
         Otherwise the token is retried with surrounding punctuation removed,
         so ``"lol!"``, ``"btw,"`` and ``"(imo)"`` are expanded too; the
         stripped punctuation is put back around the expansion.
      3. ``contractions.fix`` expands contractions in the re-joined text.

    Args:
        text: Review text; may be ``None`` or empty.

    Returns:
        str: The expanded text (tokens joined by single spaces), or ``""``
        when ``text`` is falsy.
    """
    if not text:
        return ""

    text = emoji.demojize(text)
    table = broadcast_abbr.value
    # ':' is deliberately NOT stripped: demojized tokens keep their ':name:'
    # form and are therefore never mistaken for abbreviations.
    edge_punct = ".,!?;\"'()[]{}"

    expanded = []
    for token in text.split():
        exact = table.get(token.lower())
        if exact is not None:
            expanded.append(exact)
            continue

        core = token.strip(edge_punct)
        replacement = None
        if core and core != token:
            replacement = table.get(core.lower())
        if replacement is None:
            expanded.append(token)
            continue

        # Re-attach the punctuation that surrounded the abbreviation.
        lead = len(token) - len(token.lstrip(edge_punct))
        expanded.append(token[:lead] + replacement + token[lead + len(core):])

    return contractions.fix(" ".join(expanded))

def filter_tokens_bilstm(tokens):
    """Prune very short tokens for the Bi-LSTM branch while keeping stop words.

    A token is dropped only when it is at most two characters long AND is not
    in the broadcast stop-word set; stop words are kept for context whatever
    their length.

    Args:
        tokens: Sequence of token strings; may be ``None`` or empty.

    Returns:
        list: The surviving tokens in their original order (``[]`` for falsy
        input).
    """
    if not tokens:
        return []

    stop_words = broadcast_stops.value
    kept = []
    for token in tokens:
        if token not in stop_words and len(token) <= 2:
            continue  # short junk that is not a stop word
        kept.append(token)
    return kept

# Expose the Python helpers to Spark SQL as UDFs with explicit return types:
# a string for the text cleaner, an array of strings for the token filter.
replace_udf = F.udf(f=replace_logic, returnType=StringType())
filter_bilstm_udf = F.udf(
    f=filter_tokens_bilstm,
    returnType=ArrayType(StringType()),
)

# Explicit schema handed to the JSON reader; every field is nullable.
# `vote` is read as a string (it is cleaned and cast to int later on).
custom_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("overall", FloatType()),
        ("vote", StringType()),
        ("verified", BooleanType()),
        ("reviewerID", StringType()),
        ("asin", StringType()),
        ("reviewText", StringType()),
        ("summary", StringType()),
        ("unixReviewTime", LongType()),
    )
])

In [None]:
def _filter_min_occurrences(df, key_col, min_count):
    """Keep the rows of `df` whose `key_col` value occurs in at least `min_count` rows."""
    frequent_keys = df.groupBy(key_col).count().filter(F.col("count") >= min_count)
    # left_semi keeps only the left-hand columns, so `count` is not added.
    return df.join(frequent_keys, key_col, "left_semi")


def _clean_review_text(df, col_name):
    """Reduce `col_name` to lower-case letters separated by single spaces."""
    df = df.withColumn(col_name, F.lower(F.col(col_name)))
    df = df.withColumn(col_name, F.regexp_replace(F.col(col_name), r"<[^>]*>", " "))  # HTML
    df = df.withColumn(col_name, F.regexp_replace(F.col(col_name), r"http\S+|www\S+", " "))  # URL
    df = df.withColumn(col_name, F.regexp_replace(F.col(col_name), r"\n", " "))  # Newline

    # Abbr + Contractions (Python UDF; also turns emojis into text names).
    df = df.withColumn(col_name, replace_udf(F.col(col_name)))

    # Lower-case again: demojized emoji names can contain capitals
    # (e.g. ":OK_hand:"), which would otherwise leak mixed-case tokens into
    # the case-sensitive stop-word check and the downstream vocabulary.
    df = df.withColumn(col_name, F.lower(F.col(col_name)))

    # Keep only letters and whitespace, then collapse whitespace runs.
    df = df.withColumn(col_name, F.regexp_replace(F.col(col_name), r"[^A-Za-z\s]", " "))
    return df.withColumn(col_name, F.trim(F.regexp_replace(F.col(col_name), r"\s+", " ")))


def _add_eda_tokens(df, col_name):
    """Add `review_tokens_nostop`: space-split tokens of `col_name` minus stop words and empties."""
    df = df.withColumn("temp_tokens", F.split(F.col(col_name), " "))
    remover = StopWordsRemover(inputCol="temp_tokens", outputCol="review_tokens_nostop")
    df = remover.transform(df)
    df = df.withColumn("review_tokens_nostop", F.expr("filter(review_tokens_nostop, x -> x != '')"))
    return df.drop("temp_tokens")


def _add_lemmatized_tokens(df, col_name):
    """Add `lemmatized_tokens`: Spark NLP lemmas of `col_name` passed through `filter_bilstm_udf`."""
    document_assembler = DocumentAssembler() \
        .setInputCol(col_name) \
        .setOutputCol("document")

    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Pretrained model; downloaded on first use.
    lemmatizer = LemmatizerModel.pretrained("lemma_antbnc", "en") \
        .setInputCols(["token"]) \
        .setOutputCol("lemma")

    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setOutputCols(["raw_lemmas"]) \
        .setCleanAnnotations(True)

    nlp_pipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])
    df = nlp_pipeline.fit(df).transform(df)

    df = df.withColumn("lemmatized_tokens", filter_bilstm_udf(F.col("raw_lemmas")))
    # Drop intermediate columns; names that no longer exist are ignored by drop().
    return df.drop("document", "token", "lemma", "raw_lemmas")


def process_and_save(min_reviews=5, input_partitions=4000, output_partitions=200):
    """Run the review ETL: load JSON, filter, clean text, tokenise, write Parquet.

    Pipeline:
      1. Read every file matching `json_path` with `custom_schema`.
      2. Repartition the raw rows into `input_partitions` partitions.
      3. Keep items (`asin`) with at least `min_reviews` rows, then reviewers
         (`reviewerID`) with at least `min_reviews` of the remaining rows.
         This is a single pass, not iterated to a fixed point.
      4. Derive `reviewDate` and `voteCleaned`; drop null/empty `reviewText`.
      5. Clean `reviewText` in place (see `_clean_review_text`).
      6. Branch A: `review_tokens_nostop` (stop words removed, for EDA).
      7. Branch B: `lemmatized_tokens` (lemmas, stop words kept, for Bi-LSTM).
      8. Overwrite `output_path` with the result as Parquet.

    Args:
        min_reviews: Minimum row count an item / reviewer needs to be kept.
        input_partitions: Partition count applied to the raw input.
        output_partitions: Partition count coalesced to before writing.

    Returns:
        None. The result is written to `output_path`.
    """
    print(f"--- Loading data from {json_path} ---")
    df = spark.read.schema(custom_schema).json(json_path)

    files_read = df.inputFiles()
    print(f"--- Detected {len(files_read)} JSON files: ---")
    for fname in files_read:
        print(fname)

    print("--- Repartitioning Raw Data (Fixing Input Skew) ---")
    df = df.repartition(input_partitions)

    print(f"--- Applying {min_reviews}-core Filtering ---")
    df = _filter_min_occurrences(df, "asin", min_reviews)
    df = _filter_min_occurrences(df, "reviewerID", min_reviews)

    # Basic Conversions
    df = df.withColumn("reviewDate", F.from_unixtime(F.col("unixReviewTime")).cast("timestamp"))
    # `vote` arrives as a string that may contain thousands separators; missing -> 0.
    df = df.withColumn("voteCleaned", F.regexp_replace(F.col("vote"), ",", "").cast("int")).fillna(0, ["voteCleaned"])
    df = df.filter((F.col("reviewText").isNotNull()) & (F.col("reviewText") != ""))

    print("--- Cleaning Text (Regex, Abbreviations, Contractions) ---")
    col_name = "reviewText"
    df = _clean_review_text(df, col_name)

    # --- BRANCH A: CREATE EDA COLUMN (No Stop Words) ---
    print("--- Generating EDA Column (Removing Stop Words) ---")
    df = _add_eda_tokens(df, col_name)

    # --- BRANCH B: CREATE BI-LSTM COLUMN (Lemmatized + Context Kept) ---
    print("--- Generating Bi-LSTM Column (Lemmatizing) ---")
    df = _add_lemmatized_tokens(df, col_name)

    print(f"--- Writing Final Data to {output_path} ---")

    # coalesce() creates fewer, larger files, which is much safer for HDFS.
    # NOTE(review): coalesce merges partitions without a shuffle, so the final
    # stage may run with only `output_partitions` tasks — confirm acceptable.
    df.coalesce(output_partitions).write.mode("overwrite").parquet(output_path)
    print("Success! Data saved.")

# Run the full ETL. The write uses mode("overwrite"), so any existing data at
# `output_path` is replaced.
process_and_save()

--- Loading data from /home/ubuntu/data/aaucr/*.json ---
--- Detected 29 JSON files: ---
hdfs://master:9000/home/ubuntu/data/aaucr/Gift_Cards_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Video_Games_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Toys_and_Games_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Kindle_Store_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Movies_and_TV_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Sports_and_Outdoors_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Pet_Supplies_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/All_Beauty_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Home_and_Kitchen_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/CDs_and_Vinyl_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Electronics_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Cell_Phones_and_Accessories_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Patio_Lawn_and_Garden_5.json
hdfs://master:9000/home/ubuntu/data/aaucr/Luxury_Beauty_5.json
hdfs://ma

26/01/15 13:07:42 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
26/01/15 13:07:42 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[ / ]



[ — ]

                                                                                

[OK!]




--- Writing Final Data to reviews_final_parquet ---


[Stage 2:>(119 + 6) / 371][Stage 3:>  (0 + 0) / 371][Stage 4:>  (0 + 0) / 371]1]