### 2. Spark – Language Models - in Spark - 50 points

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lower, regexp_replace, split, sum, hour, when, count, rank, concat_ws, struct, collect_list
from pyspark.ml.feature import Tokenizer, NGram
from pyspark.sql.window import Window


In [2]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Language Models with Trigrams").getOrCreate()

# Load the text data
text_files = ["data/20-01.txt", "data/20-02.txt", "data/20-03.txt", "data/20-04.txt", "data/20-05.txt"]
df_text = spark.read.text(text_files)

# Prepare the text data
df_cleaned = df_text.select(lower(regexp_replace(col("value"), "[^0-9a-zA-Z]+", " ")).alias("text"))
df_cleaned

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/04 13:08:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[text: string]

In [3]:
# Tokenize the text
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df_words = tokenizer.transform(df_cleaned)

# Generate trigrams
ngram = NGram(n=3, inputCol="words", outputCol="trigrams")
df_trigrams = ngram.transform(df_words)

In [4]:
# Explode trigrams into separate rows and count occurrences
df_trigram_counts = df_trigrams.select(explode(col("trigrams")).alias("trigram")).groupBy("trigram").count()

# Get the top 10 trigrams
top_trigrams = df_trigram_counts.orderBy(col("count").desc()).limit(10)
top_trigrams.show(truncate=False)

24/11/04 13:08:39 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

+------------------+-----+
|trigram           |count|
+------------------+-----+
|lt p gt           |1928 |
|the covid 19      |1718 |
|do n t            |1662 |
|of covid 19       |1589 |
|the spread of     |1196 |
|p gt lt           |1037 |
|the number of     |1037 |
|gt lt p           |1023 |
|one of the        |953  |
|of the coronavirus|907  |
+------------------+-----+



In [5]:
# Split trigram into individual words
df_split_trigrams = top_trigrams.withColumn("words", split(col("trigram"), " "))

# Create a window specification
windowSpec = Window.partitionBy(col("words")[0], col("words")[1])

# Compute conditional probabilities
df_probabilities = df_split_trigrams.withColumn("probability", col("count") / sum("count").over(windowSpec))

df_probabilities.select(col("trigram"), col("probability")).show(truncate=False)



+------------------+-----------+
|trigram           |probability|
+------------------+-----------+
|do n t            |1.0        |
|gt lt p           |1.0        |
|lt p gt           |1.0        |
|of covid 19       |1.0        |
|of the coronavirus|1.0        |
|one of the        |1.0        |
|p gt lt           |1.0        |
|the covid 19      |1.0        |
|the number of     |1.0        |
|the spread of     |1.0        |
+------------------+-----------+



                                                                                

### 3. Ranking over Partitions – in Spark. - 50 points

In [6]:
spark = SparkSession.builder.appName("Daypart Item Ranking").getOrCreate()

df_bakery = spark.read.csv('shared/data/Bakery.csv', header=True, inferSchema=True)
df_bakery.show(5)

24/11/04 13:09:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+----------+-------------------+-----------+-------------+
|      Date|               Time|Transaction|         Item|
+----------+-------------------+-----------+-------------+
|2016-10-30|2024-11-04 09:58:11|          1|        Bread|
|2016-10-30|2024-11-04 10:05:34|          2| Scandinavian|
|2016-10-30|2024-11-04 10:05:34|          2| Scandinavian|
|2016-10-30|2024-11-04 10:07:57|          3|Hot chocolate|
|2016-10-30|2024-11-04 10:07:57|          3|          Jam|
+----------+-------------------+-----------+-------------+
only showing top 5 rows



In [7]:
df_bakery = df_bakery.withColumn(
    "Daypart",
    when((hour("Time") >= 6) & (hour("Time") < 11), "morning")
    .when((hour("Time") >= 11) & (hour("Time") < 14), "noon")
    .when((hour("Time") >= 14) & (hour("Time") < 17), "afternoon")
    .otherwise("evening")
)

# Count Items Sold Per Daypart
df_daypart_items = df_bakery.groupBy("Daypart", "Item").agg(count("Item").alias("ItemCount"))
df_daypart_items.show(5)

[Stage 18:>                                                         (0 + 1) / 1]

+-------+----------------+---------+
|Daypart|            Item|ItemCount|
+-------+----------------+---------+
|   noon|    Bare Popcorn|        1|
|   noon|My-5 Fruit Shoot|        7|
|morning|  Jammie Dodgers|       22|
|   noon|Christmas common|        5|
|evening|        Focaccia|        3|
+-------+----------------+---------+
only showing top 5 rows



                                                                                

In [8]:
windowSpec = Window.partitionBy("Daypart").orderBy(col("ItemCount").desc())
# Rank Items Within Each Daypart
df_ranked_items = df_daypart_items.withColumn("Rank", rank().over(windowSpec))
df_ranked_items.show(5)

+---------+--------+---------+----+
|  Daypart|    Item|ItemCount|Rank|
+---------+--------+---------+----+
|afternoon|  Coffee|     1476|   1|
|afternoon|   Bread|      847|   2|
|afternoon|     Tea|      566|   3|
|afternoon|    Cake|      480|   4|
|afternoon|Sandwich|      275|   5|
+---------+--------+---------+----+
only showing top 5 rows



In [9]:
# Filter for Top 3 Items
df_top_items = df_ranked_items.filter(col("Rank") <= 3)

# Format the Output using concat_ws
df_final_output = df_top_items.groupBy("Daypart").agg(concat_ws(", ", collect_list(col("Item"))).alias("TopItems"))

df_final_output.show(truncate=False)

+---------+---------------------+
|Daypart  |TopItems             |
+---------+---------------------+
|afternoon|Coffee, Bread, Tea   |
|evening  |Coffee, Bread, Tea   |
|morning  |Coffee, Bread, Pastry|
|noon     |Coffee, Bread, Tea   |
+---------+---------------------+



### 4. Duplicate Detection with Minhash – 50 points

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, MinHashLSH, CountVectorizer
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName("MinHash LSH for Similarity Detection") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

df = spark.read.json("shared/data/Huffpost.json")

# Show the data structure
df.printSchema()
df.show(5, truncate=False)

24/11/04 13:09:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

root
 |-- authors: string (nullable = true)
 |-- category: string (nullable = true)
 |-- date: string (nullable = true)
 |-- headline: string (nullable = true)
 |-- link: string (nullable = true)
 |-- short_description: string (nullable = true)

+--------------------+---------+----------+-----------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|authors             |category |date      |headline                                                                                 |link                                                                                                                                  |short_description                                        

In [11]:
tokenizer = Tokenizer(inputCol="short_description", outputCol="tokens")
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features")

# Build the pipeline
pipeline = Pipeline(stages=[tokenizer, vectorizer])
model = pipeline.fit(df)
df_transformed = model.transform(df)

                                                                                

In [12]:
minhash = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model_lsh = minhash.fit(df_transformed)
df_lsh = model_lsh.transform(df_transformed)

In [13]:
# Find the feature vector of the base item
base_description = "Kitten Born With Twisted Arms And Legs Finds A Mom Who Knows She\u2019s Perfect"
base_df = spark.createDataFrame([(base_description,)], ["short_description"])
base_transformed = model.transform(base_df)

# Join the base item with the dataset to find similarities
base_hashed = model_lsh.transform(base_transformed)
join_results = model_lsh.approxSimilarityJoin(base_hashed, df_lsh, float("inf"), distCol="JaccardDistance")

# Select the top 5 most similar item
join_results.select("datasetA.short_description", "datasetB.short_description", "JaccardDistance").orderBy("JaccardDistance") \
    .filter("datasetA.short_description != datasetB.short_description") \
    .show(5, truncate=False)

24/11/04 13:10:13 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/11/04 13:10:14 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/11/04 13:11:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
24/11/04 13:11:11 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
[Stage 47:>                                                         (0 + 2) / 2]

+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+------------------+
|short_description                                                         |short_description                                                                            |JaccardDistance   |
+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+------------------+
|Kitten Born With Twisted Arms And Legs Finds A Mom Who Knows She’s Perfect|”Maybe she’s born with it ... Maybe she’s a tired mom who doesn’t have time for this.”       |0.75              |
|Kitten Born With Twisted Arms And Legs Finds A Mom Who Knows She’s Perfect|With a back flip and everything.                                                             |0.8235294117647058|
|Kitten Born With Twisted Arms And Legs Finds A Mo

                                                                                