In [1]:
# Import
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.sql.functions import collect_list, collect_set # collect_set return set of values (no duplicate)
from pyspark.sql import functions as F
from itertools import combinations
from pyspark.sql.functions import col, udf, explode
from pyspark.sql.types import ArrayType, StringType
import findspark
findspark.init()

In [2]:
# Create SparkSession
spark = SparkSession.builder\
    .master("local[*]") \
    .appName("Map Reduce with Pyspark") \
    .config("spark.driver.memory", "16g") \
    .getOrCreate()

24/12/30 18:38:17 WARN Utils: Your hostname, helium resolves to a loopback address: 127.0.1.1; using 10.0.106.13 instead (on interface enp6s0)
24/12/30 18:38:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/30 18:38:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_path = "dataset_sales/joined_all_data.csv"
df = spark.read.csv(data_path, header=True)


In [None]:
df.show(12)

In [None]:
# Sample 10% of the rows without replacement
sampled_df = df.sample(withReplacement=False, fraction=0.1, seed=42)

sampled_df.show()

In [None]:
# Count unique values in the 'product_name' column
value_counts = df.groupBy("product_name").count()

# Show the result
value_counts.show()


In [9]:
# Group and count unique product names
value_counts = df.groupBy("product_name").count()
order_id_counts = df.groupBy("order_id").count()

In [None]:
order_id_counts.show()

In [10]:
# Sort by count and limit to top 10
top_value_counts = value_counts.orderBy("count", ascending=False).limit(10)


In [None]:
# Convert to Pandas for visualization
top_value_counts_pd = top_value_counts.toPandas()
# Plot a bar chart
plt.figure(figsize=(16, 8))
plt.bar(top_value_counts_pd['product_name'], top_value_counts_pd['count'])
plt.xlabel('Product Name')
plt.ylabel('Count')
plt.title('Top 20 Most Common Products')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Filter rows where product_name is null
null_values_df = df.filter(col("product_name").isNull())

# Count the number of null values
null_count = null_values_df.count()

if null_count > 0:
    print(f"The column 'product_name' has {null_count} null values.")
else:
    print("The column 'product_name' has no null values.")

In [4]:
ordered_products = df.select(df['product_name'], df['order_id'])
ordered_products.show(3)

+--------------------+--------+
|        product_name|order_id|
+--------------------+--------+
|  Organic Egg Whites|       2|
|Michigan Organic ...|       2|
|       Garlic Powder|       2|
+--------------------+--------+
only showing top 3 rows



In [None]:
ordered_products.printSchema()

In [5]:
# Group by 'order_id' and collect product names into a list
grouped_order_id = ordered_products.groupBy("order_id").agg(F.collect_set("product_name").alias("products"))


In [None]:
grouped_order_id.show(13)

In [None]:
# Fetch the first row from the DataFrame
first_row = grouped_order_id.select(grouped_order_id.products).first()

# Print the value of the "products" column
print(first_row['products'])


In [None]:
grouped_order_id.select('order_id').distinct().count()

## Map Reduce

In [6]:
# Step 1: Define a UDF to generate triplets
#The generate_triplets function creates all possible combinations of 3 products from the products column.
def generate_triplets(products):
    if len(products) < 3:
        return []  # Ignore orders with fewer than 3 products
    return list(combinations(products, 3))

triplets_udf = udf(generate_triplets, ArrayType(ArrayType(StringType())))

In [7]:
# Step 2: Apply the UDF to create a column with triplets
#
grouped_order_id_with_triplets = grouped_order_id.withColumn("triplets", triplets_udf(col("products")))

In [8]:
# Step 3: Explode triplets into separate rows
# explode(col("triplets")) flattens the array of triplets into individual rows, where each row represents one triplet.
triplets_exploded = grouped_order_id_with_triplets.select(explode(col("triplets")).alias("triplet"))

In [9]:
# Step 4: Count occurrences of each triplet
# groups identical triplets and counts their occurrences across all orders.
triplet_counts = triplets_exploded.groupBy("triplet").count()


In [10]:
# Step 5: Sort by count in descending order
top_triplets = triplet_counts.orderBy(col("count").desc())


In [11]:
# Step 6: Get the top 3 triplets
top_3_triplets = top_triplets.limit(3)

In [12]:
# Display the results
top_3_triplets.show(truncate=False)

ERROR:root:KeyboardInterrupt while sending command.               (0 + 32) / 34]
Traceback (most recent call last):
  File "/home/thi/Big_Data/big_data_venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/thi/Big_Data/big_data_venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [13]:
top_3_triplets.write.csv("output/top_3_triplets.csv", header=True)




AnalysisException: [UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE] The CSV datasource doesn't support the column `triplet` of the type "ARRAY<STRING>".

In [None]:
from pyspark.sql.functions import concat_ws

# Convert the array column "triplet" into a string column
flattened_triplets = top_3_triplets.withColumn("triplet", concat_ws(",", "triplet"))

# Save the result to a CSV file
flattened_triplets.write.csv("output/top_3_triplets.csv", header=True)


In [14]:
# Split the array into separate columns
split_triplets = top_3_triplets.selectExpr(
    "triplet[0] as product1", 
    "triplet[1] as product2", 
    "triplet[2] as product3", 
    "count"
)

# Save to CSV
split_triplets.write.csv("output/top_3_triplets.csv", header=True)


                                                                                