In [0]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the SparkSession via the builder's getOrCreate(); the application name is left empty.
spark = (
    SparkSession.builder
    .appName("")
    .getOrCreate()
)

In [0]:
# Drop the sales_data table if it already exists
spark.sql("DROP TABLE IF EXISTS sales_data")
# Drop the product_data table if it already exists
spark.sql("DROP TABLE IF EXISTS product_data")

Out[32]: DataFrame[]

In [0]:
def sales_data_collector_api(spark, text_file_path, hive_table="sales_data", write_mode="overwrite"):
    """Load a pipe-delimited sales file into a Hive table partitioned by sale_date.

    The file is read as CSV with a header row and "|" as the field delimiter,
    then saved in Parquet format, partitioned by the ``sale_date`` column.

    Args:
        spark: SparkSession used for reading the file and writing the table.
        text_file_path: Location of the delimited sales text file.
        hive_table: Name of the table to write. Defaults to "sales_data".
        write_mode: Save mode handed to the DataFrame writer. Defaults to
            "overwrite" so the function can be re-run without dropping the
            table first; Spark's own default mode raises when the table
            already exists.

    Returns:
        The name of the Hive table that was written.
    """
    df_sales = (
        spark.read
        .option("delimiter", "|")
        .option("header", "true")
        .csv(text_file_path)
    )

    # An explicit save mode keeps this consistent with data_preparation_api,
    # which also overwrites its target table.
    (
        df_sales.write
        .partitionBy("sale_date")
        .format("parquet")
        .mode(write_mode)
        .saveAsTable(hive_table)
    )

    return hive_table

# Ingest the uploaded sales file; the returned table name is the cell's output
sales_data_collector_api(
    spark=spark,
    text_file_path="dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/sales_data-5.txt",
)





Out[33]: 'sales_data'

In [0]:
def product_data_collector_api(spark, text_file_path, hive_table="product_data", write_mode="overwrite"):
    """Load a pipe-delimited product file into a Hive table stored as Parquet.

    The file is read as CSV with a header row and "|" as the field delimiter
    and saved, unpartitioned, in Parquet format.

    Args:
        spark: SparkSession used for reading the file and writing the table.
        text_file_path: Location of the delimited product text file.
        hive_table: Name of the table to write. Defaults to "product_data".
        write_mode: Save mode handed to the DataFrame writer. Defaults to
            "overwrite" so the function can be re-run without dropping the
            table first; Spark's own default mode raises when the table
            already exists.

    Returns:
        The name of the Hive table that was written.
    """
    df_product = (
        spark.read
        .option("delimiter", "|")
        .option("header", "true")
        .csv(text_file_path)
    )

    # An explicit save mode keeps this consistent with data_preparation_api,
    # which also overwrites its target table.
    (
        df_product.write
        .format("parquet")
        .mode(write_mode)
        .saveAsTable(hive_table)
    )

    return hive_table

# Ingest the uploaded product file; the returned table name is the cell's output
product_data_collector_api(
    spark=spark,
    text_file_path="dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/product_data.txt",
)


Out[34]: 'product_data'

In [0]:
def data_preparation_api(spark, product_hive_table, sales_hive_table, target_hive_table,
                         bought_product="S8", not_bought_product="iPhone"):
    """Store the buyers who purchased one product but never another.

    Looks up the product_id of each named product in the product table,
    collects the distinct buyer_id values for each from the sales table, and
    keeps the buyers of ``bought_product`` that do not appear among the
    buyers of ``not_bought_product`` (left anti join on buyer_id). The result
    is written to ``target_hive_table`` in Parquet format, overwriting any
    existing table of that name.

    Args:
        spark: SparkSession used to read the source tables and write the result.
        product_hive_table: Table providing product_id and product_name.
        sales_hive_table: Table providing buyer_id and product_id.
        target_hive_table: Name of the table to (over)write with the result.
        bought_product: product_name the buyer must have purchased.
            Defaults to "S8".
        not_bought_product: product_name the buyer must not have purchased.
            Defaults to "iPhone".

    Returns:
        The name of the target Hive table.

    Raises:
        IndexError: If either product name has no row in the product table.
            The type matches what the earlier ``collect()[0][0]`` lookup
            raised, but the message now names the missing product.
    """
    df_product = spark.table(product_hive_table)
    df_sales = spark.table(sales_hive_table)

    def _product_id_for(product_name):
        # first() fetches at most one row instead of collecting every match
        # to the driver, and returns None when nothing matches.
        match = (
            df_product
            .filter(df_product["product_name"] == product_name)
            .select("product_id")
            .first()
        )
        if match is None:
            raise IndexError(
                f"product {product_name!r} not found in table {product_hive_table!r}"
            )
        return match[0]

    def _buyers_of(product_id):
        # Distinct buyer_id values for one product.
        return (
            df_sales
            .filter(df_sales["product_id"] == product_id)
            .select("buyer_id")
            .distinct()
        )

    # Resolve both ids up front so a missing product fails before any write.
    wanted_id = _product_id_for(bought_product)
    excluded_id = _product_id_for(not_bought_product)

    # The anti join keeps left-side buyers with no match on the right side.
    result_df = _buyers_of(wanted_id).join(
        _buyers_of(excluded_id), on="buyer_id", how="left_anti"
    )

    # Write the result into the target Hive table, overwriting the existing table
    result_df.write.format("parquet").mode("overwrite").saveAsTable(target_hive_table)

    return target_hive_table

# Build the table of buyers who purchased an S8 but never an iPhone
data_preparation_api(
    spark=spark,
    product_hive_table="product_data",
    sales_hive_table="sales_data",
    target_hive_table="buyers_s8_not_iphone",
)

Out[35]: 'buyers_s8_not_iphone'

In [0]:
# Query the prepared table and print its rows
buyers_preview = spark.sql("SELECT * FROM buyers_s8_not_iphone")
buyers_preview.show()


+--------+
|buyer_id|
+--------+
|       1|
+--------+

