## Task 1: Spark SQL (15m)

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
sales_file_location = "/FileStore/tables/Sales_table.csv"
products_file_location = "/FileStore/tables/Products_table.csv"
sellers_file_location = "/FileStore/tables/Sellers_table.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
products_table = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(products_file_location)

sales_table = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(sales_file_location)

sellers_table = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(sellers_file_location)

In [0]:
products_table.show()
sales_table.show()
sellers_table.show()


+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         1|   product_1|   24|
|         2|   product_2|  173|
|         3|   product_3|  147|
|         4|   product_4|  116|
|         5|   product_5|   13|
|         6|   product_6|  146|
|         7|   product_7|  121|
|         8|   product_8|   21|
|         9|   product_9|   70|
|        10|  product_10|   28|
|        11|  product_11|  113|
|        12|  product_12|  105|
|        13|  product_13|  133|
|        14|  product_14|   75|
|        15|  product_15|  113|
|        16|  product_16|   98|
|        17|  product_17|   17|
|        18|  product_18|  177|
|        19|  product_19|   29|
|        20|  product_20|  163|
+----------+------------+-----+
only showing top 20 rows

+--------+----------+---------+-----------------+
|order_id|product_id|seller_id|num_of_items_sold|
+--------+----------+---------+-----------------+
|       1|      1841|    35172|              432|
|     

In [0]:
products_table.show()

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         1|   product_1|   24|
|         2|   product_2|  173|
|         3|   product_3|  147|
|         4|   product_4|  116|
|         5|   product_5|   13|
|         6|   product_6|  146|
|         7|   product_7|  121|
|         8|   product_8|   21|
|         9|   product_9|   70|
|        10|  product_10|   28|
|        11|  product_11|  113|
|        12|  product_12|  105|
|        13|  product_13|  133|
|        14|  product_14|   75|
|        15|  product_15|  113|
|        16|  product_16|   98|
|        17|  product_17|   17|
|        18|  product_18|  177|
|        19|  product_19|   29|
|        20|  product_20|  163|
+----------+------------+-----+
only showing top 20 rows



In [0]:
# (a) Output the top 3 most popular products sold among all sellers [2m]
# Your table should have 1 column(s): [product_name]
# from pyspark.sql.functions import col

top_three_product_id_df = sales_table.groupBy("product_id")\
    .count()\
    .orderBy(col("count").desc(), col("product_id").asc())\
    .limit(3)\

top_three_product_name = products_table.join(top_three_product_id_df, "product_id")\
    .select(col("product_name"))\
    .show()




+-------------+
| product_name|
+-------------+
| product_8031|
|product_22622|
|product_99849|
+-------------+



In [0]:
# (b) Find out the total sales of the products sold by sellers 1 to 10 and output the top most sold product [2m]
# Your table should have 1 column(s): [product_name]

sales_of_sellers_1_to_10 = sales_table.filter((col("seller_id") >= 1) & (col("seller_id") <= 10))

sum_of_sales_for_products_sold_in_desc = sales_of_sellers_1_to_10.groupBy("product_id")\
    .sum("num_of_items_sold")\
    .orderBy(col("sum(num_of_items_sold)").desc())\
    .limit(1)

product_name_of_top_most_sold_product = sum_of_sales_for_products_sold_in_desc.join(products_table, "product_id")\
    .select(col("product_name"))\
    .show()

+-------------+
| product_name|
+-------------+
|product_36658|
+-------------+



In [0]:
# (c) Compute the combined revenue earned from sellers where seller_id ranges from 1 to 500 inclusive. [3m]
# Your table should have 1 column(s): [total_revenue]

sales_of_seller_from_1_to_500 = sales_table.filter((col("seller_id") >= 1) & (col("seller_id") <= 500))

combined_sales_by_product_id_from_seller_1_to_500 = sales_of_seller_from_1_to_500.groupBy("product_id")\
    .sum("num_of_items_sold")\

sales_of_product_and_quantity = combined_sales_by_product_id_from_seller_1_to_500.join(products_table, "product_id")\
    .select(col("sum(num_of_items_sold)"), col("price"))

def calculate_revenue(row):
    revenue = row["sum(num_of_items_sold)"] * row["price"]
    return revenue

total_revenue = sc.accumulator(0)

def update_total_revenue(total_revenue, row):
    total_revenue.add(calculate_revenue(row))

sales_of_product_and_quantity.foreach(lambda row: update_total_revenue(total_revenue, row))

spark.createDataFrame([(total_revenue.value,)], ['total_revenue']).show()

+-------------+
|total_revenue|
+-------------+
|    160916699|
+-------------+



In [0]:
# (d) Among sellers with rating >= 4 who have achieved a combined number of products sold >= 3000, find out the top 10 most expensive product sold by any of the sellers. (If there are multiple products at the same price, please sort them in ascending order of product_id) [8m]
# Your table should have 1 column(s): [product_name]
# To get the full mark, your query should not run for more than 1 min

sellers_with_rating_from_4 = sellers_table.filter(col("rating") >= 4)

sales_of_sellers_with_rating_from_4 = sellers_with_rating_from_4.join(sales_table, "seller_id")\
    .groupBy("seller_id")\
    .sum("num_of_items_sold")

sellers_with_rating_from_4_and_sold_from_3000 = sales_of_sellers_with_rating_from_4.filter(col("sum(num_of_items_sold)") >= 3000)\
    .select(col("seller_id"))

products_sold_by_filtered_sellers = sellers_with_rating_from_4_and_sold_from_3000.join(sales_table, "seller_id")\
    .select(col("product_id"))

products_sold_with_price = products_sold_by_filtered_sellers.join(products_table, "product_id")\
    .dropDuplicates(["product_id"])\
    .orderBy(col("price").desc(), col("product_id").asc())\
    .select(col("product_name"))\
    .limit(10)\
    .show()

+------------+
|product_name|
+------------+
| product_106|
| product_117|
| product_363|
| product_712|
| product_843|
| product_897|
| product_923|
|product_1466|
|product_1507|
|product_1514|
+------------+



## Task 2: Spark ML (10m)

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
bank_train_location = "/FileStore/tables/bank_train.csv"
bank_test_location = "/FileStore/tables/bank_test.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
bank_train = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(bank_train_location)

bank_test = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(bank_test_location)

Build ML model to predict whether the customer will subscribe bank deposit service or not. Train the model using training set and evaluate the model performance (e.g. accuracy) using testing set. 
* You can explore different methods to pre-process the data and select proper features
* You can utilize different machine learning models and tune model hyperparameters
* Present the final testing accuracy.

In [0]:
# data preparation (4m)
# Import necessary libraries
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Define the interested categorical and numerical columns
# cat_cols = ["job", "marital", "education", "housing", "loan", "poutcome"]
# num_cols = ["age", "balance"]
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Convert categorical columns to numerical using OneHotEncoder
cat_indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in cat_cols]
cat_encoders = [OneHotEncoder(inputCol=col + "_index", outputCol=col + "_vec") for col in cat_cols]

# Assemble all features into a single vector column
assembler = VectorAssembler(inputCols=num_cols + [col + "_vec" for col in cat_cols], outputCol="features")

In [0]:

# model building (4m)
# Define Random Forest Classifier model
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Define pipeline with all stages
pipeline = Pipeline(stages=cat_indexers + cat_encoders + [assembler, rf])

# Fit pipeline on training data
model = pipeline.fit(bank_train)

In [0]:
# model evaluation (2m)
# Predict on test data
predictions = model.transform(bank_test)

# Evaluate the model using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
auc = evaluator.evaluate(predictions)

print("AUC on test data = {}".format(auc))

AUC on test data = 0.8990509841317872
