## Task 1: Spark SQL (15m)

In [None]:
# Setup Spark
# ===============
# Installing Spark needs to be done once each time you re-open this notebook. It should take around 10-30 seconds.
# ===============
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

# unzip the spark file to the current folder
!tar xf spark-3.3.2-bin-hadoop3.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

# install findspark using pip
!pip install -q findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# After downloading dataset, you should have the files in your Files (click the folder icon in the left sidebar)
!wget -O Products_table.csv https://drive.google.com/uc?id=1FG0rGWSPWALcmFo3feHUF5TK5AP7mMwH&export=download #products
!wget -O Sales_table.csv https://drive.google.com/uc?id=1l1fr_s67JjGGsXt3fIz_769pKPZg-jhU&export=download #sales
!wget -O Sellers_table.csv https://drive.google.com/uc?id=1YTTYU5Cwgvau1Z7b1ShmcIhrO3VN-Zhq&export=download #sellers

--2023-03-05 13:54:59--  https://drive.google.com/uc?id=1FG0rGWSPWALcmFo3feHUF5TK5AP7mMwH
Resolving drive.google.com (drive.google.com)... 142.250.157.102, 142.250.157.113, 142.250.157.100, ...
Connecting to drive.google.com (drive.google.com)|142.250.157.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-a0-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/iiqq3kg3gbbak15imo3hnjmgt2b8pqo2/1678024500000/06948221057362969045/*/1FG0rGWSPWALcmFo3feHUF5TK5AP7mMwH?uuid=7a2452d8-8477-404c-9d01-213eccf9fc99 [following]
--2023-03-05 13:55:00--  https://doc-0g-a0-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/iiqq3kg3gbbak15imo3hnjmgt2b8pqo2/1678024500000/06948221057362969045/*/1FG0rGWSPWALcmFo3feHUF5TK5AP7mMwH?uuid=7a2452d8-8477-404c-9d01-213eccf9fc99
Resolving doc-0g-a0-docs.googleusercontent.com (doc-0g-a0-docs.googleusercontent.com)... 142.250.157.132, 2404:6800:4008:c13::84
Connecting to d

In [None]:
# Load the three downloaded CSV files as DataFrames (header row used for column names,
# column types inferred). Each frame is collapsed to a single partition and marked for
# caching; products_table, sales_table and sellers_table are available after this cell.
products_table = (
    spark.read
    .csv("/content/Products_table.csv", header=True, inferSchema=True)
    .repartition(1)
    .cache()
)
sales_table = (
    spark.read
    .csv("/content/Sales_table.csv", header=True, inferSchema=True)
    .repartition(1)
    .cache()
)
sellers_table = (
    spark.read
    .csv("/content/Sellers_table.csv", header=True, inferSchema=True)
    .repartition(1)
    .cache()
)

In [None]:
# (a) Output the top 3 most popular products sold among all sellers [2m]
# Your table should have 1 column(s): [product_name]

In [None]:
# (b) Find out the total sales of the products sold by sellers 1 to 10 and output the top most sold product [2m]
# Your table should have 1 column(s): [product_name]

In [None]:
# (c) Compute the combined revenue earned from sellers where seller_id ranges from 1 to 500 inclusive. [3m]
# Your table should have 1 column(s): [total_revenue]


In [None]:
# (d) Among sellers with rating >= 4 who have achieved a combined number of products sold >= 3000, find the top 10 most expensive products sold by any of these sellers. (If multiple products have the same price, sort them in ascending order of product_id.) [8m]
# Your table should have 1 column(s): [product_name]
# To get the full mark, your query should not run for more than 1 min



## Task 2: Spark ML (10m)

In [None]:
# Setup Spark
# ===============
# Installing Spark needs to be done once each time you re-open this notebook. It should take around 10-30 seconds.
# ===============
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

# unzip the spark file to the current folder
!tar xf spark-3.3.2-bin-hadoop3.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

# install findspark using pip
!pip install -q findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# After downloading dataset, you should have the files in your Files (click the folder icon in the left sidebar)
!wget -O bank_train.csv https://drive.google.com/uc?id=1kEP94BfULB3gUMl_IQCg9wuX4IJRajMC&export=download #products
!wget -O bank_test.csv https://drive.google.com/uc?id=1EqX4liL5iWbwqyJ_lFaYvYZvgBFwpwSJ&export=download #bank_test

--2023-03-05 13:59:46--  https://drive.google.com/uc?id=1kEP94BfULB3gUMl_IQCg9wuX4IJRajMC
Resolving drive.google.com (drive.google.com)... 142.251.8.102, 142.251.8.113, 142.251.8.100, ...
Connecting to drive.google.com (drive.google.com)|142.251.8.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0c-a8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6a3e19tqodgib3pccobtlthqmdiclhtv/1678024725000/08487103376102314083/*/1kEP94BfULB3gUMl_IQCg9wuX4IJRajMC?uuid=d11faedd-8c48-4734-aa0b-18efe2a56d2e [following]
--2023-03-05 13:59:47--  https://doc-0c-a8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6a3e19tqodgib3pccobtlthqmdiclhtv/1678024725000/08487103376102314083/*/1kEP94BfULB3gUMl_IQCg9wuX4IJRajMC?uuid=d11faedd-8c48-4734-aa0b-18efe2a56d2e
Resolving doc-0c-a8-docs.googleusercontent.com (doc-0c-a8-docs.googleusercontent.com)... 142.250.157.132, 2404:6800:4008:c13::84
Connecting to doc-0c-a8

In [None]:
# Input files and the format they are stored in.
file_type = "csv"
bank_train_location = "/content/bank_train.csv"
bank_test_location = "/content/bank_test.csv"

# CSV parsing settings: infer column types, treat the first row as the header,
# and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options apply to CSV input; for other file types they will be ignored.
# Training split.
bank_train = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(bank_train_location)
)

# Testing split, read with the same settings.
bank_test = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(bank_test_location)
)

Build an ML model to predict whether a customer will subscribe to the bank's deposit service. Train the model on the training set and evaluate its performance (e.g. accuracy) on the testing set.


*   You can explore different methods to pre-process the data and select proper features
*   Present the final testing accuracy.

In [None]:
# data preparation (4m)


In [None]:
# model building (4m)

In [None]:
# model evaluation (2m)