In [1]:
import os
# Choose a Spark release listed at http://www.apache.org/dist/spark/ and enter it as the spark version.
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.1.3'

# Export the version so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Java 8, download and unpack the Spark build for Hadoop 2.7, and install findspark
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
# SPARK_HOME assumes the tarball above was unpacked under /content.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark against the install above (the SparkSession itself is created in a later cell)
import findspark
findspark.init()



Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:14 http://archive.ubuntu.com/ubuntu b

In [2]:
# Download the PostgreSQL JDBC driver jar (version 42.2.9) into the current working directory.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2022-09-03 01:22:34--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2022-09-03 01:22:34 (9.51 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [3]:
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) the session, putting the PostgreSQL JDBC jar on the driver classpath.
spark = (
    SparkSession.builder
    .appName("kitchen_ETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [6]:
# Hand the Amazon Kitchen reviews archive to Spark so SparkFiles can resolve it by file name later.
from pyspark import SparkFiles

url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz'
spark_context = spark.sparkContext
spark_context.addFile(url)

In [26]:
# Read the tab-separated reviews file that was added to Spark; the first row holds the column names.
kitchen_tsv_path = SparkFiles.get('amazon_reviews_us_Kitchen_v1_00.tsv.gz')
kitchen_df = spark.read.csv(kitchen_tsv_path, sep='\t', header=True)

In [27]:
# Preview the first five rows of the raw reviews DataFrame.
kitchen_df.show(n=5)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   37000337|R3DT59XH7HXR9K|B00303FI0G|     529320574|Arthur Court Pape...|         Kitchen|          5|            0|          0|   N|                Y|Beautiful. Looks ...|Beautiful.  Looks...| 2015-08-31|
|         US|   15272914|R1LFS11BNASSU8|B00JCZKZN6|     274237558|Olde Thompson Bav...|         Kitchen|          5|    

In [28]:
# Count the number of records (rows) in the dataset.
# Left as the cell's last expression so the notebook displays the result.
kitchen_df.count()

4880466

### Function to set nullable states to match schema

In [56]:
import pyspark.sql.functions as F

# https://stackoverflow.com/questions/46072411/can-i-change-the-nullability-of-a-column-in-my-spark-dataframe



def set_df_columns_nullable(spark, df, column_list, nullable=True):
    """Return a new DataFrame whose listed columns carry the given nullable flag.

    The DataFrame is rebuilt from ``df.rdd`` with an adjusted copy of
    ``df.schema``; ``df`` itself and its schema object are left untouched.

    Args:
        spark: Session object providing ``createDataFrame``.
        df: Source DataFrame; only ``df.schema`` and ``df.rdd`` are read.
        column_list: Names of the columns whose nullable flag should be set.
            Names that are not in the schema are ignored.
        nullable: Flag value to assign to the listed columns (default True).

    Returns:
        The DataFrame produced by ``spark.createDataFrame`` with the new schema.
    """
    import copy  # local import so this cell does not rely on another cell's imports

    # Work on a deep copy: PySpark caches the schema object on the DataFrame, so
    # flipping flags on df.schema directly would also change what `df` reports.
    adjusted_schema = copy.deepcopy(df.schema)
    for struct_field in adjusted_schema:
        if struct_field.name in column_list:
            struct_field.nullable = nullable
    return spark.createDataFrame(df.rdd, adjusted_schema)

In [18]:
# Transform the dataset to fit the tables in the schema file. Be sure the DataFrames match in data type and in column name.

# Schema: 
# CREATE TABLE review_id_table (
#   review_id TEXT PRIMARY KEY NOT NULL,
#   customer_id INTEGER,
#   product_id TEXT,
#   product_parent INTEGER,
#   review_date DATE -- this should be in the format yyyy-mm-dd
# );



+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R3DT59XH7HXR9K|   37000337|B00303FI0G|     529320574| 2015-08-31|
|R1LFS11BNASSU8|   15272914|B00JCZKZN6|     274237558| 2015-08-31|
+--------------+-----------+----------+--------------+-----------+
only showing top 2 rows



In [29]:
# Check datatypes: print every column's type and nullable flag.
kitchen_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



In [41]:
# Cast the columns of the root DataFrame to the datatypes the target tables use.
from pyspark.sql.types import IntegerType,DateType

# (column name, target type) pairs, applied in order.
column_casts = [
    ('customer_id', IntegerType()),
    ('product_parent', IntegerType()),
    ('review_date', DateType()),
    # vine
    ('star_rating', IntegerType()),
    ('helpful_votes', IntegerType()),
    ('total_votes', IntegerType()),
]

for column_name, target_type in column_casts:
    kitchen_df = kitchen_df.withColumn(column_name, kitchen_df[column_name].cast(target_type))

In [42]:
# Print the schema again to confirm the column casts took effect.
kitchen_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)



### review_id_table table

In [33]:
# Columns of review_id_table; rows with a null in any of them are dropped.
review_id_columns = ['review_id','customer_id','product_id','product_parent','review_date']
kitchen_review_id_df = kitchen_df.select(review_id_columns).dropna()
kitchen_review_id_df.show(n=2)

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R3DT59XH7HXR9K|   37000337|B00303FI0G|     529320574| 2015-08-31|
|R1LFS11BNASSU8|   15272914|B00JCZKZN6|     274237558| 2015-08-31|
+--------------+-----------+----------+--------------+-----------+
only showing top 2 rows



In [57]:
# Mark review_id as NOT NULL, matching the review_id_table schema, then show the result.
kitchen_review_id_df = set_df_columns_nullable(
    spark,
    kitchen_review_id_df,
    column_list=['review_id'],
    nullable=False,
)

kitchen_review_id_df.printSchema()

root
 |-- review_id: string (nullable = false)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- review_date: date (nullable = true)



### products table

In [37]:
# Product id/title pairs; rows where either value is null are dropped.
# NOTE(review): product_id presumably repeats across reviews; if the products table
# keys on product_id, de-duplication may be needed before loading. Confirm against the schema file.
product_columns = ['product_id','product_title']
kitchen_products_df = kitchen_df.select(product_columns).dropna()
kitchen_products_df.show(n=2)

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B00303FI0G|Arthur Court Pape...|
|B00JCZKZN6|Olde Thompson Bav...|
+----------+--------------------+
only showing top 2 rows



In [58]:
# Flag product_id as non-nullable, then show the resulting schema.
kitchen_products_df = set_df_columns_nullable(
    spark,
    kitchen_products_df,
    column_list=['product_id'],
    nullable=False,
)

kitchen_products_df.printSchema()

root
 |-- product_id: string (nullable = false)
 |-- product_title: string (nullable = true)



### customers table

In [47]:
# Customer ids only. Rows with a null id are dropped, as is done for the other tables:
# customer_id is flagged non-nullable further down, and a null value would not satisfy
# that schema when the rows are materialised.
kitchen_customers_df = kitchen_df.select(['customer_id']).dropna()

In [48]:
# Collapse to one row per customer_id with the number of rows (reviews) for that id.
customers_grouped = kitchen_customers_df.groupBy('customer_id')
kitchen_customers_df = customers_grouped.count()
kitchen_customers_df.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|   23042837|    1|
|   48875707|    2|
|   48088660|    1|
|   20781887|    6|
|    1075003|    1|
|   28377689|    3|
|    8338749|    1|
|   39271457|    6|
|   38209321|    5|
|   51397605|    1|
|   45337932|    7|
|   12948675|    3|
|   38892468|    1|
|   43582450|    1|
|   13156755|    1|
|   13433330|    1|
|   17067926|    7|
|   13780617|    1|
|   50607818|    1|
|   33437759|    2|
+-----------+-----+
only showing top 20 rows



In [49]:
# Rename the aggregate column produced by count() to customer_count.
kitchen_customers_df = kitchen_customers_df.withColumnRenamed(
    existing='count',
    new='customer_count',
)
kitchen_customers_df.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   23042837|             1|
|   48875707|             2|
|   48088660|             1|
|   20781887|             6|
|    1075003|             1|
|   28377689|             3|
|    8338749|             1|
|   39271457|             6|
|   38209321|             5|
|   51397605|             1|
|   45337932|             7|
|   12948675|             3|
|   38892468|             1|
|   43582450|             1|
|   13156755|             1|
|   13433330|             1|
|   17067926|             7|
|   13780617|             1|
|   50607818|             1|
|   33437759|             2|
+-----------+--------------+
only showing top 20 rows



In [59]:
# Flag customer_id as non-nullable and customer_count as nullable, then show the schema.
kitchen_customers_df = set_df_columns_nullable(
    spark,
    kitchen_customers_df,
    column_list=['customer_id'],
    nullable=False,
)
kitchen_customers_df = set_df_columns_nullable(
    spark,
    kitchen_customers_df,
    column_list=['customer_count'],
)

kitchen_customers_df.printSchema()

root
 |-- customer_id: integer (nullable = false)
 |-- customer_count: long (nullable = true)



### vine table

In [53]:
# Vine table columns. Rows without a review_id are dropped because review_id is
# flagged non-nullable further down; nulls in the other columns are kept.
kitchen_vine_df = (
    kitchen_df
    .select(['review_id','star_rating','helpful_votes','total_votes','vine'])
    .dropna(subset=['review_id'])
)
kitchen_vine_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R3DT59XH7HXR9K|          5|            0|          0|   N|
|R1LFS11BNASSU8|          5|            0|          1|   N|
|R296RT05AG0AF6|          5|            0|          0|   N|
|R3V37XDZ7ZCI3L|          5|            0|          1|   N|
|R14GU232NQFYX2|          5|            0|          0|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [60]:
# Flag review_id as non-nullable, then show the resulting schema.
kitchen_vine_df = set_df_columns_nullable(
    spark,
    kitchen_vine_df,
    column_list=['review_id'],
    nullable=False,
)

kitchen_vine_df.printSchema()

root
 |-- review_id: string (nullable = false)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)



In [63]:
# Connection settings for loading the table DataFrames into the RDS instance.
# Note: the upload can take up to 10 minutes per table, so be sure everything is correct first.
# from config import aws_url, aws_pw
# NOTE(review): 'http' and 'pw' look like placeholders; supply the real endpoint and
# password from a local config or the environment rather than committing them here.
aws_url = 'http'
aws_pw = 'pw'

mode = "append"
jdbc_url = f"jdbc:postgresql://{aws_url}:5432/postgres"
config = dict(
    user="postgres",
    password=aws_pw,
    driver="org.postgresql.Driver",
)

In [None]:
# Each DataFrame is paired by position with the RDS table it is written to.
df_list =    [kitchen_review_id_df, kitchen_products_df, kitchen_customers_df, kitchen_vine_df]
rds_tables = ['review_id_table',    'products',           'customers',         'vine_table']

# Write the tables one after another using the shared JDBC settings.
for table_df, table_name in zip(df_list, rds_tables):
    table_df.write.jdbc(url=jdbc_url, table=table_name, mode=mode, properties=config)