# Transforming Data Using PySpark for AWS Glue

## First Import SparkSession

In [1]:
from pyspark.sql import SparkSession

## Then Create a Spark Session

In [5]:
# Obtain the Spark session for this notebook, registered under the
# application name "Airbnb_Warehousing".
spark = (
    SparkSession.builder
    .appName("Airbnb_Warehousing")
    .getOrCreate()
)

## Read the Listings CSV File

In [6]:
# Load the Airbnb listings CSV into a DataFrame.
# - header / inferSchema: column names come from the first row and Spark
#   works out the column types.
# - multiline: a quoted field may contain line breaks.
# - quote / escape: both are the double-quote character, so a doubled quote
#   ("") inside a quoted field is read as a literal quote.
#   The escape option must be set only once: a reader keeps just the last
#   value given for a key, so an additional backslash setting placed before
#   this one would be silently ignored.
listing_df = (
    spark.read
    .format("csv")
    .option("multiline", "true")
    .option("quote", "\"")
    .option("header", "true")
    .option("escape", "\"")
    .option("sep", ",")
    .option("inferSchema", "true")
    .load("../data/listings.csv")
)

In [7]:
# Print the inferred column names and types to check that the CSV was parsed as expected.
listing_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

### Looks Like the DataFrame was Correctly Read

In [8]:
# Preview the first two rows of the listings data.
listing_df.show(n=2)

24/06/18 13:13:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------+--------------------+--------------+------------+-----------+--------------------+-----------+---------------------+--------------------+-------+--------------------+-------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+---------+-------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+-----------

# Separating Tables

## Dropping Redundant/Empty Columns

In [11]:
# Remove the columns identified as redundant or empty before the data is
# split into separate tables.
listing_df = listing_df.drop(
    "calendar_last_scraped",
    "description",
    "calendar_updated",
    "bedrooms",
    "bathrooms",
    "neighbourhood_group_cleansed",
    "amenities",
)

## Renaming a Column for Consistent Spelling

In [18]:
# Use the "neighbourhood" spelling here too, matching the other
# neighbourhood columns in the data (e.g. host_neighbourhood).
listing_df = listing_df.withColumnRenamed(
    "neighborhood_overview",
    "neighbourhood_overview",
)

In [19]:
# Confirm that the unwanted columns are gone and that the overview column
# now carries its new name (columns were removed, not rows).
listing_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighbourhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_lis

## Creating Host Tables

### Host Table

In [26]:
# Host table: all host-related columns of the listings data in one frame.
# NOTE(review): `select` does not de-duplicate, so a host_id that occurs on
# several listing rows will occur several times here as well -- confirm
# whether a later step handles that.
host_df = listing_df.select(
    # Host identity and profile
    "host_id", "host_url", "host_name", "host_since",
    "host_location", "host_about",
    "host_thumbnail_url", "host_picture_url", "host_neighbourhood",
    # Responsiveness, status and verification
    "host_response_time", "host_response_rate", "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count", "host_total_listings_count",
    "host_verifications", "host_has_profile_pic", "host_identity_verified",
    # Calculated listing-count columns
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
)

# Inspect the result: column types first, then the first two rows.
host_df.printSchema()
host_df.show(n=2)

root
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_verifications: string (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- calculated_host_listings_count_entire_homes: integer (nullable =

### Host Dimension Tables

In [23]:
# Host Qualifications and Diagnostics (HQAD)
hqad_df = host_df.select(
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified"
)

hqad_df.printSchema()
hqad_df.show(2)

root
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_verifications: string (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)



[Stage 5:>                                                          (0 + 1) / 1]

+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+--------------------+--------------------+----------------------+
|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_listings_count|host_total_listings_count|  host_verifications|host_has_profile_pic|host_identity_verified|
+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+--------------------+--------------------+----------------------+
|    within an hour|              100%|                100%|                t|                  3|                        3|['email', 'phone'...|                   t|                     t|
|    within an hour|               90%|                100%|                t|                  3|                        4|['email', 'phone'...|                   t|                     f|
+------------------+------------------+-----------

                                                                                

In [24]:
# The Host Listings Diagnostics (HLD)
host_ld_df = host_df.select(
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms"
)

host_ld_df.printSchema()
host_ld_df.show(2)

root
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- calculated_host_listings_count_entire_homes: integer (nullable = true)
 |-- calculated_host_listings_count_private_rooms: integer (nullable = true)
 |-- calculated_host_listings_count_shared_rooms: integer (nullable = true)

+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|calculated_host_listings_count|calculated_host_listings_count_entire_homes|calculated_host_listings_count_private_rooms|calculated_host_listings_count_shared_rooms|
+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|                             3|                                          3|                                           0|                                          0|
|                             3|      

### Transforming Host-Related Tables

##### Column Checklist  
host_id  
host_url  
host_name  
host_since  
host_location  
host_about  
host_thumbnail_url  
host_picture_url  
host_neighbourhood  
host_response_time  
host_response_rate  
host_acceptance_rate  
host_is_superhost  
host_listings_count  
host_total_listings_count  
host_verifications  
host_has_profile_pic  
host_identity_verified  
calculated_host_listings_count  
calculated_host_listings_count_entire_homes  
calculated_host_listings_count_private_rooms  
calculated_host_listings_count_shared_rooms  
  