# Transforming Data Using PySpark for AWS Glue

## First Import SparkSession

In [1]:
from pyspark.sql import SparkSession

## Then Create a Spark Session

In [7]:
spark = SparkSession.builder.appName("Airbnb_Warehousing").getOrCreate()

## Read the Listings CSV File

In [3]:
listing_df = spark.read\
    .format("csv")\
    .option("multiline", "true")\
    .option("quote", "\"")\
    .option("header", "true")\
    .option("escape", "\\")\
    .option("escape", "\"")\
    .option("sep", ",")\
    .option("inferSchema", "true")\
    .load("../data/listings.csv")

                                                                                

In [4]:
listing_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

### Looks Like the DataFrame was Correctly Read

In [5]:
listing_df.show(2)

24/06/25 07:42:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------+--------------------+--------------+------------+-----------+--------------------+-----------+---------------------+--------------------+-------+--------------------+-------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+---------+-------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+-----------

# Separating Tables

## Dropping Redundant/Empty Columns

In [8]:
columns_to_remove = [
                        "calendar_last_scraped",
                        "description",
                        "calendar_updated", 
                        "bedrooms", 
                        "bathrooms", 
                        "neighbourhood_group_cleansed", 
                        "amenities"
                    ]
listing_df = listing_df.drop(*columns_to_remove)

## Correcting Misspell

In [9]:
listing_df = listing_df.withColumnRenamed("neighborhood_overview", "neighbourhood_overview")

In [10]:
# Looks like the rows have been dropped
listing_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighbourhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_lis

## Creating Host Tables

### Host Table

In [11]:
host_df = listing_df.select(
    "host_id",
    "host_url",
    "host_name",
    "host_since",
    "host_location",
    "host_about",
    "host_thumbnail_url",
    "host_picture_url",
    "host_neighbourhood",
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms"
)

host_df = host_df.dropDuplicates()

print("count: " + str(host_df.count()))

host_df.show(2)

                                                                                

count: 977
+---------+--------------------+---------+----------+-------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|  host_id|            host_url|host_name|host_since|host_location|          host_about|  host_thumbnail_url|    host_picture_url|host_neighbourhood|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_listings_count|host_total_listings_count|host_verifications|host_has_profile_pic|host_identity_verified|calculated_host_listings_count|calculated_host_listings_count_entire_homes|calculated_host_listings_count_private_rooms|calculated_host_listings_count_sha

### Host Dimension Tables

In [12]:
# Host Qualifications and Diagnostics (HQAD)
hqad_df = host_df.select(
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified"
)

hqad_df = hqad_df.dropDuplicates()

print("count: " + str(hqad_df.count()))
hqad_df.printSchema()
hqad_df.show(2)

                                                                                

count: 609
root
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_verifications: string (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)

+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+
|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_listings_count|host_total_listings_count|host_verifications|host_has_profile_pic|host_identity_verified|
+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+----------------

In [13]:
# The Host Listings Diagnostics (HLD)
hld_df = host_df.select(
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms"
)

hld_df = hld_df.dropDuplicates()

print("count: " + str(hld_df.count()))

hld_df.printSchema()
hld_df.show(2)

count: 69
root
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- calculated_host_listings_count_entire_homes: integer (nullable = true)
 |-- calculated_host_listings_count_private_rooms: integer (nullable = true)
 |-- calculated_host_listings_count_shared_rooms: integer (nullable = true)

+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|calculated_host_listings_count|calculated_host_listings_count_entire_homes|calculated_host_listings_count_private_rooms|calculated_host_listings_count_shared_rooms|
+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|                             2|                                          1|                                           1|                                          0|
|                           

## Transforming the HQAD and HLD Tables

In [16]:
# The Host Listings Diagnostics (HLD)
hld_df.createOrReplaceTempView("hld_df_view")
hld_df = spark.sql("select row_number() over (order by (select 1)) as hld_id , * from hld_df_view")

hld_df.show(5)


24/06/25 07:45:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/25 07:45:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/25 07:45:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/25 07:45:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/25 07:45:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/25 07:45:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/25 0

+------+------+------+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|hld_id|hld_id|hld_id|calculated_host_listings_count|calculated_host_listings_count_entire_homes|calculated_host_listings_count_private_rooms|calculated_host_listings_count_shared_rooms|
+------+------+------+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|     1|     1|     1|                             2|                                          1|                                           1|                                          0|
|     2|     2|     2|                            23|                                         23|                                           0|                                          0|
|     3|     3|     3|                             2|            

In [48]:
hqad_df.createOrReplaceTempView("hqad_df_view")

+--------------------------------+-------------------------+
|host_verifications              |count(host_verifications)|
+--------------------------------+-------------------------+
|['phone']                       |55                       |
|['email', 'phone']              |451                      |
|['email', 'phone', 'work_email']|98                       |
|[]                              |1                        |
|['phone', 'work_email']         |4                        |
+--------------------------------+-------------------------+



In [57]:
# The Host Qualifications and Diagnostics (HQAD)
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, col


# User defined function to change the host verifications column. Three numbers indicate whether or not the host has the verification.
# 111 Means host has all three verifications. 110 means the host has email and phone verified and 010 means they have their phone verified.
def hqad_df_host_verifications_transform(verifs_list):
   res = ""

   if "email" in verifs_list:
      res+="1"
   else:
      res+="0"
   if "phone" in verifs_list:
      res += "1"
   else:
      res += "0"
   if "work_email" in verifs_list:
      res += "1"
   else:
      res += "0"

   return res


hqad_array_transform = udf(hqad_df_host_verifications_transform, StringType())

# Query to get row number, replace percent to decimal, change host_response_time to shorter key, and change t/f to 1/0
hqad_df_query = """
select row_number() over (order by (select 1)) as hqad_id,
       case host_response_time
         when 'within an hour' then 'H'
         when 'within a few hours' then 'FH'
         when 'within a dat' then 'D'
         when 'a few days or more' then 'D+'
       else
         null
       end as host_response_time, 
       cast(replace(host_response_rate, '%', '') / 100 as decimal(3,2)) as host_response_rate,
       cast(replace(host_acceptance_rate, '%', '') / 100 as decimal(3,2)) as host_acceptance_rate,
       case host_is_superhost
          when 't' then 1
          when 'f' then 0
          else null
       end as host_is_superhost, 
       host_listings_count, 
       host_total_listings_count, 
       host_verifications, 
       case host_has_profile_pic 
          when 't' then 1
          when 'f' then 0
          else null
       end as host_has_profile_pic, 
       case host_identity_verified
          when 't' then 1
          when 'f' then 0
          else null
       end as host_identity_verified
from hqad_df_view
"""

hqad_df = spark.sql(hqad_df_query)
hqad_df = hqad_df.withColumn("host_verifications", hqad_array_transform(col("host_verifications")))
hqad_df.show(10)

24/06/24 14:03:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/24 14:03:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/24 14:03:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/24 14:03:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/24 14:03:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/24 14:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/24 1

+-------+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+
|hqad_id|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_listings_count|host_total_listings_count|host_verifications|host_has_profile_pic|host_identity_verified|
+-------+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+
|      1|                 H|              1.00|                0.99|                0|                  3|                        3|               110|                   0|                     0|
|      2|                 H|              1.00|                0.98|                0|                  1|                        1|               110|                   1|                     0|
|      3|           

                                                                                

### Whew! That was a Long One!

## Next, the Property Dimension Table

In [23]:
property_df = listing_df.select(
    "latitude",
    "longitude", 
    "property_type", 
    "room_type", 
    "accommodates",
    "bathrooms_text",
    "beds",
    "price"
)

property_df = property_df.dropDuplicates()

print("count: " + str(property_df.count()))

property_df.printSchema()
property_df.select("property_type").distinct().show(truncate=False)

                                                                                

count: 2649
root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- beds: integer (nullable = true)
 |-- price: string (nullable = true)

+------------------------+
|property_type           |
+------------------------+
|Private room in loft    |
|Farm stay               |
|Entire rental unit      |
|Shared room in hostel   |
|Private room in condo   |
|Room in boutique hotel  |
|Entire guesthouse       |
|Entire cabin            |
|Private room in bungalow|
|Entire guest suite      |
|Private room in home    |
|Entire place            |
|Camper/RV               |
|Castle                  |
|Tiny home               |
|Entire vacation home    |
|Private room in hostel  |
|Room in aparthotel      |
|Entire townhouse        |
|Room in hotel           |
+------------------------+
on

In [43]:
from pyspark.sql.functions import regexp_replace, when, col, lower, instr, split
# property_df.createOrReplaceTempView("property_df_view")

property_query = """
    select row_number() over (order by (select 1)) as property_id,
           latitude,
           longitude,
           property_type,
           room_type,
           accommodates,
           CASE
              WHEN INSTR(bathrooms_text, 'Private half-bath') > 0 THEN '0.5 private bath'
              WHEN INSTR(bathrooms_text, 'Half-bath') > 0 THEN '0.5 bath'
              ELSE bathrooms_text
           END as bathrooms_text,
           beds,
           cast(replace(price, '$', '') as decimal(10,2)) as price

    from property_df_view
"""

# Running the query
property_df = spark.sql(property_query)

# Splitting the bathroom column into two
property_baths_split = split(col("bathrooms_text"), ' ', limit=2)
# Adding the two new columns
property_df = property_df.withColumn("bathrooms", property_baths_split.getItem(0))
property_df = property_df.withColumn("bathroom_desc", property_baths_split.getItem(1))


property_df.select("bathrooms", "bathrooms_text", "bathroom_desc").distinct().show(truncate=False)

[Stage 114:>                                                        (0 + 1) / 1]

+---------+----------------+-------------+
|bathrooms|bathrooms_text  |bathroom_desc|
+---------+----------------+-------------+
|2        |2 baths         |baths        |
|2        |2 shared baths  |shared baths |
|4.5      |4.5 baths       |baths        |
|6.5      |6.5 baths       |baths        |
|3        |3 baths         |baths        |
|1        |1 private bath  |private bath |
|3        |3 shared baths  |shared baths |
|8        |8 baths         |baths        |
|10.5     |10.5 baths      |baths        |
|2.5      |2.5 baths       |baths        |
|2.5      |2.5 shared baths|shared baths |
|4        |4 baths         |baths        |
|3.5      |3.5 baths       |baths        |
|0        |0 shared baths  |shared baths |
|1.5      |1.5 shared baths|shared baths |
|1        |1 shared bath   |shared bath  |
|1.5      |1.5 baths       |baths        |
|3.5      |3.5 shared baths|shared baths |
|0        |0 baths         |baths        |
|0.5      |0.5 bath        |bath         |
+---------+

                                                                                