# Transforming Data Using PySpark for AWS Glue

## First Import SparkSession

In [53]:
from pyspark.sql import SparkSession

## Then Create a Spark Session

In [54]:
spark = SparkSession.builder.appName("Airbnb_Warehousing").getOrCreate()

## Read the Listings CSV File

In [55]:
listing_df = spark.read\
    .format("csv")\
    .option("multiline", "true")\
    .option("quote", "\"")\
    .option("header", "true")\
    .option("escape", "\\")\
    .option("escape", "\"")\
    .option("sep", ",")\
    .option("inferSchema", "true")\
    .load("../data/listings.csv")

In [56]:
listing_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

### Looks Like the DataFrame was Correctly Read

In [57]:
listing_df.show(2)

+------+--------------------+--------------+------------+-----------+--------------------+-----------+---------------------+--------------------+-------+--------------------+-------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+---------+-------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+-----------

# Separating Tables

## Dropping Redundant/Empty Columns

In [58]:
columns_to_remove = [
                        "calendar_last_scraped",
                        "description",
                        "calendar_updated", 
                        "bedrooms", 
                        "bathrooms", 
                        "neighbourhood_group_cleansed", 
                        "amenities"
                    ]
listing_df = listing_df.drop(*columns_to_remove)

## Correcting Misspell

In [59]:
listing_df = listing_df.withColumnRenamed("neighborhood_overview", "neighbourhood_overview")

In [60]:
# Looks like the rows have been dropped
listing_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighbourhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_lis

## Transforming Listing Table

In [61]:
listing_df.createOrReplaceTempView("listing_df_view")

In [62]:
from pyspark.sql.functions import udf, col, split
from pyspark.sql.types import StringType, IntegerType

# UDF to transform host_verifications to a 3 character string
# index 0 = email, index 1 = phone, index 2 = work_email. 1 = verified, 0 = not verified
def hqad_df_host_verifications_transform(verifs_list):
   res = ""

   verifs = ["email", "phone", "work_email"]

   for verif in verifs:
      if verif in verifs_list:
         res += "1"
      else:
         res += "0"

   return res


# UDF To convert t/f to 1/0 respectively
def t_f_to_1_0(t_f):
    if t_f == "t":
        return 1
    else:
        return 0


hqad_array_transform = udf(hqad_df_host_verifications_transform, StringType())
truefalse_to_10 = udf(t_f_to_1_0, IntegerType())

# Query to get row number, replace percent to decimal, change host_response_time to shorter key, and change t/f to 1/0
listing_df_query = """
                    SELECT id,
                        host_id,
                        host_url,
                        host_name,
                        host_since,
                        host_location,
                        host_about,
                        host_thumbnail_url,
                        host_picture_url,
                        host_neighbourhood,
                        CASE host_response_time
                            WHEN 'within an hour' THEN 'H'
                            WHEN 'within a few hours' THEN 'FH'
                            WHEN 'within a dat' THEN 'D'
                            WHEN 'a few days or more' THEN 'D+'
                            ELSE NULL
                        END AS host_response_time, 
                        CAST(REPLACE(host_response_rate, '%', '') / 100 AS DECIMAL(3,2)) AS host_response_rate,
                        CAST(REPLACE(host_acceptance_rate, '%', '') / 100 AS DECIMAL(3,2)) AS host_acceptance_rate,
                        host_is_superhost,
                        host_listings_count, 
                        host_total_listings_count, 
                        host_verifications, 
                        host_has_profile_pic,
                        host_identity_verified,
                        calculated_host_listings_count,
                        calculated_host_listings_count_entire_homes,
                        calculated_host_listings_count_private_rooms,
                        calculated_host_listings_count_shared_rooms,
                        latitude,
                        longitude,
                        property_type,
                        room_type,
                        accommodates,
                        CASE
                            WHEN INSTR(bathrooms_text, 'Private half-bath') > 0 THEN '0.5 private bath'
                            WHEN INSTR(bathrooms_text, 'Half-bath') > 0 THEN '0.5 bath'
                            ELSE bathrooms_text
                        END as bathrooms,
                        beds,
                        cast(replace(price, '$', '') as decimal(10,2)) as price,
                        number_of_reviews,
                        number_of_reviews_ltm,
                        number_of_reviews_l30d,
                        first_review,
                        last_review,
                        review_scores_rating,
                        review_scores_accuracy,
                        review_scores_cleanliness,
                        review_scores_checkin,
                        review_scores_communication,
                        review_scores_location,
                        review_scores_value,
                        reviews_per_month,
                        scrape_id,
                        last_scraped,
                        source,
                        neighbourhood,
                        neighbourhood_overview,
                        neighbourhood_cleansed,
                        maximum_nights,
                        minimum_nights,
                        minimum_minimum_nights,
                        maximum_minimum_nights,
                        minimum_maximum_nights,
                        maximum_maximum_nights,
                        minimum_nights_avg_ntm,
                        maximum_nights_avg_ntm,
                        has_availability,
                        availability_30,
                        availability_60,
                        availability_90,
                        availability_365,
                        listing_url,
                        name,
                        picture_url,
                        license,
                        instant_bookable
                    FROM listing_df_view
"""

listing_df = spark.sql(listing_df_query)
# Splitting the bathroom column into two
property_baths_split = split(col("bathrooms"), ' ', limit=2)
# Adding the two new columns
listing_df = listing_df.withColumn("bathroom_desc", property_baths_split.getItem(1))
listing_df = listing_df.withColumn("bathrooms", property_baths_split.getItem(0))
listing_df = listing_df.withColumn("host_verifications", hqad_array_transform(col("host_verifications")))
listing_df = listing_df.withColumns(
   {
      "host_is_superhost": truefalse_to_10(col("host_is_superhost")), 
      "host_has_profile_pic": truefalse_to_10(col("host_has_profile_pic")), 
      "host_identity_verified": truefalse_to_10(col("host_identity_verified")), 
      "has_availability": truefalse_to_10(col("has_availability")), 
      "instant_bookable": truefalse_to_10(col("instant_bookable"))
   }
)

listing_df.show(10)

[Stage 184:>                                                        (0 + 1) / 1]

+-------+-------+--------------------+-----------------+----------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+--------+---------+--------------------+---------------+------------+---------+----+------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+-----------------+--------------+------------+-----------+--------------------+----------------------+----------------------+-------------

                                                                                

### Whew! That was One Long Transformation!

In [63]:
# Recreating Listings View
listing_df.createOrReplaceTempView("listing_df_view")

## Creating Host Tables

### Host Table

In [64]:
host_df = listing_df.select(
    "host_id",
    "host_url",
    "host_name",
    "host_since",
    "host_location",
    "host_about",
    "host_thumbnail_url",
    "host_picture_url",
    "host_neighbourhood",
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
)

host_df = host_df.dropDuplicates()

print("count: " + str(host_df.count()))

host_df.show(2)

                                                                                

count: 977
+---------+--------------------+---------+----------+-------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+
|  host_id|            host_url|host_name|host_since|host_location|          host_about|  host_thumbnail_url|    host_picture_url|host_neighbourhood|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_listings_count|host_total_listings_count|host_verifications|host_has_profile_pic|host_identity_verified|calculated_host_listings_count|calculated_host_listings_count_entire_homes|calculated_host_listings_count_private_rooms|calculated_host_listings_count_sha

                                                                                

### Host Dimension Tables

In [65]:
from pyspark.sql.functions import monotonically_increasing_id
# Host Qualifications and Diagnostics (HQAD)
hqad_df = host_df.select(
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified"
)

hqad_df = hqad_df.dropDuplicates()

hqad_df = hqad_df.withColumn("hqad_id", monotonically_increasing_id())

print("count: " + str(hqad_df.count()))
hqad_df.printSchema()
hqad_df.show(2)

                                                                                

count: 607
root
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: decimal(3,2) (nullable = true)
 |-- host_acceptance_rate: decimal(3,2) (nullable = true)
 |-- host_is_superhost: integer (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_verifications: string (nullable = true)
 |-- host_has_profile_pic: integer (nullable = true)
 |-- host_identity_verified: integer (nullable = true)
 |-- hqad_id: long (nullable = false)

+------------------+------------------+--------------------+-----------------+-------------------+-------------------------+------------------+--------------------+----------------------+-------+
|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_listings_count|host_total_listings_count|host_verifications|host_has_profile_pic|host_identity_verified|hqad_id|
+------------------+------------------+--------------------+-----------

In [66]:
# The Host Listings Diagnostics (HLD)
hld_df = host_df.select(
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms"
)

hld_df = hld_df.dropDuplicates()

hld_df = hld_df.withColumn("hld_id", monotonically_increasing_id())

print("count: " + str(hld_df.count()))

hld_df.printSchema()
hld_df.show(2)

count: 69
root
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- calculated_host_listings_count_entire_homes: integer (nullable = true)
 |-- calculated_host_listings_count_private_rooms: integer (nullable = true)
 |-- calculated_host_listings_count_shared_rooms: integer (nullable = true)
 |-- hld_id: long (nullable = false)

+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+------+
|calculated_host_listings_count|calculated_host_listings_count_entire_homes|calculated_host_listings_count_private_rooms|calculated_host_listings_count_shared_rooms|hld_id|
+------------------------------+-------------------------------------------+--------------------------------------------+-------------------------------------------+------+
|                             2|                                          1|                                           1|               

## Next, the Property Dimension Table

In [67]:
property_df = listing_df.select(
    "latitude",
    "longitude", 
    "property_type", 
    "room_type", 
    "accommodates",
    "bathrooms",
    "bathroom_desc",
    "beds",
    "price"
)

property_df = property_df.dropDuplicates()

property_df = property_df.withColumn("property_id", monotonically_increasing_id())

print("count: " + str(property_df.count()))

property_df.printSchema()
property_df.show(2)

count: 2649
root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bathrooms: string (nullable = true)
 |-- bathroom_desc: string (nullable = true)
 |-- beds: integer (nullable = true)
 |-- price: decimal(10,2) (nullable = true)
 |-- property_id: long (nullable = false)

+--------+---------+--------------------+---------------+------------+---------+-------------+----+-----+-----------+
|latitude|longitude|       property_type|      room_type|accommodates|bathrooms|bathroom_desc|beds|price|property_id|
+--------+---------+--------------------+---------------+------------+---------+-------------+----+-----+-----------+
|40.02415|-83.03233|Private room in home|   Private room|           1|      1.5| shared baths|   1|32.00|          0|
|40.01803|-83.00057|  Entire rental unit|Entire home/apt|           1|        1|         

## Now the Reviews Diagnostics Table

In [68]:
reviews_diagnostics_df = listing_df.select(
    "number_of_reviews",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "first_review",
    "last_review",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "reviews_per_month"
)

reviews_diagnostics_df = reviews_diagnostics_df.dropDuplicates()

reviews_diagnostics_df = reviews_diagnostics_df.withColumn("rev_diag_id", monotonically_increasing_id())

print("count: " + str(reviews_diagnostics_df.count()))

reviews_diagnostics_df.printSchema()
reviews_diagnostics_df.show(5, truncate=False)

count: 2272
root
 |-- number_of_reviews: integer (nullable = true)
 |-- number_of_reviews_ltm: integer (nullable = true)
 |-- number_of_reviews_l30d: integer (nullable = true)
 |-- first_review: date (nullable = true)
 |-- last_review: date (nullable = true)
 |-- review_scores_rating: double (nullable = true)
 |-- review_scores_accuracy: double (nullable = true)
 |-- review_scores_cleanliness: double (nullable = true)
 |-- review_scores_checkin: double (nullable = true)
 |-- review_scores_communication: double (nullable = true)
 |-- review_scores_location: double (nullable = true)
 |-- review_scores_value: double (nullable = true)
 |-- reviews_per_month: double (nullable = true)
 |-- rev_diag_id: long (nullable = false)

+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+-----------------

## Scrapings Dimension Table

In [69]:
scrapings_df = listing_df.select(
    "scrape_id",
    "last_scraped",
    "source"
)

scrapings_df = scrapings_df.dropDuplicates()

scrapings_df = scrapings_df.withColumn("scraping_id", monotonically_increasing_id())

print("count: " + str(scrapings_df.count()))

scrapings_df.printSchema()
scrapings_df.show(5, truncate=False)

count: 3
root
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- scraping_id: long (nullable = false)

+--------------+------------+---------------+-----------+
|scrape_id     |last_scraped|source         |scraping_id|
+--------------+------------+---------------+-----------+
|20231225202549|2023-12-26  |previous scrape|0          |
|20231225202549|2023-12-26  |city scrape    |1          |
|20231225202549|2023-12-25  |city scrape    |2          |
+--------------+------------+---------------+-----------+



## Neighbourhood Dimension Table


In [70]:
neighbourhood_df = listing_df.select(
    "neighbourhood",
    "neighbourhood_overview",
    "neighbourhood_cleansed"
)

neighbourhood_df = neighbourhood_df.dropDuplicates()

neighbourhood_df = neighbourhood_df.withColumn("neighbourhood_id", monotonically_increasing_id())

print("count: " + str(neighbourhood_df.count()))

neighbourhood_df.printSchema()
neighbourhood_df.show(5)

count: 1299
root
 |-- neighbourhood: string (nullable = true)
 |-- neighbourhood_overview: string (nullable = true)
 |-- neighbourhood_cleansed: string (nullable = true)
 |-- neighbourhood_id: long (nullable = false)

+--------------------+----------------------+----------------------+----------------+
|       neighbourhood|neighbourhood_overview|neighbourhood_cleansed|neighbourhood_id|
+--------------------+----------------------+----------------------+----------------+
|Hilliard, Ohio, U...|  We're about a 15 ...|              Far West|               0|
|Columbus, Ohio, U...|  Situated in a his...|             Near East|               1|
|Columbus, Ohio, U...|  The Old North nei...|  Near North/Univer...|               2|
|Columbus, Ohio, U...|  The neighborhood ...|          North Linden|               3|
|Columbus, Ohio, U...|  You are in the Cr...|        West Olentangy|               4|
+--------------------+----------------------+----------------------+----------------+
only sho

## MinMax Insights Dimension Table

In [71]:
minmax_insights_df = listing_df.select(
    "maximum_nights",
    "minimum_nights",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm"
)

minmax_insights_df = minmax_insights_df.dropDuplicates()

minmax_insights_df = minmax_insights_df.withColumn("minmax_insights_id", monotonically_increasing_id())

print("count: " + str(minmax_insights_df.count()))

minmax_insights_df.printSchema()
minmax_insights_df.show(5, truncate=False)

count: 795
root
 |-- maximum_nights: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- minimum_minimum_nights: integer (nullable = true)
 |-- maximum_minimum_nights: integer (nullable = true)
 |-- minimum_maximum_nights: integer (nullable = true)
 |-- maximum_maximum_nights: integer (nullable = true)
 |-- minimum_nights_avg_ntm: double (nullable = true)
 |-- maximum_nights_avg_ntm: double (nullable = true)
 |-- minmax_insights_id: long (nullable = false)

+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+------------------+
|maximum_nights|minimum_nights|minimum_minimum_nights|maximum_minimum_nights|minimum_maximum_nights|maximum_maximum_nights|minimum_nights_avg_ntm|maximum_nights_avg_ntm|minmax_insights_id|
+--------------+--------------+----------------------+----------------------+----------------------+----------------------+-----------

## Availibility Dimension Table

In [72]:
availibility_df = listing_df.select(
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365"
)

availibility_df = availibility_df.dropDuplicates()

availibility_df = availibility_df.withColumn("avail_id", monotonically_increasing_id())

print("count: " + str(availibility_df.count()))

availibility_df.printSchema()
availibility_df.show(5, truncate=False)

count: 1336
root
 |-- has_availability: integer (nullable = true)
 |-- availability_30: integer (nullable = true)
 |-- availability_60: integer (nullable = true)
 |-- availability_90: integer (nullable = true)
 |-- availability_365: integer (nullable = true)
 |-- avail_id: long (nullable = false)

+----------------+---------------+---------------+---------------+----------------+--------+
|has_availability|availability_30|availability_60|availability_90|availability_365|avail_id|
+----------------+---------------+---------------+---------------+----------------+--------+
|1               |1              |31             |61             |151             |0       |
|1               |0              |0              |0              |18              |1       |
|1               |22             |52             |82             |357             |2       |
|1               |23             |49             |73             |345             |3       |
|1               |0              |0              |

# Joining Tables Together

### Host Tables

In [73]:
hqad_host_conditions = [
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified"
]

hld_host_conditions = [
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms"
]


host_df = host_df.join(hqad_df, on=hqad_host_conditions, how="left")\
                 .join(hld_df, on=hld_host_conditions, how="left")\
                 .select(
    "host_id",
    "host_url",
    "host_name",
    "host_since",
    "host_location",
    "host_about",
    "host_thumbnail_url",
    "host_picture_url",
    "host_neighbourhood",
    "hld_id",
    "hqad_id"
)

In [74]:
print("count: " + str(host_df.count()))

host_df.show(5)

count: 977


                                                                                

+---------+--------------------+---------+----------+-------------+--------------------+--------------------+--------------------+--------------------+------+-------+
|  host_id|            host_url|host_name|host_since|host_location|          host_about|  host_thumbnail_url|    host_picture_url|  host_neighbourhood|hld_id|hqad_id|
+---------+--------------------+---------+----------+-------------+--------------------+--------------------+--------------------+--------------------+------+-------+
|182105923|https://www.airbn...|    Holly|2018-04-02| Columbus, OH|Food, nature, art...|https://a0.muscac...|https://a0.muscac...|      Dennison Place|     2|    409|
| 56664331|https://www.airbn...|   Wesley|2016-01-29| Columbus, OH|Young traveler lo...|https://a0.muscac...|https://a0.muscac...|      Near East Side|     7|    467|
|  8325110|https://www.airbn...|     Tara|2013-08-22| Columbus, OH|Hello, worldwide ...|https://a0.muscac...|https://a0.muscac...|           Northland|     2|    471

## Final Join

In [75]:
property_join_conditions = [
    "latitude",
    "longitude",
    "price",
    "beds",
    "bathroom_desc",
    "accommodates"
]

reviews_join_conditions = [
    "number_of_reviews",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "first_review",
    "last_review",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "reviews_per_month"
]

minmax_join_conditions = [
    "minimum_nights",
    "maximum_nights",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm"
]



# final_df =
listing_df.join(host_df, on="host_id", how="left")\
          .join(property_df, on=property_join_conditions, how="left")\
          .join(reviews_diagnostics_df, on=reviews_join_conditions, how="left")\
          .join(scrapings_df, on=["last_scraped", "source"], how="left")\
          .join(neighbourhood_df, on=["neighbourhood_overview", "neighbourhood_cleansed"], how="left")\
          .join(minmax_insights_df, on=minmax_join_conditions, how="left")\
          .join(availibility_df, on=["has_availability", "availability_30", "availability_60", "availability_90", "availability_365"], how="left")\
          .select(
              "id",
              "scraping_id",
              "host_id",
              "neighbourhood_id",
              "property_id",
              "minmax_insights_id",
              "avail_id",
              "rev_diag_id",
              "listing_url",
              "name",
              "picture_url",
              "license",
              "instant_bookable"
          ).show(2)

[Stage 302:>                                                        (0 + 1) / 1]

+------+-----------+-------+----------------+-----------+------------------+--------+-----------+--------------------+--------------------+--------------------+---------+----------------+
|    id|scraping_id|host_id|neighbourhood_id|property_id|minmax_insights_id|avail_id|rev_diag_id|         listing_url|                name|         picture_url|  license|instant_bookable|
+------+-----------+-------+----------------+-----------+------------------+--------+-----------+--------------------+--------------------+--------------------+---------+----------------+
| 90676|          1| 483306|              25|        353|               750|    1149|        238|https://www.airbn...|Home in Columbus ...|https://a0.muscac...|2022-2475|               0|
|543140|          2|2350409|             800|       1234|               657|     185|        882|https://www.airbn...|Home in Columbus ...|https://a0.muscac...|2019-1344|               0|
+------+-----------+-------+----------------+-----------+---

                                                                                