# Creating New Features

Adding useful calculated fields to help with analysis.

In [1]:
import os
import sys

# Windows Hadoop fix
# Point the Hadoop-related environment variables at a local Hadoop directory.
# NOTE(review): presumably C:\hadoop\bin holds winutils.exe for Spark on
# Windows - confirm against the local setup.
HADOOP_HOME = r'C:\hadoop'
HADOOP_BIN = HADOOP_HOME + r'\bin'

if os.name == 'nt':
    os.environ['HADOOP_HOME'] = HADOOP_HOME
    os.environ['hadoop.home.dir'] = HADOOP_HOME

    # Prepend the bin directory only when it is not already on PATH, so
    # re-running this cell does not keep growing PATH; .get() tolerates an
    # environment where PATH is not defined at all.
    current_path = os.environ.get('PATH', '')
    if HADOOP_BIN not in current_path.split(os.pathsep):
        os.environ['PATH'] = HADOOP_BIN + os.pathsep + current_path

    print("Hadoop configured for Windows")
else:
    # The Windows paths above are meaningless elsewhere; leave the
    # environment untouched rather than overriding a real HADOOP_HOME.
    print("Non-Windows platform detected; skipping Windows Hadoop configuration")

Hadoop configured for Windows


In [2]:
from pyspark.sql import SparkSession
# NOTE(review): the star import below rebinds Python built-ins such as sum,
# max and min to the Spark column functions of the same name; the aggregation
# cells further down rely on that.
from pyspark.sql.functions import *

# Create (or reuse) the Spark session, with spark.driver.memory set to 4g.
spark = (
    SparkSession.builder
    .appName("FeatureEngineering")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Read the three cleaned parquet datasets, in the same order as the names
# on the left-hand side.
listings, reviews, calendar = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "output/listings_clean.parquet",
        "output/reviews_clean.parquet",
        "output/calendar_clean.parquet",
    )
)

print(f"Loaded {listings.count()} listings")

Loaded 61954 listings


New feature: host experience in years (time elapsed since `host_since`; rows with a missing `host_since` get 0):

In [3]:
# Host experience in completed years, measured from host_since to today.
# months_between follows real calendar month lengths, so the value only
# increases on the actual anniversary; dividing a day count by 365 ignores
# leap days and ticks over a few days early for long-standing hosts.
# The cast to int truncates toward zero, and rows where the result is null
# (e.g. host_since is null) are filled with 0.
# NOTE(review): current_date() makes this column depend on the day the
# notebook is run - consider a fixed reference date if reproducibility matters.
listings = listings.withColumn(
    "host_experience_years",
    (months_between(current_date(), col("host_since")) / 12).cast("int")
).fillna({"host_experience_years": 0})

listings.select("host_since", "host_experience_years").show(5)

+----------+---------------------+
|host_since|host_experience_years|
+----------+---------------------+
|2009-11-16|                   16|
|2009-12-05|                   15|
|2010-01-04|                   15|
|2010-05-27|                   15|
|2010-06-30|                   15|
+----------+---------------------+
only showing top 5 rows



Grouping prices into categories:

In [4]:
# Bucket each listing's price into a band:
#   low    : price < 100
#   medium : 100 <= price < 300
#   high   : price >= 300
# Every band has an explicit condition and there is no otherwise(), so a
# null price yields a null band instead of being silently labelled "high".
LOW_PRICE_LIMIT = 100     # upper bound (exclusive) of the "low" band
MEDIUM_PRICE_LIMIT = 300  # upper bound (exclusive) of the "medium" band

listings = listings.withColumn(
    "price_band",
    when(col("price") < LOW_PRICE_LIMIT, "low")
    # Conditions are evaluated in order, so this branch only sees prices >= 100.
    .when(col("price") < MEDIUM_PRICE_LIMIT, "medium")
    .when(col("price") >= MEDIUM_PRICE_LIMIT, "high")
)

listings.groupBy("price_band").count().show()

+----------+-----+
|price_band|count|
+----------+-----+
|       low|21944|
|      high| 8972|
|    medium|31038|
+----------+-----+



New feature: cost per person (price divided by `accommodates`):

In [5]:
# Price divided by the listing's capacity.
# When accommodates is not greater than 0 (or is null), the condition does
# not match and the undivided price is used instead.
price_col = col("price")
capacity_col = col("accommodates")

per_person_expr = (
    when(capacity_col > 0, price_col / capacity_col)
    .otherwise(price_col)
)
listings = listings.withColumn("price_per_person", per_person_expr)

listings.select("price", "accommodates", "price_per_person").show(5)

+-----+------------+----------------+
|price|accommodates|price_per_person|
+-----+------------+----------------+
| 70.0|           1|            70.0|
|149.0|           2|            74.5|
|411.0|           6|            68.5|
|210.0|           4|            52.5|
|280.0|           5|            56.0|
+-----+------------+----------------+
only showing top 5 rows



New feature: reviews per year of host experience:

In [6]:
# Number of reviews divided by the host's years of experience.
# When host_experience_years is not greater than 0, the raw review count is
# used as-is, which also avoids dividing by zero.
review_count_col = col("number_of_reviews")
experience_col = col("host_experience_years")

review_rate_expr = (
    when(experience_col > 0, review_count_col / experience_col)
    .otherwise(review_count_col)
)
listings = listings.withColumn("review_rate", review_rate_expr)

listings.select("number_of_reviews", "host_experience_years", "review_rate").show(5)

+-----------------+---------------------+------------------+
|number_of_reviews|host_experience_years|       review_rate|
+-----------------+---------------------+------------------+
|               55|                   16|            3.4375|
|               97|                   15| 6.466666666666667|
|               56|                   15|3.7333333333333334|
|               15|                   15|               1.0|
|              116|                   15| 7.733333333333333|
+-----------------+---------------------+------------------+
only showing top 5 rows



Summarizing reviews for each listing:

In [7]:
# One row per listing_id: how many review rows it has, plus the latest and
# earliest values of the date column.
# count / max / min here are the Spark aggregate functions brought in by the
# star import, not the Python built-ins.
review_summary_exprs = [
    count("*").alias("total_reviews"),
    max("date").alias("last_review_date"),
    min("date").alias("first_review_date"),
]
reviews_agg = reviews.groupBy("listing_id").agg(*review_summary_exprs)

reviews_agg.show(5)

+----------+-------------+----------------+-----------------+
|listing_id|total_reviews|last_review_date|first_review_date|
+----------+-------------+----------------+-----------------+
|    466017|           28|      2018-08-12|       2013-09-30|
|    991477|            5|      2025-07-23|       2022-03-05|
|   2557853|           92|      2025-08-18|       2014-04-28|
|   2736493|            4|      2023-12-20|       2021-07-21|
|   3132302|            3|      2017-01-07|       2014-07-04|
+----------+-------------+----------------+-----------------+
only showing top 5 rows



Calculating availability from calendar:

In [8]:
# One row per listing_id summarising its calendar rows:
#   available_days     - rows whose available flag equals "t"
#   total_days         - all calendar rows for the listing
#   avg_calendar_price - mean of the calendar price column
#   availability_rate  - available_days / total_days
# NOTE(review): the recorded output shows avg_calendar_price as null for every
# displayed row - confirm the calendar price column is populated and numeric.
is_available = col("available") == "t"

calendar_metric_exprs = [
    sum(when(is_available, 1).otherwise(0)).alias("available_days"),
    count("*").alias("total_days"),
    avg("price").alias("avg_calendar_price"),
]
calendar_totals = calendar.groupBy("listing_id").agg(*calendar_metric_exprs)

calendar_agg = calendar_totals.withColumn(
    "availability_rate",
    col("available_days") / col("total_days"),
)

calendar_agg.show(5)

+-------------------+--------------+----------+------------------+------------------+
|         listing_id|available_days|total_days|avg_calendar_price| availability_rate|
+-------------------+--------------+----------+------------------+------------------+
|1389714044313495878|           347|       365|              null|0.9506849315068493|
|1395519410525879568|             0|       365|              null|               0.0|
|1395880156017487276|           361|       365|              null| 0.989041095890411|
|1398106508045788005|           213|       365|              null|0.5835616438356165|
|1399429472922023617|           157|       365|              null|0.4301369863013699|
+-------------------+--------------+----------+------------------+------------------+
only showing top 5 rows



Combining everything together:

In [9]:
# Attach the per-listing review summary. A left join keeps every listing,
# and the duplicate key column from the right-hand side is dropped.
listings_with_reviews = listings.join(
    reviews_agg,
    on=listings["id"] == reviews_agg["listing_id"],
    how="left",
).drop(reviews_agg["listing_id"])

# Attach the per-listing calendar summary in the same way.
enriched = listings_with_reviews.join(
    calendar_agg,
    on=listings_with_reviews["id"] == calendar_agg["listing_id"],
    how="left",
).drop(calendar_agg["listing_id"])

print(f"Final dataset: {enriched.count()} records")

Final dataset: 61954 records


In [10]:
# Persist the enriched dataset, replacing any previous output at this path.
enriched_output_path = "output/enriched_data.parquet"
enriched.write.parquet(enriched_output_path, mode="overwrite")

print("Enriched data saved.")

# London only dataset - show neighbourhood instead of city
# NOTE(review): in the recorded output the neighbourhood column mostly shows
# "Neighborhood high..." / "Unknown", which looks like placeholder text rather
# than an area name - confirm this is the intended column.
preview_columns = [
    "neighbourhood",
    "price",
    "price_band",
    "price_per_person",
    "host_experience_years",
    "review_rate",
]
print("\nSample of enriched data:")
enriched.select(*preview_columns).show(10)

Enriched data saved.

Sample of enriched data:
+--------------------+-----+----------+-----------------+---------------------+------------------+
|       neighbourhood|price|price_band| price_per_person|host_experience_years|       review_rate|
+--------------------+-----+----------+-----------------+---------------------+------------------+
|Neighborhood high...| 70.0|       low|             70.0|                   16|            3.4375|
|Neighborhood high...|149.0|    medium|             74.5|                   15| 6.466666666666667|
|Neighborhood high...|411.0|      high|             68.5|                   15|3.7333333333333334|
|             Unknown|210.0|    medium|             52.5|                   15|               1.0|
|Neighborhood high...|280.0|    medium|             56.0|                   15| 7.733333333333333|
|Neighborhood high...| 90.0|       low|             45.0|                   15|48.666666666666664|
|Neighborhood high...| 61.0|       low|             30.5|     