# Cleaning the Data

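Hadoop fix: set the `HADOOP_HOME` and `PATH` environment variables so the local Hadoop install at `C:\hadoop` is used on Windows.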

In [11]:
import os
import sys

# Set the Hadoop-related environment variables for the hard-coded
# C:\hadoop install. The bin directory is prepended to PATH only when it
# is not already listed, so re-running this cell does not keep growing
# PATH with duplicate entries.
hadoop_home = r'C:\hadoop'
hadoop_bin = hadoop_home + r'\bin'

os.environ['HADOOP_HOME'] = hadoop_home
os.environ['hadoop.home.dir'] = hadoop_home

# Read PATH with a default so an unset PATH does not raise KeyError.
current_path = os.environ.get('PATH', '')
if hadoop_bin not in current_path.split(';'):
    os.environ['PATH'] = hadoop_bin + ';' + current_path

print("Hadoop configured for Windows")


Hadoop configured for Windows


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_date, trim, lower, when, length, coalesce, lit
from pyspark.sql.types import DoubleType, IntegerType, FloatType

# Create (or reuse) the Spark session for this notebook with a 4g driver.
spark = (
    SparkSession.builder
    .appName("LondonDataCleaning")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Load the three raw parquet datasets.
parquet_reader = spark.read
listings = parquet_reader.parquet("output/raw_listings.parquet")
reviews = parquet_reader.parquet("output/raw_reviews.parquet")
calendar = parquet_reader.parquet("output/raw_calendar.parquet")

listing_total = listings.count()
print(f"Loaded {listing_total} listings")

Loaded 96871 listings


In [13]:
# Peek at the raw price strings before changing anything.
print("\nBefore cleaning - sample prices:")
listings.select("price").show(10, truncate=False)

# Step 1: keep only digits and the decimal point (drops $, £, commas, ...).
stripped_price = regexp_replace(col("price"), "[^0-9.]", "")
listings_clean = listings.withColumn("price_clean", stripped_price)

# Step 2: cast to double; a string that is empty after stripping becomes null.
numeric_price = when(col("price_clean") == "", None).otherwise(
    col("price_clean").cast(DoubleType())
)
listings_clean = listings_clean.withColumn("price_numeric", numeric_price)

print("\nAfter initial cleaning:")
listings_clean.select("price", "price_clean", "price_numeric").show(10, truncate=False)

before = listings_clean.count()

# Step 3: keep rows whose price parsed and lies strictly between 0 and 50000,
# then overwrite the original price column and drop the helper columns.
price_is_usable = (
    col("price_numeric").isNotNull()
    & (col("price_numeric") > 0)
    & (col("price_numeric") < 50000)
)
listings_clean = (
    listings_clean
    .filter(price_is_usable)
    .withColumn("price", col("price_numeric"))
    .drop("price_clean", "price_numeric")
)

after = listings_clean.count()
print(f"\nPrice filtering: {before} → {after} (removed {before - after})")


Before cleaning - sample prices:
+-------+
|price  |
+-------+
|$70.00 |
|$149.00|
|$411.00|
|null   |
|$210.00|
|$280.00|
|$90.00 |
|$61.00 |
|$340.00|
|$49.00 |
+-------+
only showing top 10 rows


After initial cleaning:
+-------+-----------+-------------+
|price  |price_clean|price_numeric|
+-------+-----------+-------------+
|$70.00 |70.00      |70.0         |
|$149.00|149.00     |149.0        |
|$411.00|411.00     |411.0        |
|null   |null       |null         |
|$210.00|210.00     |210.0        |
|$280.00|280.00     |280.0        |
|$90.00 |90.00      |90.0         |
|$61.00 |61.00      |61.0         |
|$340.00|340.00     |340.0        |
|$49.00 |49.00      |49.0         |
+-------+-----------+-------------+
only showing top 10 rows


Price filtering: 96871 → 61954 (removed 34917)


In [14]:
# Lenient integer conversion: a review count that is missing, or that does
# not survive the integer cast, is recorded as 0 instead of dropping the row.
reviews_as_int = col("number_of_reviews").cast(IntegerType())
count_is_unusable = col("number_of_reviews").isNull() | reviews_as_int.isNull()
listings_clean = listings_clean.withColumn(
    "number_of_reviews_clean",
    when(count_is_unusable, 0).otherwise(reviews_as_int)
)

# The only rows removed here are those with a negative review count.
before = listings_clean.count()
listings_clean = (
    listings_clean
    .filter(col("number_of_reviews_clean") >= 0)
    .withColumn("number_of_reviews", col("number_of_reviews_clean"))
    .drop("number_of_reviews_clean")
)

after = listings_clean.count()
print(f"Number of reviews filtering: {before} → {after} (removed {before - after})")


Number of reviews filtering: 61954 → 61954 (removed 0)


In [15]:
# Normalise room_type: strip quote characters, trim whitespace, lower-case.
normalised_room_type = lower(trim(regexp_replace(col("room_type"), '["\']', '')))
listings_clean = listings_clean.withColumn("room_type_temp", normalised_room_type)

# Keyword -> canonical label, checked in this order (first match wins).
room_type_keywords = [
    ("entire", "Entire home/apt"),
    ("private", "Private room"),
    ("shared", "Shared room"),
    ("hotel", "Hotel room"),
]

room_type_temp = col("room_type_temp")
first_keyword, first_label = room_type_keywords[0]
room_type_mapping = when(room_type_temp.contains(first_keyword), first_label)
for keyword, label in room_type_keywords[1:]:
    room_type_mapping = room_type_mapping.when(room_type_temp.contains(keyword), label)

# Null or empty values become "Unknown"; anything else unrecognised is kept
# as "Other" rather than being filtered out.
room_type_mapping = (
    room_type_mapping
    .when(room_type_temp.isNull(), "Unknown")
    .when(room_type_temp == "", "Unknown")
    .otherwise("Other")
)

listings_clean = (
    listings_clean
    .withColumn("room_type_clean", room_type_mapping)
    .withColumn("room_type", col("room_type_clean"))
    .drop("room_type_temp", "room_type_clean")
)

print("Room types after cleaning:")
listings_clean.groupBy("room_type").count().show()


Room types after cleaning:
+---------------+-----+
|      room_type|count|
+---------------+-----+
|    Shared room|  191|
|     Hotel room|   72|
|Entire home/apt|42316|
|   Private room|19375|
+---------------+-----+



In [16]:
# Clean neighbourhood: null or blank values become "Unknown"; everything else
# has quote characters removed and surrounding whitespace trimmed.
# NOTE(review): the sample output of the final cell shows the value
# "Neighborhood highlights" in this column, which looks like placeholder text
# rather than a place name - confirm this is the right source column.
raw_neighbourhood = col("neighbourhood")
neighbourhood_missing = raw_neighbourhood.isNull() | (trim(raw_neighbourhood) == "")
unquoted_neighbourhood = trim(regexp_replace(raw_neighbourhood, '["\']', ''))

listings_clean = listings_clean.withColumn(
    "neighbourhood_clean",
    when(neighbourhood_missing, "Unknown").otherwise(unquoted_neighbourhood)
)

# Drop only names shorter than two characters or made up purely of digits.
before = listings_clean.count()
listings_clean = (
    listings_clean
    .filter(length(col("neighbourhood_clean")) >= 2)
    .filter(~col("neighbourhood_clean").rlike("^[0-9]+$"))
    .withColumn("neighbourhood", col("neighbourhood_clean"))
    .drop("neighbourhood_clean")
)

after = listings_clean.count()
print(f"Neighbourhood filtering: {before} → {after} (removed {before - after})")


Neighbourhood filtering: 61954 → 61954 (removed 0)


In [17]:
# Convert host_since to a date column in place. No rows are dropped here;
# the count below only reports how many rows ended up with a non-null date.
host_since_as_date = to_date(col("host_since"))
listings_clean = listings_clean.withColumn("host_since", host_since_as_date)

valid_host_since = listings_clean.where(col("host_since").isNotNull()).count()
print(f"Records with valid host_since: {valid_host_since}")

Records with valid host_since: 61929


In [18]:
# Rows are dropped only when one of these critical fields is missing.
required_columns = ["id", "price"]

# Every other listed column gets a default instead of causing a drop.
default_values = {
    "neighbourhood": "Unknown",
    "room_type": "Unknown",
    "bedrooms": 1,
    "beds": 1,
    "bathrooms": 1,
    "review_scores_rating": 0,
    "accommodates": 1,
    "number_of_reviews": 0
}

before = listings_clean.count()
listings_clean = (
    listings_clean
    .dropna(subset=required_columns)
    .fillna(default_values)
)
after = listings_clean.count()

print(f"After handling missing values: {before} → {after} (removed {before - after})")
print(f"\n Final clean listings: {after}")


After handling missing values: 61954 → 61954 (removed 0)

 Final clean listings: 61954


In [19]:
# Reviews: convert the date column, discard rows whose date is null after
# conversion, then de-duplicate on (listing_id, date, reviewer_id).
review_identity = ["listing_id", "date", "reviewer_id"]

reviews_dated = reviews.withColumn("date", to_date(col("date")))
reviews_clean = (
    reviews_dated
    .where(col("date").isNotNull())
    .dropDuplicates(review_identity)
)

clean_review_total = reviews_clean.count()
print(f"Clean reviews: {clean_review_total}")


Clean reviews: 2097952


In [20]:
# Calendar price: keep only digits and the decimal point, then cast to double
# (a string that is empty after stripping becomes null).
calendar_price_text = regexp_replace(col("price"), "[^0-9.]", "")
calendar_price_value = when(col("price_clean") == "", None).otherwise(
    col("price_clean").cast(DoubleType())
)

calendar_clean = calendar.withColumn("price_clean", calendar_price_text)
calendar_clean = calendar_clean.withColumn("price", calendar_price_value)
calendar_clean = calendar_clean.drop("price_clean")

# Convert the date column and normalise the availability flag
# (trimmed, lower-cased).
calendar_clean = calendar_clean.withColumn("date", to_date(col("date")))
calendar_clean = calendar_clean.withColumn("available", lower(trim(col("available"))))

# Keep only rows whose date is non-null after conversion.
calendar_clean = calendar_clean.filter(col("date").isNotNull())

clean_calendar_total = calendar_clean.count()
print(f"Clean calendar: {clean_calendar_total}")


Clean calendar: 35357974


In [21]:
# (label, DataFrame, destination) for each cleaned dataset, in output order.
clean_outputs = [
    ("Listings", listings_clean, "output/listings_clean.parquet"),
    ("Reviews", reviews_clean, "output/reviews_clean.parquet"),
    ("Calendar", calendar_clean, "output/calendar_clean.parquet"),
]

# Persist everything first, overwriting any previous run's output.
for _, clean_frame, destination in clean_outputs:
    clean_frame.write.mode("overwrite").parquet(destination)

print("\n Clean data saved.")
print("\nFinal counts:")
for dataset_label, clean_frame, _ in clean_outputs:
    print(f"  {dataset_label}: {clean_frame.count()}")

# Show a few cleaned listings, restricted to the key columns.
print("\nSample of clean listings:")
sample_columns = ["id", "name", "price", "neighbourhood", "room_type", "number_of_reviews"]
listings_clean.select(*sample_columns).show(10, truncate=False)



 Clean data saved.

Final counts:
  Listings: 61954
  Reviews: 2097952
  Calendar: 35357974

Sample of clean listings:
+-----+-------------------------------------------------+-----+-----------------------+---------------+-----------------+
|id   |name                                             |price|neighbourhood          |room_type      |number_of_reviews|
+-----+-------------------------------------------------+-----+-----------------------+---------------+-----------------+
|13913|Holiday London DB Room Let-on going              |70.0 |Neighborhood highlights|Private room   |55               |
|15400|Bright Chelsea  Apartment. Chelsea!              |149.0|Neighborhood highlights|Entire home/apt|97               |
|17402|Very Central Modern 3-Bed/2 Bath By Oxford St W1 |411.0|Neighborhood highlights|Entire home/apt|56               |
|36274|Bright 1 bedroom apt off brick lane in Shoreditch|210.0|Unknown                |Entire home/apt|15               |
|36299|Kew Gardens 3BR hou