# Loading the Data

The first step is to load all of the CSV files into Spark so we can work with them.

In [1]:
import os
import sys

def configure_hadoop(hadoop_home=r'C:\hadoop'):
    """Expose a local Hadoop install to this process via environment variables.

    Sets ``HADOOP_HOME`` and ``hadoop.home.dir`` to ``hadoop_home`` and makes
    the ``bin`` directory underneath it the first entry of ``PATH``.

    The function is safe to call repeatedly: an existing ``PATH`` entry for the
    same ``bin`` directory is moved to the front instead of being duplicated,
    so re-running this cell does not keep growing ``PATH``.

    Args:
        hadoop_home: Root directory of the Hadoop install.
    """
    hadoop_bin = os.path.join(hadoop_home, 'bin')

    os.environ['HADOOP_HOME'] = hadoop_home
    # NOTE(review): 'hadoop.home.dir' is normally a JVM system property rather
    # than an environment variable; kept because the original cell set it --
    # confirm whether anything actually reads it.
    os.environ['hadoop.home.dir'] = hadoop_home

    # Tolerate a missing PATH instead of raising KeyError.
    current_path = os.environ.get('PATH', '')
    if current_path:
        other_entries = [
            entry for entry in current_path.split(os.pathsep)
            if entry != hadoop_bin
        ]
    else:
        other_entries = []
    os.environ['PATH'] = os.pathsep.join([hadoop_bin] + other_entries)


configure_hadoop()

print("Hadoop configured for Windows")


 Hadoop configured for Windows


Initializing Spark for my system.

In [2]:
from pyspark.sql import SparkSession

# Assemble the session step by step: application name first, then the
# driver memory setting, and finally obtain the session itself.
session_builder = SparkSession.builder.appName("LondonAirbnbAnalysis")
session_builder = session_builder.config("spark.driver.memory", "4g")
spark = session_builder.getOrCreate()

# Report the Spark version as a quick sanity check that the session is up.
print(f"Spark version: {spark.version}")

Spark version: 3.4.0


In [3]:
def read_raw_csv(path, **extra_options):
    """Read one CSV file with the reader settings shared by every input.

    The first line is used as the header, column types are inferred, a double
    quote is the escape character, and the parse mode is PERMISSIVE.  Any
    ``extra_options`` (for example ``multiLine=True``) are forwarded to the
    reader unchanged.
    """
    return spark.read.csv(
        path,
        header=True,
        inferSchema=True,
        escape='"',
        mode="PERMISSIVE",
        **extra_options,
    )


# Listings and reviews are read with multi-line records enabled;
# the calendar file is read with the reader's default for that option.
listings_raw = read_raw_csv("data/listings.csv", multiLine=True)
reviews_raw = read_raw_csv("data/reviews.csv", multiLine=True)
calendar_raw = read_raw_csv("data/calendar.csv")

# Row counts for each dataset, reported in load order.
row_count_report = (
    ("\nTotal listings", listings_raw),
    ("Total reviews", reviews_raw),
    ("Total calendar", calendar_raw),
)
for label, frame in row_count_report:
    print(f"{label}: {frame.count()}")



Total listings: 96871
Total reviews: 2097996
Total calendar: 35357974


In [6]:
from pyspark.sql.functions import col, when, isnan

# Show a sample of price values next to a few identifying columns,
# so the raw formatting of the price column can be inspected.
print("\nSample price values:")
listings_raw.select("id", "price", "neighbourhood", "room_type").show(20, truncate=False)

# Count rows whose price is null (other kinds of invalid values are not counted here).
null_prices = listings_raw.filter(col("price").isNull()).count()
print(f"\nNull prices: {null_prices}")

# Report the inferred type of the price column and its most frequent values.
print("\nPrice column data types and samples:")
# A bare expression in the middle of a cell is never displayed (only the
# cell's last expression is), so the dtypes must be printed explicitly.
print(listings_raw.select("price").dtypes)
listings_raw.groupBy("price").count().orderBy("count", ascending=False).show(20, truncate=False)



Sample price values:
+-----+-------+-----------------------+---------------+
|id   |price  |neighbourhood          |room_type      |
+-----+-------+-----------------------+---------------+
|13913|$70.00 |Neighborhood highlights|Private room   |
|15400|$149.00|Neighborhood highlights|Entire home/apt|
|17402|$411.00|Neighborhood highlights|Entire home/apt|
|24328|null   |Neighborhood highlights|Entire home/apt|
|36274|$210.00|null                   |Entire home/apt|
|36299|$280.00|Neighborhood highlights|Entire home/apt|
|36660|$90.00 |Neighborhood highlights|Private room   |
|38605|$61.00 |Neighborhood highlights|Private room   |
|38610|$340.00|Neighborhood highlights|Entire home/apt|
|38995|$49.00 |Neighborhood highlights|Private room   |
|39387|null   |Neighborhood highlights|Private room   |
|41445|$213.00|Neighborhood highlights|Entire home/apt|
|41509|null   |Neighborhood highlights|Private room   |
|41712|$96.00 |Neighborhood highlights|Private room   |
|41870|null   |Neighborhoo

In [7]:
# Persist each raw DataFrame as Parquet, replacing any earlier export
# at the same location.
raw_exports = (
    (listings_raw, "output/raw_listings.parquet"),
    (reviews_raw, "output/raw_reviews.parquet"),
    (calendar_raw, "output/raw_calendar.parquet"),
)
for frame, target_path in raw_exports:
    frame.write.mode("overwrite").parquet(target_path)

print("Raw data saved!")
print("\nNext: Open notebook 02 to check data quality")

Raw data saved!

Next: Open notebook 02 to check data quality
