# Checking Data Quality

Before cleaning, let's see what problems exist in the data.

In [1]:
import os
import sys

# Windows Hadoop fix
os.environ['HADOOP_HOME'] = r'C:\hadoop'
os.environ['hadoop.home.dir'] = r'C:\hadoop'
os.environ['PATH'] = r'C:\hadoop\bin;' + os.environ['PATH']

print("Hadoop configured for Windows")

Hadoop configured for Windows


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

spark = SparkSession.builder.appName("DataQuality").config("spark.driver.memory", "4g").getOrCreate()

listings = spark.read.parquet("output/raw_listings.parquet")
reviews = spark.read.parquet("output/raw_reviews.parquet")
calendar = spark.read.parquet("output/raw_calendar.parquet")

Displaying count

In [3]:
print(f"Listings: {listings.count()}")
print(f"Reviews: {reviews.count()}")
print(f"Calendar: {calendar.count()}")

Listings: 96871
Reviews: 2097996
Calendar: 35357974


Checking for missing values in important columns:

In [4]:
listings.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in ["price", "neighbourhood", "room_type", "host_since"]]
).show()

+-----+-------------+---------+----------+
|price|neighbourhood|room_type|host_since|
+-----+-------------+---------+----------+
|34908|        55662|        0|        41|
+-----+-------------+---------+----------+



Looking at price values:

In [5]:
listings.select("price").describe().show()

invalid_prices = listings.filter((col("price") <= 0) | (col("price") > 10000))
print(f"\nInvalid prices found: {invalid_prices.count()}")

+-------+---------+
|summary|    price|
+-------+---------+
|  count|    61963|
|   mean|     null|
| stddev|     null|
|    min|$1,000.00|
|    max|  $999.00|
+-------+---------+


Invalid prices found: 0


Checking reviews:

In [6]:
# London only dataset - no city grouping needed
null_dates = reviews.filter(col("date").isNull())
print(f"Reviews with missing dates: {null_dates.count()}")

Reviews with missing dates: 0


Checking calendar:

In [7]:
# London only dataset - no city grouping needed
print("Availability values:")
calendar.groupBy("available").count().show()

Availability values:
+---------+--------+
|available|   count|
+---------+--------+
|        f|21313866|
|        t|14044108|
+---------+--------+

