# Notebook 02: Cleaning & Feature Engineering

**TerraFlow Analytics - Big Data Assessment**

This notebook focuses on processing the raw "bronze" data into a clean "silver" dataset. It addresses the requirements for data cleaning, structureing, and feature engineering to support downstream analysis and machine learning.

**Objectives:**
1. **Data Cleaning**: Handle missing values, fix data types, and remove invalid records.
2. **Feature Engineering**: Create new variables for analysis (Peak/Off-Peak, Congestion Levels, Temporal Features).
3. **Reliability Analysis**: Engineer trip reliability indicators based on SRI.
4. **Save Silver Layer**: Store the processed dataset back to HDFS for efficient querying.

In [1]:
# Configuration - Define HDFS Paths
# These paths align with the data pipeline from Notebook 01

HDFS_NAMENODE = "hdfs://namenode:9000"

# Input: Bronze layer from Notebook 01
BRONZE_INPUT_PATH = f"{HDFS_NAMENODE}/terraflow/data/processed/gtfs_bronze.parquet"

# Output: Silver layer (cleaned and feature-engineered data)
SILVER_OUTPUT_PATH = f"{HDFS_NAMENODE}/terraflow/data/processed/gtfs_silver.parquet"

# Output: Route-level statistics for dashboards
ROUTE_STATS_OUTPUT_PATH = f"{HDFS_NAMENODE}/terraflow/data/processed/route_stats.parquet"

print("âœ… Configuration loaded")
print(f"Bronze Input : {BRONZE_INPUT_PATH}")
print(f"Silver Output: {SILVER_OUTPUT_PATH}")
print(f"Route Stats  : {ROUTE_STATS_OUTPUT_PATH}")

âœ… Configuration loaded
Bronze Input : hdfs://namenode:9000/terraflow/data/processed/gtfs_bronze.parquet
Silver Output: hdfs://namenode:9000/terraflow/data/processed/gtfs_silver.parquet
Route Stats  : hdfs://namenode:9000/terraflow/data/processed/route_stats.parquet


In [2]:
# Initialize Spark Session
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

print("Initializing Spark session...")

# Stop any existing session
try:
    if 'spark' in globals() and spark is not None:
        spark.stop()
except Exception:
    pass

try:
    SparkSession._instantiatedSession = None
except Exception:
    pass


spark = (
    SparkSession.builder
    .appName("TerraFlow_DataCleaning")
    .master("local[4]")  # 4 parallel threads for distributed processing
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    
    # Performance optimization
    .config("spark.driver.memory", "2g")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.default.parallelism", "4")
    
    # HDFS connection settings
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
    
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")

print("=" * 70)
print("[SUCCESS] SPARK SESSION INITIALIZED")
print("=" * 70)
print("Spark Version :", spark.version)
print("defaultFS     :", spark._jsc.hadoopConfiguration().get("fs.defaultFS"))
print("Parallelism   :", spark.sparkContext.defaultParallelism)
print("=" * 70)



Initializing Spark session...
[SUCCESS] SPARK SESSION INITIALIZED
Spark Version : 3.3.2
defaultFS     : hdfs://namenode:9000
Parallelism   : 4


In [3]:
# 1. Load Bronze Data
print("ðŸ“Š Loading Bronze Data from HDFS...")
df = spark.read.parquet(BRONZE_INPUT_PATH)

# Display Schema
print("\n" + "="*80)
print("DATA SCHEMA")
print("="*80)
df.printSchema()

# Show sample records
print("\n" + "="*80)
print("SAMPLE RECORDS (First 5 rows)")
print("="*80)
df.show(5, truncate=False)

print("âœ… Data loaded successfully")

ðŸ“Š Loading Bronze Data from HDFS...

DATA SCHEMA
root
 |-- stop_id_from: integer (nullable = true)
 |-- stop_id_to: integer (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- time: double (nullable = true)
 |-- speed: string (nullable = true)
 |-- Number_of_trips: integer (nullable = true)
 |-- SRI: string (nullable = true)
 |-- Degree_of_congestion: string (nullable = true)


SAMPLE RECORDS (First 5 rows)
+------------+----------+------------------------------------------------------------------+------------+-----------+-----------+---------------+------------+--------------------+
|stop_id_from|stop_id_to|trip_id                                                           |arrival_time|time       |speed      |Number_of_trips|SRI         |Degree_of_congestion|
+------------+----------+------------------------------------------------------------------+------------+-----------+-----------+---------------+------------+---------------

In [4]:
# 2. Data Cleaning & Type Casting
from pyspark.sql.functions import col, sum as spark_sum, when
from pyspark.sql.types import DoubleType, IntegerType

# Get initial count
initial_count = df.count()
print(f"\nðŸ“Š Initial record count: {initial_count:,}")

# Check for missing values
print("\n" + "="*80)
print("DATA QUALITY CHECK - Missing Values")
print("="*80)

missing_counts = df.select([
    spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) 
    for c in df.columns
])
missing_counts.show(vertical=True)

# Convert columns to appropriate types
df_clean = df.withColumn("speed", col("speed").cast(DoubleType())) \
             .withColumn("SRI", col("SRI").cast(DoubleType())) \
             .withColumn("time", col("time").cast(DoubleType())) \
             .withColumn("Number_of_trips", col("Number_of_trips").cast(IntegerType())) \
             .withColumn("arrival_time", to_timestamp(col("arrival_time")))

# Handle Missing Values (Drop rows where critical metrics are null)
df_clean = df_clean.dropna(subset=["speed", "arrival_time", "SRI"])
after_null_drop = df_clean.count()

# Remove Invalid Rows (Negative speed or time)
df_clean = df_clean.filter((col("speed") >= 0) & (col("time") >= 0))
final_count = df_clean.count()

# Display cleaning summary
print("\n" + "="*80)
print("DATA CLEANING SUMMARY")
print("="*80)
print(f"Initial records:              {initial_count:,}")
print(f"After removing nulls:         {after_null_drop:,} (removed: {initial_count - after_null_drop:,})")
print(f"After removing invalid data:  {final_count:,} (removed: {after_null_drop - final_count:,})")
print(f"Total records removed:        {initial_count - final_count:,}")
print(f"Data retention rate:          {(final_count/initial_count)*100:.2f}%")

print("\nâœ… Data cleaning completed")


ðŸ“Š Initial record count: 66,913

DATA QUALITY CHECK - Missing Values
-RECORD 0-------------------
 stop_id_from         | 0   
 stop_id_to           | 0   
 trip_id              | 0   
 arrival_time         | 269 
 time                 | 0   
 speed                | 258 
 Number_of_trips      | 1   
 SRI                  | 313 
 Degree_of_congestion | 0   



NameError: name 'to_timestamp' is not defined

In [None]:
# 3. Feature Engineering
from pyspark.sql.functions import col, when, hour, lit

# A. Temporal Features (Hour of Day)
df_features = df_clean.withColumn("hour", hour("arrival_time"))

# B. Peak vs Off-Peak Classification
# Assuming Peak Hours: 07:00-11:00 (Morning) and 16:00-20:00 (Evening)
df_features = df_features.withColumn(
    "is_peak", 
    when(((col("hour") >= 7) & (col("hour") <= 11)) | 
         ((col("hour") >= 16) & (col("hour") <= 20)), 
         lit("Peak")
    ).otherwise(lit("Off-Peak"))
)

# C. Congestion Encoding (Ordinal Encoding)
df_features = df_features.withColumn(
    "congestion_lebel_encoded",
    when(col("Degree_of_congestion") == "Very smooth", 0)
    .when(col("Degree_of_congestion") == "Smooth", 1)
    .when(col("Degree_of_congestion") == "Moderate", 2)
    .when(col("Degree_of_congestion") == "Heavy congestion", 3)
    .otherwise(4) # Unknown or Extreme
)

# D. Speed Bands (Categorical Binning)
df_features = df_features.withColumn(
    "speed_band",
    when(col("speed") < 10, "Low (<10 km/h)")
    .when((col("speed") >= 10) & (col("speed") < 30), "Medium (10-30 km/h)")
    .otherwise("High (>30 km/h)")
)

# E. Trip Reliability Indicators (Requirement: trip reliability indicators)
df_features = df_features.withColumn(
    "reliability_status",
    when(col("SRI") > 2, "Unreliable (Congested)")
    .otherwise("Reliable")
)

print("\n" + "="*80)
print("FEATURE ENGINEERING RESULTS")
print("="*80)

# Show distribution of new features
print("\n Peak vs Off-Peak Distribution:")
df_features.groupBy("is_peak").count().orderBy("is_peak").show()

print("\n Speed Band Distribution:")
df_features.groupBy("speed_band").count().orderBy("speed_band").show()

print("\n Reliability Status Distribution:")
df_features.groupBy("reliability_status").count().orderBy("reliability_status").show()

print("\n Congestion Level Distribution:")
df_features.groupBy("Degree_of_congestion", "congestion_lebel_encoded").count().orderBy("congestion_lebel_encoded").show()

# Show sample with new features
print("\n" + "="*80)
print("SAMPLE DATA WITH NEW FEATURES")
print("="*80)
df_features.select(
    "trip_id", "arrival_time", "hour", "is_peak", 
    "speed", "speed_band", "SRI", "reliability_status",
    "Degree_of_congestion", "congestion_lebel_encoded"
).show(10, truncate=False)

print("âœ… Features Engineered successfully (including Reliability Indicators).")
# Cache the result
df_features = df_features.cache()

In [None]:
# 4. Route Level Aggregation (Requirement: route-level aggregates)
from pyspark.sql.functions import avg, count, stddev, col

# Calculating stats per route to save as a separate dataset for dashboards
route_stats = df_features.groupBy("trip_id").agg(
    avg("speed").alias("avg_speed"),
    avg("SRI").alias("avg_sri"),
    stddev("SRI").alias("sri_volatility"),
    count("*").alias("total_records")
)

print("\n" + "="*80)
print("ROUTE-LEVEL STATISTICS")
print("="*80)

# Show summary statistics
print("\n Overall Route Statistics Summary:")
route_stats.describe().show()

# Show top 10 routes by average speed
print("\n Top 10 Routes by Average Speed:")
route_stats.orderBy(col("avg_speed").desc()).show(10)

# Show top 10 routes by SRI (most unreliable)
print("\n Top 10 Most Unreliable Routes (Highest SRI):")
route_stats.orderBy(col("avg_sri").desc()).show(10)

print("âœ… Route aggregation completed")

In [None]:
# 5. Save Datasets to HDFS

# Save Silver Layer (Main Dataset)
print(f"\n Saving Silver Dataset to HDFS: {SILVER_OUTPUT_PATH}")
df_features.write.mode("overwrite").partitionBy("is_peak").parquet(SILVER_OUTPUT_PATH)
print("âœ… Silver layer saved successfully.")

# Save Route Stats (Aggregated Dataset for Dashboard)
print(f"\n Saving Route Stats to HDFS: {ROUTE_STATS_OUTPUT_PATH}")
route_stats.write.mode("overwrite").parquet(ROUTE_STATS_OUTPUT_PATH)
print("âœ… Route Stats saved successfully.")

# Verification - Read back and show counts
print("\n" + "="*80)
print("VERIFICATION - Reading saved data")
print("="*80)

silver_verify = spark.read.parquet(SILVER_OUTPUT_PATH)
route_verify = spark.read.parquet(ROUTE_STATS_OUTPUT_PATH)

print(f"\nâœ… Silver dataset record count: {silver_verify.count():,}")
print(f"âœ… Route stats record count: {route_verify.count():,}")

print("\n Silver Dataset Partitions:")
silver_verify.groupBy("is_peak").count().show()

In [None]:
# 6. Completion Summary
print("\n" + "="*80)
print("NOTEBOOK 02 COMPLETION SUMMARY")
print("="*80)
print("âœ… Data loaded from Bronze layer")
print("âœ… Data cleaning completed (type casting, null handling, invalid data removal)")
print("âœ… Feature engineering completed:")
print("   - Temporal features (hour)")
print("   - Peak/Off-Peak classification")
print("   - Congestion level encoding")
print("   - Speed bands")
print("   - Trip reliability indicators")
print("âœ… Route-level aggregations computed")
print("âœ… Silver dataset saved to HDFS (partitioned by is_peak)")
print("âœ… Route statistics saved to HDFS")
print("\nStopping Spark session...")
spark.stop()
print("âœ… Spark session stopped.")

In [None]:
# 6. Verification
print("âœ… Notebook 02 Complete. Stopping Spark.")
spark.stop()