## Data Ingestion & Optimization

**Objective:** Raw CSVs are slow and heavy. We need to load them using PySpark (because Pandas will crash with Memory Errors) and convert them to Parquet.

In [2]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, count, when, isnan, stddev

In [3]:
# --- SPARK SESSION SETUP ---
# When running locally we raise the driver memory to "8g" so the job does not
# die with an Out-Of-Memory error. On a managed cluster (Databricks / AWS EMR)
# the cluster manager takes care of this instead.
session_builder = SparkSession.builder.appName("Backblaze_Failure_Prediction")
session_builder = session_builder.config("spark.driver.memory", "8g")
spark = session_builder.getOrCreate()

print("Spark Session Created")

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
# --- INPUT / OUTPUT LOCATIONS ---
# Point RAW_DATA_PATH at the folder where the raw CSVs were unzipped;
# OUTPUT_PATH is where the cleaned Parquet dataset gets written.
OUTPUT_PATH = "./data/parquet_Q3_2025"
RAW_DATA_PATH = "./data/data_Q3_2025/*.csv"

In [None]:
# --- STEP 1: LOAD THE RAW CSV FILES ---
# The *.csv glob in RAW_DATA_PATH makes Spark load every matching file into a
# single DataFrame. Schema inference is handy for exploration but slow;
# production pipelines should declare the schema explicitly.
print("Reading CSVs... this might take a minute.")
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(RAW_DATA_PATH)

In [None]:
# --- STEP 2: FIRST LOOK AT THE DATA ---
# Total row count, followed by the schema Spark inferred for the CSVs.
row_total = df.count()
print(f"Total Rows: {row_total}")
print("Schema:")
df.printSchema()

In [None]:
# Class balance check: number of rows for each value of the "failure" label.
print("Distribution of Failures (0 = Healthy, 1 = Failed):")
failure_counts = df.groupBy("failure").count()
failure_counts.show()

In [None]:
# --- STEP 3: LIGHT CLEANING ---
# Convert the "date" column to a real date type using the yyyy-MM-dd pattern.
parsed_date = to_date(col("date"), "yyyy-MM-dd")
df = df.withColumn("date", parsed_date)

In [None]:
# Manufacturers should not be mixed in a first model, so before filtering
# we list the most common drive models to decide which ones to keep.
print("Top 5 Hard Drive Models:")
model_counts = df.groupBy("model").count()
model_counts.orderBy("count", ascending=False).show(5)

1. Filter for our Manufacturer (Seagate).
2. Calculate the percentage of missing values for every column.
3. Drop columns where more than 30% of the values are missing (the `DROP_THRESHOLD` used in the code below).
4. Drop columns that have zero variance (the value is always "0" or always "100").

In [None]:
# Keep only Seagate drives before doing any column analysis.
# Different manufacturers use different columns, so analyzing nulls across
# mixed manufacturers is misleading.
#
# Seagate model strings such as 'ST4000DM000' BEGIN with "ST", so we match on
# the prefix. A substring test like contains("ST") would also let through other
# vendors whose model string merely includes those letters (for example any
# model starting with "HGST").
# Note: adjust the prefix if the model listing printed earlier shows a
# different naming scheme for the drives you want to keep.

df_seagate = df.filter(col("model").startswith("ST"))

print(f"Seagate Drives Count: {df_seagate.count()}")

In [None]:
# INDUSTRY TECHNIQUE: AUTOMATED NULL ANALYSIS
# Goal: work out which columns are actually populated for the Seagate subset.
print("Calculating Null percentages per column... (This takes time)")

# Identifier / label columns that are excluded from the null analysis.
skip_cols = ['date', 'serial_number', 'model', 'failure', 'capacity_bytes']

# Every remaining column will be checked for missing values.
candidate_cols = []
for column_name in df_seagate.columns:
    if column_name not in skip_cols:
        candidate_cols.append(column_name)

# Denominator for the missing-value percentages.
total_rows = df_seagate.count()

# Filled in later with the columns that pass the null check.
valid_features = []

In [None]:

# Build ONE aggregation expression per candidate column so that all
# missing-value counts are computed in a single pass over the data, rather
# than launching a separate Spark job for every column.
#
# NaN only exists for floating-point types, so isnan() is applied to
# float/double columns only. Calling it on other types (e.g. boolean or
# free-text columns) can fail analysis or force an invalid cast, depending on
# the Spark version and ANSI settings; for those a plain null check suffices.
floating_cols = {
    name for name, dtype in df_seagate.dtypes if dtype in ("float", "double")
}

aggregations = []
for c in candidate_cols:
    is_missing = col(c).isNull()
    if c in floating_cols:
        is_missing = is_missing | isnan(col(c))
    # when(...) yields a non-null marker only for missing rows, and count()
    # ignores nulls, so the result is the number of missing values.
    aggregations.append(count(when(is_missing, c)).alias(c))

# Run the query: one Row whose fields hold the missing count per column.
null_counts = df_seagate.select(aggregations).collect()[0]

# Threshold: If more than 30% of data is missing, drop the column.
# In hardware, if a sensor works 99% of the time, it's useful.
# If it only reports 70% of the time, it's garbage.
DROP_THRESHOLD = 0.3


In [None]:
print("\n--- Feature Selection Report ---")

# An empty Seagate subset would make every percentage undefined; fail with a
# clear message instead of a bare ZeroDivisionError.
if total_rows == 0:
    raise ValueError(
        "df_seagate has no rows - check the model filter before selecting features."
    )

# Rebuild the list from scratch so that re-running this cell does not append
# the same columns a second time (duplicates would break the later select).
valid_features = []
for c in candidate_cols:
    null_pct = null_counts[c] / total_rows

    # Columns at or above DROP_THRESHOLD are dropped silently to keep the
    # report short; only the survivors are printed.
    if null_pct < DROP_THRESHOLD:
        print(f"[KEEP] {c}: {null_pct:.2%} missing")
        valid_features.append(c)

print(f"\nSelected {len(valid_features)} features based on data density.")

In [None]:
# 4. FEATURE SELECTION STAGE 2: VARIANCE CHECK
print("\n--- STAGE 2: VARIANCE CHECK ---")
# If Standard Deviation is 0, the data never changes. It's useless.

# Work from the column NAMES that survived the null check. (`aggregations`
# holds Column expressions, which cannot be used as names or as Row keys.)
# dict.fromkeys removes duplicates while preserving order, in case the
# null-check cell was executed more than once.
surviving_features = list(dict.fromkeys(valid_features))

# Cast to double so the statistic is computed in floating point. Passing the
# type by name means no pyspark.sql.types import is required.
std_expr = [stddev(col(c).cast("double")).alias(c) for c in surviving_features]

# This is a heavy computation, it scans the whole dataset again.
print("Calculating Standard Deviation for surviving features...")
# Skip the scan entirely when nothing survived the null check.
std_devs = df_seagate.select(std_expr).collect()[0] if std_expr else {}

final_features = []
for c in surviving_features:
    sd = std_devs[c]
    # Keep only columns with measurable variance. sd is None when Spark could
    # not compute a deviation for the column; those are dropped as well.
    # The tiny threshold (0.0001) avoids floating point weirdness around zero.
    if sd is not None and sd > 0.0001:
        final_features.append(c)
        # Optional: Print to see what we are keeping
        # print(f"[KEEP] {c} (StdDev: {sd:.2f})")
    else:
        print(f"[DROP] {c} (Zero Variance)")

print("-" * 30)
print(f"Original SMART Features: {len(candidate_cols)}")
print(f"Post-Null Check:         {len(surviving_features)}")
print(f"Final Selected Features: {len(final_features)}")

In [None]:
# Assemble the output schema: identifier/label columns first, followed by the
# features collected in final_features.
base_cols = ['date', 'serial_number', 'model', 'failure', 'capacity_bytes']
final_cols = [*base_cols, *final_features]

df_final = df_seagate.select(*final_cols)

In [None]:
# Persist the cleaned dataset as Parquet; "overwrite" mode replaces any
# previous output at OUTPUT_PATH.
print("Writing cleaned dataset to Parquet...")
parquet_writer = df_final.write.mode("overwrite")
parquet_writer.parquet(OUTPUT_PATH)
print("Done.")