In [0]:
# ==========================================
# EDA FOR GOLD ML DATASET
# ==========================================

from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", palette="Set2", font_scale=1.1)

# ==========================================
# LOAD AND INSPECT DATA
# ==========================================
# Source table for the notebook; later cells read its `features` and `label`
# columns. Kept in one place so the notebook can be pointed at another table.
GOLD_TABLE = "default.gold_ml_features_experimental"

print("📥 Loading Gold ML Features Table...")
df_gold = spark.table(GOLD_TABLE)

print("\nDATA OVERVIEW")
print("="*70)
# count() is a Spark action: it runs a job over the table to get the row total.
print(f"Total Records: {df_gold.count():,}")
print(f"Columns: {len(df_gold.columns)}")
df_gold.printSchema()

# Preview only a handful of rows; never render the full table.
display(df_gold.limit(5))

In [0]:
# ==========================================
# LABEL DISTRIBUTION
# ==========================================
print("\n🎯 Label Distribution:")

# Aggregate in Spark so that only one row per label reaches the driver.
label_counts = (
    df_gold.groupBy("label")
    .count()
    .orderBy("label")
    .toPandas()
)

# Share of each label as a percentage of all counted rows (2 decimals).
total_labels = label_counts["count"].sum()
label_counts["percent"] = (label_counts["count"] / total_labels * 100).round(2)

display(label_counts)

fig, ax = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13. Mapping `hue`
# to the x variable keeps one colour per bar; `dodge=False` keeps the bars
# centred on their tick on older seaborn versions as well.
sns.barplot(
    data=label_counts,
    x="label",
    y="count",
    hue="label",
    dodge=False,
    palette="Set2",
    ax=ax,
)
# The hue legend would only repeat the x tick labels, so drop it when present
# (older seaborn draws it, newer versions skip it for a redundant hue).
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Distribution of Target Labels")
ax.set_xlabel("Label (Indexed Arrival Status)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [0]:
# ==========================================
# FEATURE VECTOR ANALYSIS
# ==========================================
print("\n📊 Feature Vector Summary:")

# Fraction of rows pulled to the driver for pandas-side statistics, and the
# seed that makes the sample repeatable.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42

# Convert feature vector to array for exploration
df_gold_array = df_gold.withColumn("features_array", vector_to_array("features"))
has_features = F.col("features_array").isNotNull()

# Get feature vector length from the first row that actually has a vector.
# first() returns None on an empty result, so fail with a clear message
# instead of an opaque TypeError.
first_row = df_gold_array.select("features_array").where(has_features).first()
if first_row is None:
    raise ValueError("No non-null feature vectors found; nothing to analyse.")
feature_length = len(first_row[0])
print(f"Number of features after encoding/scaling: {feature_length}")

# Compute summary statistics across all features (sampled).
# Sampling is applied to the full frame first, and the projection / null filter
# afterwards, so only the array column is transferred to the driver.
df_sample = (
    df_gold_array.sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    .select("features_array")
    .where(has_features)
    .toPandas()
)
# One row per sampled record, one column per position in the feature array.
features_matrix = pd.DataFrame(df_sample["features_array"].tolist())
if features_matrix.empty:
    raise ValueError(
        f"Sampling {SAMPLE_FRACTION:.0%} of the data returned no rows; "
        "increase SAMPLE_FRACTION."
    )

print(f"\nFeature Summary Statistics (sample of {SAMPLE_FRACTION:.0%} of data):")
display(features_matrix.describe().T.head(10))


In [0]:
# ==========================================
# CHECK FEATURE SCALING
# ==========================================
# Column-wise statistics of the sampled matrix: one value per feature index.
mean_vals = features_matrix.mean()
std_vals = features_matrix.std()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(mean_vals, label="Mean", color="steelblue")
ax.plot(std_vals, label="Std. Dev.", color="orange")
ax.set_title("Feature Scaling Validation (Mean ≈ 0, Std ≈ 1)")
ax.set_xlabel("Feature Index")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()

# Single-number summary of the two curves plotted above.
print(f"Average of feature means: {mean_vals.mean():.4f}")
print(f"Average of feature stds: {std_vals.mean():.4f}")


In [0]:
# ==========================================
# FEATURE CORRELATION SNAPSHOT (SAMPLED)
# ==========================================
# Pairwise correlations are computed on the sampled feature matrix only, and
# restricted to its leading 15 columns so the heatmap stays readable.
corr_sample = features_matrix.iloc[:, 0:15]
corr_matrix = corr_sample.corr()

fig, ax = plt.subplots(figsize=(10, 8))
# center=0 pins the midpoint of the diverging colormap to zero correlation.
sns.heatmap(data=corr_matrix, center=0, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap (First 15 Features)")
plt.show()


In [0]:
# ==========================================
# CLASS BALANCE AND TRAINING READINESS
# ==========================================
print("\n✅ Data Quality and Balance Check")

# Collect all three tallies in a single aggregation job instead of three
# separate full-table count() scans. count() skips NULLs and when() without
# otherwise() yields NULL for non-matching rows, so each conditional count
# tallies exactly the rows whose column is NULL. A global agg always returns
# one row, with zeros for an empty table.
quality = df_gold.agg(
    F.count(F.lit(1)).alias("total_records"),
    F.count(F.when(F.col("features").isNull(), 1)).alias("missing_features"),
    F.count(F.when(F.col("label").isNull(), 1)).alias("missing_labels"),
).first()

total_records = quality["total_records"]
missing_features = quality["missing_features"]
missing_labels = quality["missing_labels"]

print(f"Total Records: {total_records:,}")
print(f"Missing Feature Vectors: {missing_features}")
print(f"Missing Labels: {missing_labels}")

# Class balance chart (percent); reuses the `label_counts` frame with its
# `percent` column built in the label-distribution cell.
fig, ax = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13. Mapping `hue`
# to the x variable keeps one colour per bar; `dodge=False` keeps the bars
# centred on their tick on older seaborn versions as well.
sns.barplot(
    data=label_counts,
    x="label",
    y="percent",
    hue="label",
    dodge=False,
    palette="muted",
    ax=ax,
)
# The hue legend would only repeat the x tick labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Class Balance (% of Total)")
ax.set_xlabel("Label")
ax.set_ylabel("Percentage of Dataset")
fig.tight_layout()
plt.show()
