EDA Notebook Template (starter cells)

In [None]:
# Spark Setup
from pyspark.sql import SparkSession

# getOrCreate() hands back a session for the app named "MarketingEDA",
# so this cell can be re-run without any extra guarding.
spark = (
    SparkSession.builder
    .appName("MarketingEDA")
    .getOrCreate()
)

In [None]:
# Load data from Delta (post-transform)
df = spark.read.load("../data/processed/clean_delta/", format="delta")

# Quick look at what was loaded: the schema first, then the first five rows.
df.printSchema()
df.show(n=5)

In [None]:
# Descriptive stats: build the summary frame, then print it.
stats_table = df.describe()
stats_table.show()

In [None]:
# Null value check (basic)
from pyspark.sql.functions import col, isnan, when, count


def _null_count(column_name):
    """Build an aggregate that counts the rows where `column_name` is NULL.

    `when` only yields a value for rows whose column is NULL, and `count`
    tallies those values; the result is aliased back to the column name.
    Only NULLs are counted -- `isnan` is imported above but not applied.
    """
    is_missing = col(column_name).isNull()
    return count(when(is_missing, column_name)).alias(column_name)


# One output row holding the NULL count of every column.
df.select(list(map(_null_count, df.columns))).show()

In [None]:
# Visualisation example (Visualise with Pandas + Seaborn)
import matplotlib.pyplot as plt
import seaborn as sns

# Only the selected columns are converted to a pandas frame for plotting.
df_pd = df.select("Age", "TotalSpend", "Education", "Response").toPandas()

plt.figure(figsize=(10, 6))
# Each bar is the summed TotalSpend for one Education level.
# `errorbar=None` switches the error bars off; it replaces `ci=None`, which
# seaborn deprecated in v0.12 (the notebook already relies on a recent
# scikit-learn via n_init='auto', so a recent seaborn is assumed -- confirm).
sns.barplot(data=df_pd, x="Education", y="TotalSpend", estimator=sum, errorbar=None)
plt.title("Total Spend by Education Level")
plt.xticks(rotation=45)  # tilt the category labels
plt.tight_layout()
plt.show()

Customer Age Distribution

In [None]:
# From this cell on the notebook uses the pandas API (column assignment,
# .sum(axis=1), .copy(), ...). `df` was loaded as a Spark DataFrame, which does
# not support item assignment, so convert it once. The guard keeps the cell
# re-runnable after `df` is already a pandas frame.
# NOTE(review): toPandas() loads the full table into local memory -- confirm
# the dataset is small enough.
if hasattr(df, "toPandas"):
    df = df.toPandas()

# Columns are referenced in lowercase/underscore form below, so normalise the
# names before first use. The operation is idempotent, so the dedicated
# clean-up cell further down can still be run safely.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Create 'age' column from birth year
REFERENCE_YEAR = 2025  # fixed reference year so ages do not drift between runs
df['age'] = REFERENCE_YEAR - df['year_birth']

# Plot distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], bins=30, kde=True)  # histogram with a KDE overlay
plt.title("Customer Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

Income vs Spending

In [None]:
# Scatterplot of income vs total amount spent
spend_columns = [
    'mntwines',
    'mntfruits',
    'mntmeatproducts',
    'mntfishproducts',
    'mntsweetproducts',
    'mntgoldprods',
]
# Row-wise sum over the spend columns gives the per-row total.
df['total_spent'] = df[spend_columns].sum(axis=1)

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x='income', y='total_spent', data=df)
ax.set_title("Income vs Total Spending")
ax.set_xlabel("Income")
ax.set_ylabel("Total Spent")
plt.show()

Spending by Product Category

In [None]:
product_cols = [
    'mntwines',
    'mntfruits',
    'mntmeatproducts',
    'mntfishproducts',
    'mntsweetproducts',
    'mntgoldprods',
]

# Column totals, largest first, drawn as a bar chart.
category_totals = df[product_cols].sum().sort_values(ascending=False)
ax = category_totals.plot.bar(figsize=(10, 6))
ax.set_title("Total Spend by Product Category")
ax.set_ylabel("Amount Spent")
plt.xticks(rotation=45)
plt.show()

Clean Column Names (run this early in the notebook, before any cell that refers to the lowercase column names)

In [None]:
# Normalise the headers in two steps: lowercase everything, then swap spaces
# for underscores.
lowercased_names = df.columns.str.lower()
df.columns = lowercased_names.str.replace(" ", "_")

Normalize Features (to remove scale bias)

Choose features relevant to customer behavior:

In [None]:
from sklearn.preprocessing import StandardScaler

features = [
    'income',
    'age',
    'recency',
    'total_spent',
    'numwebvisitsmonth',
    'numstorepurchases',
    'numwebpurchases',
]

# Keep only complete rows: any row with a missing value in one of the selected
# features is dropped for now. Despite its name, `df_scaled` still holds the
# unscaled values; the standardised array is `X_scaled`.
df_scaled = df.loc[:, features].dropna()

# Learn the scaling parameters on those rows, then apply them to the same rows.
scaler = StandardScaler().fit(df_scaled)
X_scaled = scaler.transform(df_scaled)

Run PCA to reduce dimensions for easy plotting

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto two components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# New frame for plotting: the unscaled feature rows plus the two component
# scores (assign() returns a copy, so df_scaled itself is left untouched).
df_pca = df_scaled.assign(pca1=X_pca[:, 0], pca2=X_pca[:, 1])

Visualize in PCA Space — plot the projection first; clusters (e.g. with KMeans) are added in the cells below

In [None]:
# Scatter of the two PCA scores; no cluster colouring at this point.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x='pca1', y='pca2', data=df_pca)
ax.set_title("PCA Projection of Customer Segments")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.show()

Apply KMeans Clustering

In [None]:
from sklearn.cluster import KMeans

# We'll go with 4 clusters to start — you can tweak this.
# The fixed random_state makes the assignment repeatable between runs.
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto').fit(X_scaled)

# One label per row of X_scaled, stored next to the PCA scores for plotting.
df_pca['cluster'] = kmeans.labels_

Plot Clusters in PCA Space

In [None]:
# Same PCA scatter as before, now coloured by the KMeans cluster label.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    x='pca1',
    y='pca2',
    hue='cluster',
    data=df_pca,
    palette='Set2',
    s=60,
)
ax.set_title("KMeans Clusters in PCA Projection")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.legend(title="Cluster")
plt.show()

Explore Cluster Characteristics

In [None]:
# Add cluster labels back to original DataFrame (optional but helpful).
# KMeans was fitted only on the rows that survived dropna(), so kmeans.labels_
# can be shorter than df; assigning it positionally would raise a length
# mismatch. Assigning the labelled Series from df_pca aligns on the index
# instead: rows that were dropped before clustering get a missing label.
# The nullable 'Int64' dtype keeps the labels as integers despite those gaps.
df_with_clusters = df.copy()
df_with_clusters['cluster'] = df_pca['cluster'].astype('Int64')

# Get mean values per cluster (groupby skips rows whose label is missing)
cluster_summary = df_with_clusters.groupby('cluster')[
    ['income', 'age', 'recency', 'total_spent', 'numwebpurchases', 'numstorepurchases']
].mean()

cluster_summary