**Deliverables in this notebook**
- Aggregation & metrics per family (sales, seasonality index, promo rate, promo lift, sparsity)
- Required visualizations (bar, pie, stacked area, heatmap, boxplots, promo comparison, trends)
- Answers to analysis questions (80/20, seasonality, promotion response, growth/decline, variance, correlations)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Matplotlib is building the font cache; this may take a moment.


In [3]:
# ====== 1) Load data ======
# Point PATH_TRAIN at your local copy of train.csv before running this cell.

PATH_TRAIN = "train.csv"

usecols = ["id", "date", "store_nbr", "family", "sales", "onpromotion"]
# Explicit narrow dtypes (int16/int32/float32/category) instead of pandas' defaults.
dtypes = {
    "id": "int32",
    "store_nbr": "int16",
    "family": "category",
    "sales": "float32",
    "onpromotion": "float32",  # keep float to allow missing
}

try:
    train = pd.read_csv(PATH_TRAIN, usecols=usecols, dtype=dtypes, parse_dates=["date"])
except FileNotFoundError as err:
    # Same exception type as before, but with an actionable hint: every later
    # cell depends on `train`, so a missing file should be obvious to fix.
    raise FileNotFoundError(
        f"Could not find {PATH_TRAIN!r}. "
        "Set PATH_TRAIN at the top of this cell to the location of train.csv."
    ) from err

# Binary promotion indicator: 1 when onpromotion > 0, 0 otherwise
# (missing onpromotion values are treated as "not on promotion").
train["promo_flag"] = (train["onpromotion"].fillna(0) > 0).astype("int8")

print("Shape:", train.shape)
print("Date range:", train["date"].min().date(), "→", train["date"].max().date())
print("Families:", train["family"].nunique())

train.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Quick checks (as in template style)
# A notebook cell only renders its LAST bare expression, so the intermediate
# summary is printed explicitly; otherwise its result is silently discarded.
train.info()  # prints its report itself
print(train.describe(include="all"))
train.isnull().sum()

## 2) Aggregations (family-day) + core metrics

We build a **family-day** panel (1 row = 1 family on 1 date, aggregated over all stores).
This is the basis for:
- trends and stacked-area plots
- seasonality heatmap (family × month)
- zero-sales day rate (sparsity)


In [None]:
# 2) Family-day aggregation
# Split every record's sales into a promoted and a non-promoted part using the
# 0/1 promo flag, then sum over all stores per (date, family).
sales = train["sales"].to_numpy()
promo = train["promo_flag"].to_numpy()

tmp = train[["date", "family"]].assign(
    daily_sales=sales,
    promo_sales=sales * promo,
    nonpromo_sales=sales * (1 - promo),
    promo_any=promo,
)

# observed=True: `family` is categorical, so only (date, family) pairs that
# actually occur are aggregated here. Missing pairs are added by the grid merge
# below, and this avoids the pandas FutureWarning about the `observed` default.
daily = tmp.groupby(["date", "family"], as_index=False, observed=True).agg(
    daily_sales=("daily_sales", "sum"),
    promo_sales=("promo_sales", "sum"),
    nonpromo_sales=("nonpromo_sales", "sum"),
    promo_any=("promo_any", "max"),  # 1 if any record of that family-day was promoted
)

# Create full grid (all dates × all families) to measure sparsity consistently
all_dates = pd.date_range(train["date"].min(), train["date"].max(), freq="D")
all_families = train["family"].cat.categories

grid = pd.MultiIndex.from_product([all_dates, all_families], names=["date", "family"]).to_frame(index=False)
daily_full = grid.merge(daily, on=["date", "family"], how="left")

# Family-days without any record count as zero sales / no promotion.
value_cols = ["daily_sales", "promo_sales", "nonpromo_sales", "promo_any"]
daily_full[value_cols] = daily_full[value_cols].fillna(0)
daily_full["promo_any"] = daily_full["promo_any"].astype("int8")

daily_full["month"] = daily_full["date"].dt.month
daily_full["year"] = daily_full["date"].dt.year

# Invariant: exactly one row per (date, family) cell of the grid.
assert len(daily_full) == len(all_dates) * len(all_families), "family-day panel is not a full grid"

daily_full.head()

In [None]:
# 2b) Family-level metrics (built from the family-day panel and the raw records)

# (A) Sales & sparsity & variance (daily)
# All statistics are plain named aggregations; the promo share is derived from
# the aggregated sums afterwards instead of reaching back into `daily_full`
# from inside a per-group lambda.
family_daily_metrics = (
    daily_full
    .assign(is_zero_day=daily_full["daily_sales"].eq(0))
    .groupby("family")
    .agg(
        total_sales=("daily_sales", "sum"),
        avg_daily_sales=("daily_sales", "mean"),
        median_daily_sales=("daily_sales", "median"),
        std_daily_sales=("daily_sales", "std"),
        zero_day_rate=("is_zero_day", "mean"),      # share of days with zero sales
        promo_day_rate=("promo_any", "mean"),       # share of days with any promoted record
        promo_sales_total=("promo_sales", "sum"),   # helper column, removed below
    )
    .reset_index()
)

# The 1e-9 guard only matters when the denominator is 0; assuming non-negative
# sales the numerator is then 0 as well, so the ratio is reported as 0.
family_daily_metrics["promo_sales_share"] = (
    family_daily_metrics.pop("promo_sales_total") / (family_daily_metrics["total_sales"] + 1e-9)
)
family_daily_metrics["cv_daily"] = family_daily_metrics["std_daily_sales"] / (family_daily_metrics["avg_daily_sales"] + 1e-9)

# (B) Promo effectiveness (record-level; more stable than day-level for big dataset)
g = train.groupby("family", observed=True)

record_metrics = g.agg(
    n_records=("sales", "size"),
    total_sales_record=("sales", "sum"),
    promo_record_rate=("promo_flag", "mean"),
).reset_index()

# Mean and total sales per (family, promo_flag) in one vectorized pass; this
# replaces a Python-level groupby.apply over the whole record table.
flag_stats = train.groupby(["family", "promo_flag"], observed=True)["sales"].agg(["mean", "sum"])
# reindex guarantees both flag columns exist even if one flag never occurs.
mean_by_flag = flag_stats["mean"].unstack("promo_flag").reindex(columns=[0, 1])
sum_by_flag = flag_stats["sum"].unstack("promo_flag").reindex(columns=[0, 1])

promo_means = (
    pd.DataFrame({
        "mean_sales_promo_record": mean_by_flag[1],       # NaN when a family has no promoted records
        "mean_sales_nonpromo_record": mean_by_flag[0],    # NaN when a family has only promoted records
        "promo_sales_sum": sum_by_flag[1].fillna(0),      # no promoted records -> 0 promoted sales
    })
    .rename_axis("family")
    .reset_index()
)

record_metrics = record_metrics.merge(promo_means, on="family", how="left")
record_metrics["promo_sales_share_record"] = record_metrics["promo_sales_sum"] / (record_metrics["total_sales_record"] + 1e-9)

# Lift is unbounded, so an epsilon-guarded division would turn a zero baseline
# into a meaningless ~1e9 lift. Report NaN (undefined) for that case instead.
baseline_mean = record_metrics["mean_sales_nonpromo_record"]
record_metrics["promo_lift_record"] = (
    record_metrics["mean_sales_promo_record"] / baseline_mean.where(baseline_mean > 0)
) - 1

# (C) Family availability window (first/last positive day)
# Families without any positive-sales day are absent here and end up with NaT
# after the left merges below.
positive_days = daily_full.loc[daily_full["daily_sales"] > 0]
availability = (
    positive_days.groupby("family")["date"]
    .agg(first_positive_date="min", last_positive_date="max")
    .reset_index()
)
first_pos = availability[["family", "first_positive_date"]]
last_pos = availability[["family", "last_positive_date"]]

# Merge all metrics
family_metrics = (family_daily_metrics
                  .merge(record_metrics, on="family", how="left")
                  .merge(first_pos, on="family", how="left")
                  .merge(last_pos, on="family", how="left")
                 )

family_metrics["sales_share"] = family_metrics["total_sales"] / family_metrics["total_sales"].sum()
# Later cells rely on this ordering: head(k) == top-k families by total sales.
family_metrics = family_metrics.sort_values("total_sales", ascending=False)

family_metrics.head(10)

In [None]:
# Persist the per-family metrics table (without the index) for reporting
metrics_csv = "family_metrics.csv"
family_metrics.to_csv(metrics_csv, index=False)
print(f"Saved: {metrics_csv}")

## 3) Required Visualizations

In [None]:
# Helper: family names of the first K rows of family_metrics
# (that frame is sorted by total sales, descending, where it is built)
TOP15, TOP10, TOP8, TOP5 = (
    family_metrics["family"].head(k).tolist() for k in (15, 10, 8, 5)
)


### 3.1 Bar chart — Top 15 families by total sales

In [None]:
# First 15 rows of the metrics table, re-sorted ascending so the largest bar
# ends up at the top of the horizontal bar chart.
top15 = family_metrics.iloc[:15].sort_values("total_sales")

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top15["family"].astype(str), top15["total_sales"])
ax.set_title("Top 15 Product Families by Total Sales")
ax.set_xlabel("Total Sales")
fig.tight_layout()
plt.show()

### 3.2 Pie chart — Sales composition (Top 10 + Other)

In [None]:
K = 10
# First K rows are shown individually; everything after them is pooled as "Other".
topk = family_metrics.iloc[:K].copy()
other_sales = family_metrics.iloc[K:]["total_sales"].sum()

labels = topk["family"].astype(str).tolist()
labels.append("Other")
sizes = topk["total_sales"].tolist()
sizes.append(other_sales)

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, autopct="%1.1f%%")
ax.set_title(f"Sales Composition (Top {K} + Other)")
fig.tight_layout()
plt.show()

### 3.3 Stacked area — Sales contribution by family over time (Top 8 + Other)

In [None]:
# Wide date × family table of daily sales
ts = (
    daily_full
    .pivot_table(index="date", columns="family", values="daily_sales", aggfunc="sum")
    .sort_index()
)
# Keep the TOP8 columns and pool every remaining family into a single "Other" column.
ts_top = ts[TOP8].assign(Other=ts.drop(columns=TOP8).sum(axis=1))

fig, ax = plt.subplots(figsize=(12, 6))
ax.stackplot(ts_top.index, ts_top.to_numpy().T, labels=list(map(str, ts_top.columns)))
ax.set_title("Sales Contribution Over Time (Top 8 + Other)")
ax.set_xlabel("Date")
ax.set_ylabel("Daily Sales")
ax.legend(loc="upper left", ncol=3, fontsize=8)
fig.tight_layout()
plt.show()

### 3.4 Heatmap — Family × Month seasonality index (Top 20)

In [None]:
# Seasonality index = (mean daily sales in month) / (overall mean daily sales)
# computed per family on the family-day panel; `month_mean` is reused by the
# seasonality-strength analysis further down.
month_mean = daily_full.groupby(["family", "month"])["daily_sales"].mean().reset_index(name="month_mean_daily_sales")
overall_mean = daily_full.groupby("family")["daily_sales"].mean().reset_index(name="overall_mean_daily_sales")
month_mean = month_mean.merge(overall_mean, on="family", how="left")
# The 1e-9 guard only matters for a family whose overall mean is 0; assuming
# non-negative sales its monthly means are 0 too, so the index is reported as 0.
month_mean["seasonality_index"] = month_mean["month_mean_daily_sales"] / (month_mean["overall_mean_daily_sales"] + 1e-9)

heat = month_mean.pivot(index="family", columns="month", values="seasonality_index")
top20 = family_metrics.head(20)["family"].tolist()
heat = heat.loc[top20]  # rows in top-20 order

plt.figure(figsize=(12, 7))
plt.imshow(heat.values, aspect="auto")
plt.colorbar(label="Seasonality Index (month_mean / overall_mean)")
plt.yticks(range(len(heat.index)), [str(x) for x in heat.index])
# Label the columns from the pivot itself rather than a hard-coded 1..12, so
# the ticks stay correct if the data does not cover every calendar month.
plt.xticks(range(heat.shape[1]), [str(m) for m in heat.columns])
plt.title("Seasonality Heatmap (Top 20 Families)")
plt.tight_layout()
plt.show()

### 3.5 Box plots — Daily sales distribution (Top 10 families)

In [None]:
# One array of daily (all-store) sales per top-10 family, in TOP10 order
box_data = [daily_full.loc[daily_full["family"] == f, "daily_sales"].values for f in TOP10]

fig, ax = plt.subplots(figsize=(12, 6))
# Outliers are hidden (showfliers=False). Tick labels are set explicitly below
# instead of via boxplot(labels=...), which Matplotlib 3.9 deprecated in favour
# of `tick_labels`; setting the ticks directly works on old and new versions.
ax.boxplot(box_data, showfliers=False)
ax.set_xticks(range(1, len(TOP10) + 1))  # boxplot places boxes at positions 1..N by default
ax.set_xticklabels([str(f) for f in TOP10], rotation=45, ha="right")
ax.set_title("Daily Sales Distribution (Top 10 Families)")
ax.set_ylabel("Daily Sales (aggregated across stores)")
fig.tight_layout()
plt.show()

### 3.6 Promo comparison — Sales with vs without promotion (Top 10 families)

In [None]:
# Compare mean sales per record for promo vs non-promo (Top 10)
# The slice is only read, never modified, so no defensive copy of the
# record-level rows is needed.
sub = train.loc[train["family"].isin(TOP10)]
# observed=True: aggregate only the families present in `sub` rather than every
# category of the categorical `family` column.
cmp = sub.groupby(["family", "promo_flag"], observed=True)["sales"].mean().reset_index()

pivot = (
    cmp.pivot(index="family", columns="promo_flag", values="sales")
    .reindex(columns=[0, 1])  # both columns exist even if one flag never occurs
    .rename(columns={0: "No Promo", 1: "Promo"})
    .fillna(0)                # a family without records for a flag is drawn as a zero-height bar
)
pivot = pivot.loc[TOP10]  # keep order

x = np.arange(len(pivot.index))
width = 0.42

plt.figure(figsize=(12, 5))
plt.bar(x - width / 2, pivot["No Promo"], width, label="No Promo")
plt.bar(x + width / 2, pivot["Promo"], width, label="Promo")
plt.xticks(x, [str(i) for i in pivot.index], rotation=45, ha="right")
plt.title("Mean Sales per Record: Promo vs No Promo (Top 10 Families)")
plt.ylabel("Mean sales per record")
plt.legend()
plt.tight_layout()
plt.show()

### 3.7 Line charts — Trends of top families over time (7-day rolling mean)

In [None]:
# Date × family table for the TOP10 families, then a 7-day rolling mean
# (min_periods=1, so the first days average over fewer than 7 values).
ts_top10 = (
    daily_full.loc[daily_full["family"].isin(TOP10)]
    .pivot_table(index="date", columns="family", values="daily_sales", aggfunc="sum")
    .sort_index()
)
ts_smooth = ts_top10.rolling(window=7, min_periods=1).mean()

fig, ax = plt.subplots(figsize=(12, 6))
for family_name in TOP10:
    ax.plot(ts_smooth.index, ts_smooth[family_name], label=str(family_name))
ax.set_title("Trends of Top 10 Families (7-day rolling mean)")
ax.set_xlabel("Date")
ax.set_ylabel("Daily Sales (smoothed)")
ax.legend(ncol=2, fontsize=8)
fig.tight_layout()
plt.show()

## 4) Analysis Questions (computed outputs)

We compute the key answers programmatically so the notebook stays reproducible.


In [None]:
# 80/20 rule: cumulative sales share in the row order of family_metrics
fm = family_metrics.assign(cum_share=family_metrics["sales_share"].cumsum())

# Position of the first row whose cumulative share reaches 0.80, as a count (1-based)
reached_80 = fm["cum_share"].to_numpy() >= 0.80
n80 = int(reached_80.argmax()) + 1
core80 = fm.iloc[:n80][["family", "total_sales", "sales_share", "cum_share"]]

print("Families needed to reach ~80% of sales:", n80)
core80

In [None]:
# Strong seasonality families: rank by the spread (max - min) of the monthly seasonality index
index_by_family = month_mean.groupby("family")["seasonality_index"]
season_strength = index_by_family.agg(["min", "max"]).reset_index()
season_strength = (
    season_strength
    .assign(seasonality_range=season_strength["max"] - season_strength["min"])
    .sort_values("seasonality_range", ascending=False)
)

season_strength.head(10)

In [None]:
# Promo responders among top sales families (rank by promo_lift_record)
top15 = family_metrics["family"].head(15).tolist()
promo_cols = ["family", "total_sales", "promo_record_rate", "promo_sales_share_record", "promo_lift_record"]
in_top15 = family_metrics["family"].isin(top15)
promo_rank = family_metrics.loc[in_top15, promo_cols].sort_values("promo_lift_record", ascending=False)

promo_rank

In [None]:
# Growth / decline (annual totals 2014→2016)
BASE_YEAR, END_YEAR = 2014, 2016

# The year is derived on the fly, so this cell does not modify `daily_full`.
annual = (
    daily_full.assign(year=daily_full["date"].dt.year)
    .groupby(["year", "family"], as_index=False)["daily_sales"].sum()
    .rename(columns={"daily_sales": "annual_sales"})
)
annual_pivot = annual.pivot(index="family", columns="year", values="annual_sales")

# Growth is undefined for families with no base-year sales. Dividing by
# (base + 1e-9) would report them as ~1e9-fold "growth" and let them dominate
# the top of the ranking, so they are masked out and listed separately.
base_sales = annual_pivot[BASE_YEAR].where(annual_pivot[BASE_YEAR] > 0)
no_baseline = annual_pivot.index[base_sales.isna()].tolist()
if no_baseline:
    print(f"Excluded (no sales in {BASE_YEAR}):", [str(f) for f in no_baseline])

growth_2014_2016 = (annual_pivot[END_YEAR] / base_sales - 1).dropna().sort_values(ascending=False)
growth_2014_2016.head(10), growth_2014_2016.tail(10)

In [None]:
# High variance / sparsity: the 12 families with the largest daily coefficient of variation
variance_cols = ["family", "total_sales", "zero_day_rate", "cv_daily", "first_positive_date"]
family_metrics[variance_cols].sort_values("cv_daily", ascending=False).head(12)

In [None]:
# Cross-family correlations (top 12 families)
top12 = family_metrics["family"].head(12).tolist()
pivot = (
    daily_full.loc[daily_full["family"].isin(top12)]
    .pivot_table(index="date", columns="family", values="daily_sales", aggfunc="sum")
    .sort_index()
)
# Pairwise correlation between the families' daily-sales columns
corr = pivot.corr()

fig, ax = plt.subplots(figsize=(9, 7))
image = ax.imshow(corr.values, aspect="auto")
fig.colorbar(image, ax=ax, label="Correlation")
ax.set_xticks(range(len(top12)))
ax.set_xticklabels([str(name) for name in corr.columns], rotation=90)
ax.set_yticks(range(len(top12)))
ax.set_yticklabels([str(name) for name in corr.index])
ax.set_title("Cross-Family Correlation (Top 12)")
fig.tight_layout()
plt.show()

# List top correlated pairs: each unordered pair once (upper triangle),
# ordered by absolute correlation, strongest first
cols = corr.columns
pairs = [
    (cols[a], cols[b], float(corr.iloc[a, b]))
    for a in range(len(cols))
    for b in range(a + 1, len(cols))
]
pairs_sorted = sorted(pairs, key=lambda pair: abs(pair[2]), reverse=True)
pairs_sorted[:10]