In [32]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path

# Output locations: raw data, saved figures, and tabular reports
RAW = Path("data/raw")
FIGS = Path("figures")
REPORTS = Path("reports")
for folder in (RAW, FIGS, REPORTS):
    # parents=True builds intermediate directories; exist_ok=True makes re-runs safe
    folder.mkdir(parents=True, exist_ok=True)

# Seed NumPy's global RNG so every np.random draw in the notebook is repeatable
np.random.seed(8)
# Allow up to 100 columns when a wide frame is displayed
pd.set_option("display.max_columns", 100)

In [33]:
# Synthetic daily panel with 160 rows.
# NOTE: the np.random calls below are kept in a fixed order; reordering them
# would change the generated values.
n = 160
regions = ["North", "South", "East", "West"]

date_col = pd.date_range("2021-02-01", periods=n, freq="D")
region_col = np.random.choice(regions, size=n)
age_col = np.random.normal(40, 8, size=n).clip(22, 70).round(1)
income_col = np.random.lognormal(mean=10.6, sigma=0.3, size=n).round(2)
transactions_col = np.random.poisson(lam=3, size=n)

df = pd.DataFrame({
    "date": date_col,
    "region": region_col,
    "age": age_col,
    "income": income_col,
    "transactions": transactions_col,
})

# spend = linear signal in income and transactions + Gaussian noise, floored at 0
noise = np.random.normal(0, 40, size=n)
base = df["income"] * 0.0015 + df["transactions"] * 18 + noise
df["spend"] = np.maximum(0, base).round(2)

# Blank out a few randomly chosen cells: 5 in income, then 3 in spend
for column, n_missing in (("income", 5), ("spend", 3)):
    missing_rows = np.random.choice(df.index, n_missing, replace=False)
    df.loc[missing_rows, column] = np.nan

# Plant two transaction outliers, 12 above the largest generated count
outlier_value = df["transactions"].max() + 12
outlier_rows = np.random.choice(df.index, 2, replace=False)
df.loc[outlier_rows, "transactions"] = outlier_value

# Persist the raw frame (without the index) and preview the first rows
csv_path = RAW / "eda_homework.csv"
df.to_csv(csv_path, index=False)
print("Wrote:", csv_path)
df.head()

Wrote: data/raw/eda_homework.csv


Unnamed: 0,date,region,age,income,transactions,spend
0,2021-02-01,West,37.6,28086.81,4,73.35
1,2021-02-02,North,43.0,33034.75,1,52.37
2,2021-02-03,South,38.2,50045.39,2,131.85
3,2021-02-04,South,24.9,39467.28,4,147.58
4,2021-02-05,South,59.8,31201.65,1,86.76


In [34]:
# Structural overview: dtype and non-null count for every column
df.info()
# Per-column count of missing values (rendered as the cell's output)
df.isna().sum(axis=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          160 non-null    datetime64[ns]
 1   region        160 non-null    object        
 2   age           160 non-null    float64       
 3   income        155 non-null    float64       
 4   transactions  160 non-null    int64         
 5   spend         157 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 7.6+ KB


date            0
region          0
age             0
income          5
transactions    0
spend           3
dtype: int64

In [35]:
# Numeric profile: describe() statistics (one row per column) plus skewness and
# kurtosis (pandas reports excess kurtosis, i.e. Fisher's definition).
num_cols = ["age", "income", "transactions", "spend"]
desc = df[num_cols].describe().T
desc["skew"] = df[num_cols].skew()
desc["kurtosis"] = df[num_cols].kurtosis()

# save to reports
desc.to_csv(REPORTS/"stage08_numeric_profile.csv")
df.isna().sum().to_csv(REPORTS/"stage08_missing_counts.csv")

# Keep this as the final statement: Jupyter only auto-displays a bare expression
# when it is the last one in the cell, so a mid-cell `desc` shows nothing.
desc

In [36]:
def save_current(fig, name, dpi=150):
    """Tighten the layout of ``fig``, write it under ``FIGS``, then close it.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        Figure to save.
    name : str
        File name (with extension) of the image, relative to ``FIGS``.
    dpi : int, default 150
        Resolution forwarded to ``fig.savefig``; the default matches the
        previously hard-coded value.
    """
    out = FIGS / name
    fig.tight_layout()
    fig.savefig(out, dpi=dpi)
    # Close explicitly so figures created in loops do not pile up in memory
    plt.close(fig)
    print("Saved", out)

# A) Histograms (missing values are dropped before binning)
for col in ["income","spend","age"]:
    fig, ax = plt.subplots(figsize=(6,4))
    ax.hist(df[col].dropna(), bins=30, alpha=0.8)
    ax.set_title(f"{col} distribution")
    ax.set_xlabel(col); ax.set_ylabel("count")
    save_current(fig, f"hist_{col}.png")

# B) Boxplot
fig, ax = plt.subplots(figsize=(5,4))
# Vertical is boxplot's default orientation, so no orientation argument is passed.
ax.boxplot(df["transactions"].dropna())
# Label the single box (drawn at x=1) through the axis instead of boxplot's
# `labels=` keyword, which is deprecated since Matplotlib 3.9 (renamed
# `tick_labels`) and emits a warning; this form works on old and new versions.
ax.set_xticks([1]); ax.set_xticklabels(["transactions"])
ax.set_title("Transactions outliers")
save_current(fig, "box_transactions.png")

# C) Scatter relationships (x, y) against spend
pairs = [("income","spend"), ("age","spend")]
for x,y in pairs:
    fig, ax = plt.subplots(figsize=(6,4))
    ax.scatter(df[x], df[y], s=14, alpha=0.7)
    ax.set_title(f"{y} vs {x}")
    ax.set_xlabel(x); ax.set_ylabel(y)
    save_current(fig, f"scatter_{y}_vs_{x}.png")

# D) Correlation heatmap; the color scale is pinned to [-1, 1] so zero maps to
# the midpoint of the diverging colormap
corr = df[num_cols].corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(6,5))
cax = ax.imshow(corr.values, cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(range(len(corr.columns))); ax.set_xticklabels(corr.columns, rotation=45, ha="right")
ax.set_yticks(range(len(corr.columns))); ax.set_yticklabels(corr.index)
fig.colorbar(cax)
ax.set_title("Correlation matrix")
save_current(fig, "corr_matrix.png")

# also save correlation table
corr.to_csv(REPORTS/"stage08_correlation.csv")

Saved figures/hist_income.png
Saved figures/hist_spend.png
Saved figures/hist_age.png
Saved figures/box_transactions.png
Saved figures/scatter_spend_vs_income.png
Saved figures/scatter_spend_vs_age.png
Saved figures/corr_matrix.png


  ax.boxplot(df["transactions"].dropna(), vert=True, labels=["transactions"])


## Findings

- **Distributions:**  
  - *Income* is right-skewed (typical of lognormal).  
  - *Spend* is slightly right-tailed with a few high values.  
  - *Transactions* mostly cluster around 3–4 but show some extreme outliers.  
  - *Age* is approximately bell-shaped, clipped between 22–70.  

- **Missingness:**  
  - A few missing values in *income* and *spend*.  
  - Missingness by date or region was not checked here; the gaps were injected at randomly chosen rows, so no structural pattern is expected.  

- **Relationships:**  
  - *Income vs Spend* shows a positive relationship with scatter widened by noise.  
  - *Age vs Spend* shows no strong pattern, suggesting age may not drive spending much.  
  - *Region* was not included in any plot or table above, so a regional shift in the income–spend relationship is still unverified (spend is simulated without a region term).  

- **Correlation:**  
  - Spend correlates positively with both *income* and *transactions*.  
  - Transactions and income are only weakly related, meaning they provide complementary information.

## So What

- The right-skew in income and spend suggests considering **log transforms** before fitting linear models (use `log1p` for *spend*, which is floored at 0 and can therefore be exactly zero).  
- Outliers in transactions could distort models. Options: **cap, winsorize, or robust models**.  
- Region has not been analyzed yet; if a by-region comparison shows that it segments behavior, including it as a categorical feature could improve model accuracy.  
- Missing values in income/spend must be addressed before downstream analysis (imputation or removal).

## Now What

- **Preprocessing:**  
  - Impute missing *income* and *spend* using median or region-based averages.  
  - Consider log-transforming *income* and *spend*.  
  - Decide whether to treat extreme *transactions* values as errors or real high-frequency shoppers.  

- **Feature engineering:**  
  - Create interaction terms (e.g., region × transactions) to capture group-specific effects.  
  - Encode region as categorical dummies for regression models.  

- **Next steps:**  
  - Explore time-based seasonality using the *date* column (weekly or monthly spend patterns).  
  - Build initial baseline regression models (spend ~ income + transactions + age + region).