In [3]:
!pip install pandas matplotlib



In [26]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment so later os.getenv() lookups such as DATA_DIR can see them.
load_dotenv()

import sys
# Put ../src (relative to the current working directory) on the import path so
# the `utils` import below can resolve; presumably utils.py lives there —
# confirm against the repository layout.
sys.path.append(str(Path.cwd() / "../src/"))
from utils import get_summary_stats, choose_group_col, group_by_and_agg


In [None]:
# Micro-benchmark: vectorized NumPy multiply vs. an explicit Python loop.
import numpy as np

n = 200_000  # number of elements in the benchmark array
a = np.arange(n, dtype=np.float64)  # float64 values 0.0, 1.0, ..., n - 1

# Time the vectorized multiply; the product is computed again on the next line
# so that a result is bound to a name for printing later.
%timeit a * 2
vec = a * 2

def loop_double(x):
    """Return a new array holding every element of ``x`` multiplied by 2.

    Deliberately implemented as an element-by-element Python loop: it is the
    slow baseline that the vectorized ``a * 2`` is timed against.

    Parameters
    ----------
    x : array-like
        Input values; iterated along its first axis.

    Returns
    -------
    numpy.ndarray
        Freshly allocated array with the same shape and dtype as ``x``.
    """
    doubled = np.empty_like(x)
    position = 0
    for element in x:
        doubled[position] = element * 2
        position += 1
    return doubled

# Time the pure-Python loop on the same array for comparison with `a * 2`.
%timeit loop_double(a)
# Print the first few elements of the vectorized result.
print("first 5 vectorized:", vec[:5])


88 μs ± 1.43 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# Resolve the input CSV: $DATA_DIR if set in the environment, otherwise ./data.
data_path = Path(os.getenv("DATA_DIR", "data")) / "starter_data.csv"
print("Looking for:", data_path)

# Fail early with an actionable message when the file is missing.
if not data_path.exists():
    raise FileNotFoundError(f"{data_path} not found. Place starter_data.csv at this path.")

df = pd.read_csv(data_path)
print("Shape:", df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output. Call it directly.
df.info()
display(df.head())


In [None]:
# Compute summary statistics with the project helper and show them inline.
summary = get_summary_stats(df)
display(summary)

# Output directory for derived artefacts; created on demand. `out_dir` is also
# used by the groupby and plotting cells further down.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Persist the same table in two formats.
summary_csv_path = out_dir / "summary.csv"
summary_json_path = out_dir / "summary.json"
summary.to_csv(summary_csv_path)
summary.to_json(summary_json_path, orient="table")
print("Saved summary.csv and summary.json to", out_dir)


In [None]:
# Prefer an explicit 'category' column; otherwise let the project helper pick
# a grouping column (the code below treats None as "nothing suitable").
if 'category' in df.columns:
    group_col = 'category'
else:
    group_col = choose_group_col(df)
print("Using group column:", group_col)

if group_col is None:
    print("No suitable categorical column found; skipping groupby.")
else:
    grouped = group_by_and_agg(df, group_col)
    display(grouped.head())
    # Build the destination once and reuse it for both the write and the log line.
    grouped_path = out_dir / f"grouped_by_{group_col}.csv"
    grouped.to_csv(grouped_path, index=False)
    print("Saved grouped results to", grouped_path)


In [15]:
# Bonus: basic plot (first numeric column)
import matplotlib.pyplot as plt

# Histogram of the first numeric column, written to out_dir as a PNG.
# `out_dir` comes from the summary cell, which must have been run first.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    col = num_cols[0]
    # Use explicit Figure/Axes handles instead of pyplot's implicit "current
    # figure" so every call below targets exactly this figure.
    fig, ax = plt.subplots(figsize=(6, 4))
    df[col].hist(bins=30, ax=ax)
    ax.set_title(f"Histogram of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("count")
    fig.tight_layout()
    plot_path = out_dir / f"{col}_hist.png"
    fig.savefig(plot_path)
    # Close this specific figure once it is on disk so it does not linger in
    # pyplot's figure registry.
    plt.close(fig)
    print("Saved plot to", plot_path)
else:
    print("No numeric columns to plot.")


Saved plot to data/processed/value_hist.png
