In [None]:
# ────────────────────────────────────────────────────────────────
# 01_data_prep.ipynb
# Create a clean dataset for modeling (2012-01-01 through 2022-09-30)
# ────────────────────────────────────────────────────────────────

from pathlib import Path

import numpy as np
import pandas as pd

# ─── Load ───────────────────────────────────────────────────────────
df = pd.read_csv("../data/raw/brent_prices.csv")

# Guess the date and price columns from their headers: case-insensitive
# substring match, first matching column wins.
date_col = next(
    (c for c in df.columns if "date" in str(c).lower() or "day" in str(c).lower()),
    None,
)
if date_col is None:
    raise ValueError(
        f"No date column found (looked for 'date' or 'day' in {list(df.columns)})"
    )

price_col = next(
    (
        c
        for c in df.columns
        # Skip the date column so a header such as "price_date" cannot be
        # selected for both roles (it is moved into the index below and
        # would no longer be available as a column).
        if c != date_col
        and any(key in str(c).lower() for key in ("price", "close", "brent"))
    ),
    None,
)
if price_col is None:
    raise ValueError(
        "No price column found (looked for 'price', 'close' or 'brent' in "
        f"{list(df.columns)})"
    )

# Values that cannot be parsed become NaT (errors="coerce"); ambiguous
# numeric dates are interpreted day-first.
df[date_col] = pd.to_datetime(df[date_col], errors="coerce", dayfirst=True)

# Report how many rows are about to be discarded so that a date-format
# problem does not silently shrink the dataset.
n_bad_dates = int(df[date_col].isna().sum())
if n_bad_dates:
    print(
        f"Warning: dropping {n_bad_dates} of {len(df)} rows with missing or "
        f"unparseable dates in column {date_col!r}"
    )

# Chronological order with the date as index.
df = df.dropna(subset=[date_col]).sort_values(date_col).set_index(date_col)

# Keep only the price series, under a fixed column name.
df = df[[price_col]].rename(columns={price_col: "price"})

# Derived features: natural log of the price and its first difference
# (the log return relative to the previous row).
# NOTE: np.log yields -inf for a zero price and NaN for a negative one;
# such values are not filtered here.
log_price = np.log(df["price"])
df["log_price"] = log_price
df["log_return"] = log_price - log_price.shift(1)

# Restrict to the modeling window. Label-based slicing on the date index
# includes both end points. Returns were computed on the full series
# beforehand, so the first row of the window keeps its log_return.
model_start, model_end = "2012", "2022-09-30"
df_model = df.loc[model_start:model_end].copy()

print("Modeling period shape:", df_model.shape)
for preview in (df_model.head(), df_model.tail()):
    print(preview)

# ─── Save ───────────────────────────────────────────────────────────
# Create the output folder first: to_csv fails when the target directory
# does not exist (e.g. on a fresh checkout).
Path("../data/processed").mkdir(parents=True, exist_ok=True)

# Date-indexed version of the modeling window.
dated_csv = "../data/processed/brent_2012_2022.csv"
df_model.to_csv(dated_csv)
print(f"\nSaved → {dated_csv}")

# Also save a version with an integer time index "t" starting from 0
# (useful for PyMC tau). The date moves from the index into a regular column.
df_model_reset = df_model.reset_index()
df_model_reset["t"] = np.arange(len(df_model_reset))

model_ready_csv = "../data/processed/brent_model_ready.csv"
df_model_reset.to_csv(model_ready_csv, index=False)
print(f"Saved indexed version → {model_ready_csv}")