In [6]:
# === Our415: Date & Time Cleaning + Calendar Export ===
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# --- Load dataset directly ---
csv_path = "Our415_Events_and_Activities_20251008.csv"
df = pd.read_csv(csv_path, low_memory=False)
print(f"âœ… Loaded {df.shape[0]} rows Ã— {df.shape[1]} columns")

# --- Parse each safely (non-destructive) ---
from dateutil import parser

def safe_parse(x):
    try:
        return parser.parse(str(x))
    except Exception:
        return pd.NaT

df["start_parsed"] = df["start_date"].apply(safe_parse)

for c in date_cols:
    df[f"{c}_parsed"] = pd.to_datetime(df[c], errors="coerce")

# --- Derive core calendar fields from start date ---
start_col = next((c for c in df.columns if "start" in c.lower() and "parsed" in c.lower()), None)
if start_col is None:
    start_col = next((c for c in df.columns if "date" in c.lower() and "parsed" in c.lower()), None)

if start_col is None:
    raise ValueError("Couldn't detect a valid start date column automatically. Check your column names.")

df["event_year"] = df[start_col].dt.year
df["event_month"] = df[start_col].dt.month_name()
df["event_day"] = df[start_col].dt.day
df["event_weekday"] = df[start_col].dt.day_name()
df["event_hour"] = df[start_col].dt.hour
df["event_date"] = df[start_col].dt.date
df["is_weekend"] = df["event_weekday"].isin(["Saturday", "Sunday"])

# --- Optional: event duration if end column exists ---
end_col = next((c for c in df.columns if "end" in c.lower() and "parsed" in c.lower()), None)
if end_col:
    df["event_duration_days"] = (df[end_col] - df[start_col]).dt.days

# --- Build grouped calendar summaries ---
daily_counts = (
    df.dropna(subset=["event_date"])
      .groupby("event_date")
      .size()
      .reset_index(name="event_count")
      .sort_values("event_date")
)
weekday_hour = (
    df.dropna(subset=["event_weekday", "event_hour"])
      .pivot_table(index="event_weekday", columns="event_hour",
                   values=start_col, aggfunc="count", fill_value=0)
      .reindex(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"])
)

# --- Create output folder and save ---
out_dir = Path("data_calendar")
out_dir.mkdir(exist_ok=True)

clean_csv = out_dir / "our415_with_parsed_dates.csv"
daily_csv = out_dir / "daily_event_counts.csv"
matrix_csv = out_dir / "weekday_hour_matrix.csv"

df.to_csv(clean_csv, index=False)
daily_counts.to_csv(daily_csv, index=False)
weekday_hour.to_csv(matrix_csv)

print(f"\nâœ… Cleaned dataset saved to: {clean_csv}")
print(f"âœ… Daily event counts saved to: {daily_csv}")
print(f"âœ… Weekday-hour matrix saved to: {matrix_csv}")

# --- Quick line chart for validation ---
plt.figure(figsize=(12,4))
plt.plot(daily_counts["event_date"], daily_counts["event_count"])
plt.title("Our415 â€” Daily Event Counts (Post Date Cleaning)")
plt.xlabel("Date")
plt.ylabel("Event Count")
plt.tight_layout()
plt.savefig(out_dir / "daily_event_counts.png", dpi=150, bbox_inches="tight")
plt.show()

print(f"ðŸ“Š Chart saved to: {out_dir / 'daily_event_counts.png'}")

# --- Preview key outputs ---
display(df.head(5))
display(daily_counts.head(10))
display(weekday_hour.head(7))




âœ… Loaded 2039 rows Ã— 31 columns


KeyError: 'start_date'