In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("Our415_Cleaned_20251008.csv", low_memory=False)

# ---- helpers ----
def to_float(x):
    if pd.isna(x): return np.nan
    s = str(x)
    s = "".join(ch for ch in s if (ch.isdigit() or ch in ".-"))
    try: return float(s)
    except: return np.nan

def iqr_bounds(s, mult=1.5):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - mult*iqr, q3 + mult*iqr

def mad_zscore(s):
    m = s.median()
    mad = (s - m).abs().median()
    if mad == 0: 
        return pd.Series([0]*len(s), index=s.index)
    return 0.6745*(s - m).abs()/mad

# ---- prepare numeric features ----
if "duration_min" in df:
    df["duration_min"] = pd.to_numeric(df["duration_min"], errors="coerce")

price_col = "admission_price" if "admission_price" in df.columns else None
if price_col:
    df["admission_price_num"] = df[price_col].apply(to_float)

# ---- recompute duration from dates when possible ----
for c in ["event_start_date","event_end_date"]:
    if c in df: df[c] = pd.to_datetime(df[c], errors="coerce")

if {"event_start_date","event_end_date"}.issubset(df.columns):
    dur2 = (df["event_end_date"] - df["event_start_date"]).dt.total_seconds()/60
    df["duration_from_dates"] = dur2
    # if original missing/invalid but dates ok, fill it
    df.loc[df["duration_min"].isna() & dur2.notna() & (dur2 >= 0), "duration_min"] = dur2

# ---- rule-based flags ----
# Dates/order
df["is_outlier_dates"] = False
if {"event_start_date","event_end_date"}.issubset(df.columns):
    df.loc[df["event_end_date"] < df["event_start_date"], "is_outlier_dates"] = True

# Duration
df["is_outlier_duration_rule"] = (df["duration_min"] < 0) | (df["duration_min"] == 0) | (df["duration_min"] > 12*60)

# Duration IQR/MAD (robust)
if df["duration_min"].notna().any():
    s = df["duration_min"].dropna()
    lo_iqr, hi_iqr = iqr_bounds(s, mult=1.5)
    z_mad = mad_zscore(s)
    df["is_outlier_duration_iqr"] = False
    df.loc[df["duration_min"] < lo_iqr, "is_outlier_duration_iqr"] = True
    df.loc[df["duration_min"] > hi_iqr, "is_outlier_duration_iqr"] = True
    df["is_outlier_duration_mad"] = False
    df.loc[s.index[z_mad > 3], "is_outlier_duration_mad"] = True
else:
    df["is_outlier_duration_iqr"] = False
    df["is_outlier_duration_mad"] = False

# Price
if price_col:
    s = df["admission_price_num"]
    lo_iqr_p, hi_iqr_p = iqr_bounds(s.dropna(), mult=1.5) if s.notna().sum() else (np.nan, np.nan)
    df["is_outlier_price_rule"] = (s < 0) | (s > s.dropna().quantile(0.99))
    df["is_outlier_price_iqr"]  = False if np.isnan(lo_iqr_p) else ((s < lo_iqr_p) | (s > hi_iqr_p))

# Geo box (SF rough bbox)
lat_ok = df["latitude"].astype(float).between(37.70, 37.84, inclusive="both") if "latitude" in df else pd.Series(False, index=df.index)
lon_ok = df["longitude"].astype(float).between(-122.52, -122.35, inclusive="both") if "longitude" in df else pd.Series(False, index=df.index)
df["is_outlier_geo"] = False
if "latitude" in df and "longitude" in df:
    df.loc[~(lat_ok & lon_ok), "is_outlier_geo"] = True

# ---- optional: winsorize features you will model on (do NOT overwrite raw columns) ----
df["duration_min_w"] = df["duration_min"]
if df["duration_min"].notna().any():
    lo, hi = iqr_bounds(df["duration_min"].dropna(), mult=1.5)
    df["duration_min_w"] = df["duration_min"].clip(lower=lo, upper=hi)

if price_col:
    df["admission_price_w"] = df["admission_price_num"]
    if df["admission_price_num"].notna().any():
        lo, hi = iqr_bounds(df["admission_price_num"].dropna(), mult=1.5)
        df["admission_price_w"] = df["admission_price_num"].clip(lower=lo, upper=hi)

# ---- quick report ----
flags = [c for c in df.columns if c.startswith("is_outlier_")]
print(df[flags].sum().sort_values(ascending=False))

df.to_csv("Our415_Cleaned_20251008_with_outlier_flags.csv", index=False)
print("Saved: Our415_Cleaned_20251008_with_outlier_flags.csv")


  if c in df: df[c] = pd.to_datetime(df[c], errors="coerce")
  if c in df: df[c] = pd.to_datetime(df[c], errors="coerce")


KeyError: 'duration_min'