In [None]:
import os, subprocess, sys, math, warnings, gc
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Silence all warnings so the notebook output stays readable.
# NOTE: this also hides deprecation warnings raised by the plotting libraries.
warnings.filterwarnings("ignore")

# Enable inline figures when running under IPython/Jupyter.
# The ``%matplotlib inline`` magic is IPython-only syntax: under plain Python it
# is a SyntaxError raised at compile time, which a surrounding try/except can
# never catch.  Calling the magic through the IPython API keeps this file valid
# Python while doing the same thing inside a notebook.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # ``get_ipython`` is undefined outside IPython: nothing to configure.
    pass
except Exception as exc:
    # Best effort only: a missing inline backend must not abort the analysis.
    print("Could not enable inline matplotlib backend:", exc)

# --- Candidate input locations (find_file probes them in this order) ---------
COLAB_DRIVE_BASE = Path("/content/drive/My Drive/capstone/Final Data")
MNT_DATA_DIR = Path("/mnt/data")
LOCAL_DATA_DIR = Path(r"C:\Users\mstat\OneDrive\Desktop\EDA\Final Data")

# Last-resort output folder under the current working directory.
# It is created eagerly, even when another output location ends up being used.
OUT_DIR_FALLBACK = Path.cwd().joinpath("eda_output")
OUT_DIR_FALLBACK.mkdir(parents=True, exist_ok=True)

# Metro key -> CSV file name to load.
METRO_BASENAMES = dict(
    phoenix="final_zcta_dataset_phoenix_final_actual.csv",
    memphis="final_zcta_dataset_memphis_final_actual.csv",
    los_angeles="final_zcta_dataset_los_angeles_final_actual.csv",
    dallas="final_zcta_dataset_dallas_final_actual.csv",
)

# Metro key -> display name stored in the ``metro_name`` column.
METRO_NAMES = dict(
    phoenix="Phoenix-Mesa-Chandler, AZ",
    memphis="Memphis, TN-MS-AR",
    los_angeles="Los Angeles-Long Beach-Anaheim, CA",
    dallas="Dallas-Fort Worth-Arlington, TX",
)

# Columns coerced to numeric on load and used for the numeric plots/statistics.
NUMERIC_COLS = [
    "rent_to_income",
    "long45_share",
    "long60_share",
    "commute_min_proxy",
    "ttw_total",
    "zori",
    "stops_per_km2",
]

# --- Sampling caps and output names ------------------------------------------
PROFILE_SAMPLE_N = 5000      # rows sampled for the profile section
DIST_GRID_MAX = 8            # max number of variables in the distribution grid
PAIR_SAMPLE_CAP = 1500       # max rows per metro pairplot
SCATTER_SAMPLE_CAP = 20000   # max rows in the commute-vs-rent scatter
PPTX_FILENAME = "EDA_figures_with_heatmap_metrics_and_profile_and_slopes.pptx"

def pip_install(packages):
    """Make sure each requested package is importable, installing it if not.

    A pip distribution name is frequently different from the module name that
    is imported (``python-pptx`` -> ``pptx``, ``Pillow`` -> ``PIL``).  Probing
    with the distribution name can never succeed for such packages, which
    makes pip run on every execution.  The import name is therefore resolved
    separately from the name handed to pip.

    Args:
        packages: iterable whose items are either
            * a string - the pip name; the import name comes from a small
              table of known aliases, otherwise ``-`` is replaced by ``_``; or
            * a ``(pip_name, import_name)`` pair for anything not in the table.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with an error.
    """
    import importlib

    # Distribution name -> importable module name, for names that differ.
    known_import_names = {
        "python-pptx": "pptx",
        "Pillow": "PIL",
        "pillow": "PIL",
        "scikit-learn": "sklearn",
        "opencv-python": "cv2",
        "PyYAML": "yaml",
        "beautifulsoup4": "bs4",
    }

    for entry in packages:
        if isinstance(entry, (tuple, list)):
            pip_name, import_name = entry
        else:
            pip_name = entry
            import_name = known_import_names.get(entry, entry.replace("-", "_"))
        try:
            importlib.import_module(import_name)
        except ImportError:
            # Only a genuinely missing module triggers an install; any other
            # import-time failure is surfaced instead of being masked.
            print(f"Installing {pip_name} ...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])

# Packages needed for the slide export at the end of the script.
pip_install(["python-pptx", "Pillow"])

# Optional: SciPy.  HAVE_SCIPY records whether the import worked.
try:
    from scipy import stats
except Exception:
    HAVE_SCIPY = False
else:
    HAVE_SCIPY = True

# Optional: statsmodels (both imports must succeed for the regression section).
try:
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
except Exception:
    HAVE_STATSMODELS = False
else:
    HAVE_STATSMODELS = True

# Optional: ydata_profiling for the sampled HTML profile report.
try:
    from ydata_profiling import ProfileReport
except Exception:
    HAVE_PROFILE = False
else:
    HAVE_PROFILE = True

# When running in Colab, mount Google Drive; anywhere else this is a no-op
# because the import (or the mount) fails and the error is ignored.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
except Exception:
    pass

# Choose where outputs go, in order of preference:
#   1. the Google Drive data folder, when it exists;
#   2. an "eda_output" folder next to the local data directory;
#   3. the fallback folder under the current working directory.
if COLAB_DRIVE_BASE.exists():
    OUT_DIR = COLAB_DRIVE_BASE
    print("Using Google Drive OUT_DIR:", OUT_DIR)
elif LOCAL_DATA_DIR.exists():
    OUT_DIR = LOCAL_DATA_DIR.parent / "eda_output"
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    print("Using local OUT_DIR:", OUT_DIR)
else:
    OUT_DIR = OUT_DIR_FALLBACK
    print("Using fallback OUT_DIR:", OUT_DIR)

# All figures (and the slopes CSV) are written below this sub-folder.
FIG_DIR = OUT_DIR.joinpath("figures_with_metrics")
FIG_DIR.mkdir(parents=True, exist_ok=True)

def light_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``period`` parsed and numeric columns coerced.

    ``period`` (when present) is converted with ``pd.to_datetime`` and every
    column listed in ``NUMERIC_COLS`` that exists in the frame is converted
    with ``pd.to_numeric``.  Values that cannot be converted become NaT/NaN.
    The input frame is not modified.
    """
    cleaned = df.copy()

    if "period" in cleaned.columns:
        cleaned["period"] = pd.to_datetime(cleaned["period"], errors="coerce")

    present_numeric = [name for name in NUMERIC_COLS if name in cleaned.columns]
    for name in present_numeric:
        cleaned[name] = pd.to_numeric(cleaned[name], errors="coerce")

    return cleaned

def find_file(fname):
    """Return the first existing path for ``fname``, or ``None`` if not found.

    Search order: the Colab Drive folder, ``/mnt/data``, the local data
    folder, and finally ``fname`` itself relative to the working directory.
    """
    search_roots = (COLAB_DRIVE_BASE, MNT_DATA_DIR, LOCAL_DATA_DIR)
    candidates = [root / fname for root in search_roots]
    candidates.append(Path(fname))
    return next((candidate for candidate in candidates if candidate.exists()), None)

def overlay_text_on_ax(ax, lines, loc='upper right', fontsize=9, bbox=dict(facecolor='white', alpha=0.85, edgecolor='none')):
    """Draw ``lines`` as a monospace text box in a corner of ``ax``.

    Args:
        ax: object providing ``text(...)`` and ``transAxes`` (a matplotlib Axes).
        lines: iterable of items to show, one per line (converted with ``str``).
        loc: ``'upper right'``, ``'upper left'``, ``'lower right'`` or
            ``'lower left'``.  Any other value falls back to ``'upper right'``.
        fontsize: font size passed to ``ax.text``.
        bbox: box style passed to ``ax.text``; ``None`` draws no box.

    The default ``bbox`` dict is created once at definition time and shared by
    every call, so a copy is handed to ``ax.text`` - nothing downstream can
    alter the default (or a caller's dict) for later calls.
    """
    # (x, y, horizontal alignment, vertical alignment) in axes coordinates.
    anchors = {
        'upper right': (0.98, 0.98, 'right', 'top'),
        'upper left': (0.02, 0.98, 'left', 'top'),
        'lower right': (0.98, 0.02, 'right', 'bottom'),
        'lower left': (0.02, 0.02, 'left', 'bottom'),
    }
    x, y, ha, va = anchors.get(loc, anchors['upper right'])

    box_style = dict(bbox) if bbox is not None else None
    txt = "\n".join(str(line) for line in lines)
    ax.text(x, y, txt, transform=ax.transAxes, ha=ha, va=va, fontsize=fontsize, bbox=box_style, family='monospace')

def save_fig_with_metrics(fig, ax, fname, metrics_lines=None, metrics_loc='upper right', embed_metrics=True):
    """Optionally overlay metrics on ``ax``, save ``fig`` under FIG_DIR, close it.

    Args:
        fig: figure to save.
        ax: axes that receives the metrics overlay.
        fname: file name (relative to ``FIG_DIR``) for the saved image.
        metrics_lines: text lines for the overlay; nothing is drawn when empty.
        metrics_loc: corner passed on to ``overlay_text_on_ax``.
        embed_metrics: set to False to skip the overlay entirely.

    Returns:
        The path the figure was written to.
    """
    wants_overlay = embed_metrics and metrics_lines
    if wants_overlay:
        overlay_text_on_ax(ax, metrics_lines, loc=metrics_loc)

    fig.tight_layout()
    target = FIG_DIR / fname
    fig.savefig(target, dpi=150, bbox_inches='tight')

    # Release the figure right away; many figures are produced in one run.
    plt.close(fig)
    gc.collect()
    return target

def summary(series):
    """Return basic statistics of ``series`` as a dict.

    Keys: ``n`` (non-null count), ``mean``, ``median``, ``std`` and
    ``missing_pct`` (percentage of null entries).  Statistics that cannot be
    computed (no values; fewer than two values for ``std``; empty input for
    ``missing_pct``) are NaN.
    """
    valid = series.dropna()
    total = len(series)
    kept = len(valid)

    mean_val = float(valid.mean()) if kept > 0 else np.nan
    median_val = float(valid.median()) if kept > 0 else np.nan
    std_val = float(valid.std()) if kept > 1 else np.nan
    missing_val = float(100 * (total - kept) / total) if total > 0 else np.nan

    result = {}
    result["n"] = int(valid.count())
    result["mean"] = mean_val
    result["median"] = median_val
    result["std"] = std_val
    result["missing_pct"] = missing_val
    return result

# Load every metro CSV that can be located, tag it with its metro, and stack
# the results into one frame.
frames = []    # cleaned per-metro frames
used = {}      # metro key -> path the data was loaded from
missing = []   # (metro key, file name) pairs that could not be found
for key, fname in METRO_BASENAMES.items():
    label = METRO_NAMES.get(key, key)
    p = find_file(fname)
    if p is not None:
        print(f"Loading {label} from: {p}")
        tmp = light_clean(pd.read_csv(p, low_memory=False))
        tmp["metro_key"] = key
        tmp["metro_name"] = label
        frames.append(tmp)
        used[key] = str(p)
    else:
        missing.append((key, fname))

if len(frames) == 0:
    raise SystemExit("No files found. Verify filenames and mount Drive if using Colab.")

df = pd.concat(frames, ignore_index=True)
print("Combined df shape:", df.shape)
for metro_key, source_path in used.items():
    print(" -", metro_key, source_path)
if missing:
    print("Missing files (not found):", missing)

# Persist the combined data plus a small per-column null/dtype diagnostic.
combined_csv = OUT_DIR / "combined_metros_clean.csv"
df.to_csv(combined_csv, index=False)

diag_columns = ['period']
diag_columns += [name for name in NUMERIC_COLS if name in df.columns]
diag_columns += ['metro_key', 'metro_name']
diag = {
    name: {
        "nulls": int(df[name].isna().sum()),
        "non_nulls": int(df[name].notna().sum()),
        "dtype": str(df[name].dtype),
    }
    for name in diag_columns
    if name in df.columns
}
pd.DataFrame.from_dict(diag, orient='index').to_csv(OUT_DIR / "combined_metros_diag.csv")
print("Saved combined CSV and diagnostics to:", OUT_DIR)

# Every produced artifact is registered here as a
# (path, slide title, metrics, metrics-already-embedded flag) tuple; the slide
# export at the end of the script iterates over this list.
saved_images = []

# Fixed-seed sample used by the whole profile section.
sample_size = min(len(df), PROFILE_SAMPLE_N)
profile_sample = df.sample(n=sample_size, random_state=1).copy()
profile_numeric = [name for name in NUMERIC_COLS if name in profile_sample.columns]

if not HAVE_PROFILE:
    print("ydata_profiling not installed — skipped HTML profile. To enable: pip install ydata-profiling")
else:
    try:
        report = ProfileReport(profile_sample, title="Affordability + Commute (sample)", minimal=True)
        prof_html = OUT_DIR / "Affordability_EDA_Profile_sampled.html"
        report.to_file(prof_html)
        saved_images.append(
            (prof_html, "Profile report (sampled HTML)", {"note":"open HTML in Drive or download"}, False)
        )
        # Drop the report object before the plotting sections start.
        del report
        gc.collect()
        print("Saved sampled profile HTML:", prof_html)
    except Exception as err:
        print("ydata_profiling failed:", err)

# Grid of histograms (with KDE) for the first DIST_GRID_MAX numeric columns of
# the profile sample, two panels per row.
num_for_dist = profile_numeric[:DIST_GRID_MAX]
if num_for_dist:
    ncols = 2
    nrows = math.ceil(len(num_for_dist)/ncols)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(9, 3*nrows))
    axes = axes.flatten()

    for ax, col in zip(axes, num_for_dist):
        s = profile_sample[col].dropna()
        sns.histplot(s, bins=40, kde=True, ax=ax)
        ax.set_title(col)
        mean_txt = f"mean={s.mean():.3f}" if s.size>0 else "mean=n/a"
        std_txt = f"std={s.std():.3f}" if s.size>1 else "std=n/a"
        overlay_text_on_ax(ax, [f"n={s.count():,}", mean_txt, std_txt], loc='upper right', fontsize=8)

    # Hide the unused panel when the number of variables is odd.
    for spare_ax in axes[len(num_for_dist):]:
        spare_ax.axis('off')

    dist_grid_path = FIG_DIR / "profile_distributions_grid.png"
    fig.tight_layout()
    fig.savefig(dist_grid_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    gc.collect()
    grid_info = {"n_vars": len(num_for_dist), "sample_n": len(profile_sample)}
    saved_images.append((dist_grid_path, "Profile — distributions (sample)", grid_info, True))

# Per-variable overview of the profile sample (dtype, distinct values, share
# of missing values, and mean/std for the numeric columns), rendered as a
# table image.
var_rows = []
n_sample_rows = len(profile_sample)
for col in profile_sample.columns:
    column = profile_sample[col]
    # Named ``n_missing`` so the module-level ``missing`` list (input files
    # that were not found) is not overwritten by an integer.
    n_missing = int(column.isna().sum())
    row = {
        "variable": col,
        "dtype": str(column.dtype),
        "n_unique": int(column.nunique(dropna=True)),
        "missing_pct": 100 * n_missing / n_sample_rows if n_sample_rows > 0 else np.nan,
    }
    if col in profile_numeric:
        s = column.dropna()
        row["mean"] = float(s.mean()) if len(s)>0 else np.nan
        row["std"] = float(s.std()) if len(s)>1 else np.nan
    var_rows.append(row)

vars_df = pd.DataFrame(var_rows)
if not vars_df.empty:
    max_rows = 50
    display_df = vars_df if len(vars_df) <= max_rows else vars_df.head(max_rows)

    # Round floats for display and blank out NaN (e.g. mean/std of
    # non-numeric variables) so the table stays narrow and readable.
    cell_text = [
        [
            ("" if value != value else f"{value:.3f}") if isinstance(value, float) else str(value)
            for value in record
        ]
        for record in display_df.values.tolist()
    ]
    col_labels = display_df.columns.tolist()

    fig, ax = plt.subplots(figsize=(10, 0.5 + 0.25*len(display_df)))
    ax.axis('off')
    table = ax.table(cellText=cell_text, colLabels=col_labels, loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 1.1)
    # The row limit in the title follows max_rows instead of a second literal.
    ax.set_title(f"Variables overview (sample) — showing up to {max_rows} rows", fontsize=10)

    vars_table_path = FIG_DIR / "profile_variables_overview.png"
    fig.tight_layout()
    fig.savefig(vars_table_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    gc.collect()
    saved_images.append((vars_table_path, "Profile — variables overview (sample)", {"n_vars": len(vars_df), "sample_n": len(profile_sample)}, True))

# Missingness map of the profile sample: one row per variable, one column per
# sampled record, 1 where the value is missing.
miss_df = profile_sample.isna().astype(int)
n_profile_vars = miss_df.shape[1]
fig_height = max(2, 0.12*n_profile_vars)   # grow with the number of variables

fig, ax = plt.subplots(figsize=(10, fig_height))
sns.heatmap(miss_df.transpose(), cbar=False, cmap="Blues", ax=ax)
ax.set_title("Missingness (sample) — rows x vars (1 = missing)")

miss_path = FIG_DIR / "profile_missingness_heatmap_sample.png"
fig.tight_layout()
fig.savefig(miss_path, dpi=150, bbox_inches='tight')
plt.close(fig)
gc.collect()
saved_images.append((miss_path, "Missingness heatmap (sample)", {"sample_n": len(profile_sample)}, True))

# Seaborn theme for all figures created from here on.
sns.set(style="whitegrid", context="notebook")

# Histogram of rent_to_income over the full combined frame, with summary
# statistics overlaid on the plot.
if 'rent_to_income' in df.columns:
    metrics = summary(df['rent_to_income'])

    fig, ax = plt.subplots(figsize=(10,4))
    sns.histplot(df['rent_to_income'].dropna(), bins=60, ax=ax)
    ax.set_title("Distribution: rent_to_income")

    mlines = [f"n = {metrics['n']:,}"]
    mlines += [f"{stat} = {metrics[stat]:.4f}" for stat in ("mean", "median", "std")]
    mlines.append(f"% missing = {metrics['missing_pct']:.1f}%")

    path = save_fig_with_metrics(
        fig,
        ax,
        "rent_to_income_dist_with_metrics.png",
        metrics_lines=mlines,
        embed_metrics=True,
    )
    saved_images.append((path, "Distribution — rent_to_income", metrics, True))

# Scatter of commute_min_proxy vs rent_to_income on a capped random sample,
# with one least-squares line per metro drawn on top and the fitted
# slope / R² / n listed in an overlay.  The fits are also written to a CSV.
# NOTE(review): per_metro_slopes is initialised here but never filled in the
# code shown; the fit results are collected in slope_lines instead.
per_metro_slopes = []
slope_csv_path = FIG_DIR / "per_metro_slopes.csv"
if {'commute_min_proxy','rent_to_income'}.issubset(df.columns):
    # Fixed-seed sample capped at SCATTER_SAMPLE_CAP rows; rows missing either
    # variable are dropped after (re-)coercing both columns to numeric.
    samp = df.sample(n=min(len(df), SCATTER_SAMPLE_CAP), random_state=0).copy()
    samp['commute_min_proxy'] = pd.to_numeric(samp['commute_min_proxy'], errors='coerce')
    samp['rent_to_income'] = pd.to_numeric(samp['rent_to_income'], errors='coerce')
    samp = samp.dropna(subset=['commute_min_proxy','rent_to_income'])
    if not samp.empty:
        fig, ax = plt.subplots(figsize=(9,6))
        # Colour/group by metro_key when available, else metro_name, else no grouping.
        hue = 'metro_key' if 'metro_key' in samp.columns else ('metro_name' if 'metro_name' in samp.columns else None)
        try:
            if hue:
                sns.scatterplot(data=samp, x='commute_min_proxy', y='rent_to_income', hue=hue, alpha=0.35, s=18, ax=ax, legend=False)
            else:
                sns.scatterplot(data=samp, x='commute_min_proxy', y='rent_to_income', alpha=0.35, s=18, ax=ax, legend=False)
        except Exception:
            # Fall back to an ungrouped scatter if the grouped call fails.
            sns.scatterplot(data=samp, x='commute_min_proxy', y='rent_to_income', alpha=0.35, s=18, ax=ax, legend=False)

        ax.set_title("commute_min_proxy vs rent_to_income (sample) — per-metro slopes")

        # Fit-line colours are taken from the matplotlib colour cycle by group
        # index.  NOTE(review): presumably meant to match the scatter hue
        # colours - confirm, since no legend is drawn.
        prop_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
        metros = samp[hue].dropna().unique() if hue else np.array(['all'])
        slope_lines = []

        for i, metro in enumerate(metros):
            try:
                if hue:
                    mdf = samp[samp[hue] == metro][['commute_min_proxy','rent_to_income']].dropna()
                else:
                    mdf = samp[['commute_min_proxy','rent_to_income']].dropna()
                # Groups with fewer than 6 points are skipped (no line, no CSV row).
                if len(mdf) < 6:
                    continue
                x = mdf['commute_min_proxy'].values.astype(float)
                y = mdf['rent_to_income'].values.astype(float)
                # Degree-1 polynomial fit: returns (slope, intercept).
                slope, intercept = np.polyfit(x, y, 1)

                # R² is computed as the squared Pearson correlation of x and y.
                r = np.corrcoef(x, y)[0,1] if len(x) > 1 else np.nan
                r2 = float(r**2) if (not np.isnan(r)) else np.nan
                # Draw the line only over the observed x-range of this group.
                x_min, x_max = np.nanmin(x), np.nanmax(x)
                x_vals = np.linspace(x_min, x_max, 200)
                y_vals = slope * x_vals + intercept
                color = prop_cycle[i % len(prop_cycle)]

                ax.plot(x_vals, y_vals, linewidth=2.0, linestyle='--', color=color, label=None)
                slope_lines.append({
                    "metro": str(metro),
                    "slope": float(slope),
                    "intercept": float(intercept),
                    "r2": None if np.isnan(r2) else float(r2),
                    "n": int(len(mdf))
                })
            except Exception:
                # Any failure for one group silently skips that group.
                continue

        # Overlay text: sample size, then at most 12 per-group lines.
        metrics_lines = [f"Sample n = {len(samp):,}", ""]
        for j, sdict in enumerate(slope_lines[:12]):
            metrics_lines.append(f"{sdict['metro']}: slope={sdict['slope']:.4f}, R²={sdict['r2'] if sdict['r2'] is not None else 'n/a'} (n={sdict['n']})")
        if len(slope_lines) > 12:
            metrics_lines.append(f"... +{len(slope_lines)-12} more groups")
        overlay_text_on_ax(ax, metrics_lines, loc='upper left', fontsize=8)


        # The overlay was already drawn above, so saving is done without embedding.
        scatter_path = save_fig_with_metrics(fig, ax, "commute_vs_rent_scatter_with_per_metro_slopes.png", metrics_lines=None, embed_metrics=False)
        saved_images.append((scatter_path, "Scatter — commute_min_proxy vs rent_to_income (with per-metro slopes)", {"per_metro_count": len(slope_lines)}, True))


        # Export the fits; the CSV is registered with embed flag False so the
        # slide export treats it as a file note rather than a picture.
        if slope_lines:
            pd.DataFrame(slope_lines).to_csv(slope_csv_path, index=False)
            print("Saved per-metro slopes CSV:", slope_csv_path)
            saved_images.append((slope_csv_path, "Per-metro slopes (CSV)", {"rows": len(slope_lines)}, False))

# Correlation heatmap over the numeric columns present in df.  The mean
# absolute off-diagonal correlation is passed along as slide metrics rather
# than drawn on the figure.
numeric_cols = [name for name in NUMERIC_COLS if name in df.columns]
heatmap_metrics = {}
if numeric_cols:
    corr = df[numeric_cols].corr()

    fig, ax = plt.subplots(figsize=(7,6))
    sns.heatmap(corr, annot=True, fmt=".2f", linewidths=0.3, square=True, cbar_kws={"shrink":0.6}, ax=ax)
    ax.set_title("Correlation matrix (numeric columns)")

    m = corr.abs().values
    n_numeric = len(numeric_cols)
    if n_numeric > 1:
        # Sum of all entries minus the diagonal, averaged over the
        # n*(n-1) off-diagonal cells.
        avg_off = (m.sum() - np.trace(m)) / (n_numeric*(n_numeric-1))
    else:
        avg_off = np.nan

    heatmap_metrics = {
        "n_numeric_cols": n_numeric,
        "avg_abs_corr_offdiag": None if np.isnan(avg_off) else float(avg_off),
    }
    heatmap_path = save_fig_with_metrics(
        fig,
        ax,
        "numeric_corr_heatmap_no_metrics.png",
        metrics_lines=None,
        embed_metrics=False,
    )
    saved_images.append((heatmap_path, "Correlation matrix — numeric columns", heatmap_metrics, False))

# Boxplot of rent_to_income per metro, annotated with overall and per-metro
# counts and means.
if {'rent_to_income', 'metro_key'}.issubset(df.columns):
    fig, ax = plt.subplots(figsize=(9,5))
    sns.boxplot(data=df, x='metro_key', y='rent_to_income', ax=ax)
    ax.set_title("rent_to_income by metro")

    per = df.groupby('metro_key')['rent_to_income'].agg(['count','mean']).to_dict(orient='index')
    overall = summary(df['rent_to_income'])

    mlines = [f"overall n={overall['n']:,}, mean={overall['mean']:.4f}", ""]
    for metro_key, stats_row in per.items():
        mlines.append(f"{metro_key}: n={stats_row['count']:,}, mean={stats_row['mean']:.4f}")

    path = save_fig_with_metrics(
        fig,
        ax,
        "rent_by_metro_boxplot_with_metrics.png",
        metrics_lines=mlines,
        embed_metrics=True,
    )
    saved_images.append((path, "Boxplot — rent_to_income by metro", {"overall": overall, "per_metro": per}, True))

# One corner pairplot of the numeric columns per metro.  Each metro is capped
# at PAIR_SAMPLE_CAP rows (fixed seed) to keep rendering fast; a failure for
# one metro is reported and does not stop the others.
for metro in sorted(df['metro_key'].unique()):
    metro_rows = df.loc[df['metro_key'] == metro]
    if not numeric_cols or metro_rows.empty:
        continue

    # Keep rows that have at least one numeric value.
    plot_df = metro_rows[numeric_cols].dropna(how="all")
    if plot_df.empty:
        continue

    sampled_flag = len(plot_df) > PAIR_SAMPLE_CAP
    if sampled_flag:
        plot_df = plot_df.sample(n=PAIR_SAMPLE_CAP, random_state=0)

    outp = FIG_DIR / f"pairplot_{metro}_with_metrics.png"
    try:
        g = sns.pairplot(plot_df, diag_kind="kde", corner=True, plot_kws={"alpha":0.6, "s":20})
        pair_fig = g.fig
        pair_fig.suptitle(f"Pairplot — {metro}", y=1.02)
        pair_fig.text(
            0.98,
            0.98,
            f"n = {len(plot_df):,}\nsampled = {sampled_flag}",
            ha='right',
            va='top',
            fontsize=9,
            bbox=dict(facecolor='white', alpha=0.85),
        )
        pair_fig.set_size_inches(10,10)
        pair_fig.savefig(outp, dpi=150, bbox_inches='tight')
        plt.close(pair_fig)
        gc.collect()
        saved_images.append((outp, f"Pairplot — {metro}", {"n_sampled": len(plot_df), "sampled": sampled_flag}, True))
    except Exception as err:
        print("Pairplot failed for", metro, err)

# OLS regression of rent_to_income on commute_min_proxy plus whichever of the
# optional predictors (stops_per_km2, zori) exist in df.  Produces the text
# summary, a coefficient table image and three residual diagnostics.
reg_required = ['rent_to_income', 'commute_min_proxy']
reg_optional = ['stops_per_km2', 'zori']
if HAVE_STATSMODELS and set(reg_required).issubset(df.columns):
    # Optional predictors are selected only when present: indexing df with a
    # missing column name raises KeyError, which would abort the whole section
    # even though only the two required columns are checked by the guard.
    predictors = ['commute_min_proxy'] + [c for c in reg_optional if c in df.columns]
    model_df = df[['rent_to_income'] + predictors].copy()
    model_df = model_df.dropna()
    if len(model_df) >= 10:
        formula = "rent_to_income ~ " + " + ".join(predictors)
        res = ols(formula, data=model_df).fit()
        with open(OUT_DIR / "ols_summary.txt", "w", encoding="utf-8") as f:
            f.write(str(res.summary()))

        model_metrics = {
            "n": int(res.nobs),
            "r_squared": float(res.rsquared),
            "adj_r_squared": float(res.rsquared_adj),
            "f_stat": float(res.fvalue) if res.fvalue is not None else None,
            "f_pvalue": float(res.f_pvalue) if hasattr(res, "f_pvalue") else None,
        }

        # --- Coefficient table ------------------------------------------------
        conf_int = res.conf_int()
        coefs = pd.DataFrame({
            "coef": res.params,
            "std_err": res.bse,
            "t": res.tvalues,
            "p_value": res.pvalues,
            "ci_lower": conf_int.iloc[:, 0],
            "ci_upper": conf_int.iloc[:, 1],
        }).reset_index().rename(columns={"index": "term"})
        display_cols = ["term","coef","std_err","t","p_value","ci_lower","ci_upper"]
        cell_text = []
        for _, row in coefs.iterrows():
            row_cells = [str(row["term"])]
            for col in display_cols[1:]:
                val = row[col]
                try:
                    fv = float(val)
                except (TypeError, ValueError):
                    row_cells.append(str(val))
                    continue
                # Very large or very small magnitudes use 3 significant
                # digits; everything else uses 4 decimals.
                if abs(fv) >= 1e4 or 0 < abs(fv) < 1e-3:
                    row_cells.append(f"{fv:.3g}")
                else:
                    row_cells.append(f"{fv:.4f}")
            cell_text.append(row_cells)

        fig, ax = plt.subplots(figsize=(10, 0.7 + 0.35*len(cell_text)))
        ax.axis('off')
        table = ax.table(cellText=cell_text, colLabels=display_cols, loc='center', cellLoc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.2)
        ax.set_title("Regression coefficients (OLS): " + formula, fontsize=10)
        ax.text(0.01, 0.95, f"n={model_metrics['n']}, R²={model_metrics['r_squared']:.4f}, adj R²={model_metrics['adj_r_squared']:.4f}", transform=ax.transAxes, fontsize=9, va='top')
        coef_table_path = FIG_DIR / "regression_coefficients_table.png"
        fig.tight_layout()
        fig.savefig(coef_table_path, dpi=150, bbox_inches='tight')
        plt.close(fig)
        gc.collect()
        saved_images.append((coef_table_path, "Regression coefficients (OLS)", {"model_metrics": model_metrics}, True))

        # --- Residuals vs fitted ----------------------------------------------
        fig, ax = plt.subplots(figsize=(7,5))
        sns.scatterplot(x=res.fittedvalues, y=res.resid, alpha=0.6, s=20, ax=ax)
        ax.axhline(0, color='red', linestyle='--', linewidth=1)
        ax.set_xlabel("Fitted")
        ax.set_ylabel("Residuals")
        ax.set_title("Residuals vs Fitted")
        # f_stat may be None (see model_metrics); formatting None with ".2f"
        # raises TypeError, so fall back to "n/a".
        f_text = f"{model_metrics['f_stat']:.2f}" if model_metrics['f_stat'] is not None else "n/a"
        mlines = [
            f"R²={model_metrics['r_squared']:.4f}",
            f"adj R²={model_metrics['adj_r_squared']:.4f}",
            f"F={f_text}",
            f"p(F)={model_metrics.get('f_pvalue')}",
        ]
        path = save_fig_with_metrics(fig, ax, "reg_fitted_vs_resid_with_metrics.png", metrics_lines=mlines, embed_metrics=True)
        saved_images.append((path, "Regression diag — Resid vs Fitted", model_metrics, True))

        # --- QQ plot of residuals ---------------------------------------------
        fig = sm.qqplot(res.resid, line="45", fit=True)
        fig.suptitle("QQ plot (residuals)", y=0.98)
        fig.text(0.98, 0.98, "\n".join([f"R²={model_metrics['r_squared']:.4f}", f"adj R²={model_metrics['adj_r_squared']:.4f}"]), ha='right', va='top', fontsize=9, bbox=dict(facecolor='white', alpha=0.85))
        qq_path = FIG_DIR / "reg_qq_with_metrics.png"
        fig.savefig(qq_path, dpi=150, bbox_inches='tight')
        plt.close(fig)
        gc.collect()
        saved_images.append((qq_path, "Regression diag — QQ plot", model_metrics, True))

        # --- Histogram of residuals -------------------------------------------
        fig, ax = plt.subplots(figsize=(8,4))
        sns.histplot(res.resid, bins=40, kde=True, ax=ax)
        ax.set_title("Histogram of residuals")
        path = save_fig_with_metrics(fig, ax, "reg_resid_hist_with_metrics.png", metrics_lines=mlines, embed_metrics=True)
        saved_images.append((path, "Regression diag — Resid histogram", model_metrics, True))
    else:
        print("Too few rows after dropping NAs to fit regression (need >=10).")
elif not HAVE_STATSMODELS:
    print("statsmodels not available — skipped regression outputs.")
else:
    # statsmodels is available but a required column is absent; say so
    # instead of skipping silently.
    print("Required columns rent_to_income/commute_min_proxy not found — skipped regression outputs.")

# Build one slide per registered artifact: title, optional metrics text box,
# the picture (or a note for HTML/CSV files), and three empty note fields.
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_PARAGRAPH_ALIGNMENT


def _metrics_to_lines(metrics):
    """Turn a metrics value into the text lines shown on a slide."""
    if not isinstance(metrics, dict):
        return [str(metrics)]
    lines = []
    for k, v in metrics.items():
        if isinstance(v, dict):
            small = "; ".join([f"{kk}:{(round(vv,3) if isinstance(vv,(int,float)) else vv)}" for kk,vv in v.items()])
            lines.append(f"{k}: {small}")
        elif isinstance(v, float):
            lines.append(f"{k}: {round(v,4)}")
        else:
            lines.append(f"{k}: {v}")
    return lines


prs = Presentation()

# Prefer a layout whose name contains "Blank"; otherwise fall back to index 6
# (or 0 when the template has fewer layouts).
blank_layout = next(
    (layout for layout in prs.slide_layouts if "Blank" in (getattr(layout, "name", "") or "")),
    None,
)
if blank_layout is None:
    blank_layout = prs.slide_layouts[6] if len(prs.slide_layouts) > 6 else prs.slide_layouts[0]

IMG_WIDTH = Inches(9)
IMG_LEFT = Inches(0.4)
IMG_TOP = Inches(0.9)
NOTES_TOP = Inches(5.6)
# Pictures are placed at IMG_WIDTH with their aspect ratio preserved, so tall
# figures (e.g. the 10x10 pairplots) would run over the note fields that start
# at NOTES_TOP.  Cap the picture height to the space above the notes.
IMG_MAX_HEIGHT = NOTES_TOP - IMG_TOP - Inches(0.1)

for img_path, title, metrics, embed_flag in saved_images:
    slide = prs.slides.add_slide(blank_layout)

    title_box = slide.shapes.add_textbox(Inches(0.4), Inches(0.15), Inches(6.5), Inches(0.5))
    tf = title_box.text_frame
    tf.text = title
    tf.paragraphs[0].font.size = Pt(18)

    # Metrics that are not already drawn inside the image go into a text box
    # at the top right of the slide.
    if not embed_flag and metrics:
        lines = _metrics_to_lines(metrics)
        mbox = slide.shapes.add_textbox(Inches(6.9), Inches(0.15), Inches(2.6), Inches(1.0))
        mtf = mbox.text_frame
        mtf.clear()
        for i, ln in enumerate(lines):
            p = mtf.add_paragraph() if i>0 else mtf.paragraphs[0]
            p.text = ln
            p.font.size = Pt(9)
            p.alignment = PP_PARAGRAPH_ALIGNMENT.LEFT

    path_lower = str(img_path).lower()
    try:
        if path_lower.endswith(".html"):
            note_box = slide.shapes.add_textbox(IMG_LEFT, IMG_TOP, Inches(9), Inches(0.9))
            note_box.text_frame.text = f"Profile HTML saved to: {img_path.name} (open from Drive)"
        elif path_lower.endswith(".csv"):
            note_box = slide.shapes.add_textbox(IMG_LEFT, IMG_TOP, Inches(9), Inches(0.9))
            note_box.text_frame.text = f"CSV saved to: {img_path.name} (open from Drive)"
        else:
            pic = slide.shapes.add_picture(str(img_path), IMG_LEFT, IMG_TOP, width=IMG_WIDTH)
            if pic.height > IMG_MAX_HEIGHT:
                # Shrink proportionally so the picture ends above the notes.
                shrink = IMG_MAX_HEIGHT / pic.height
                pic.width = int(pic.width * shrink)
                pic.height = int(IMG_MAX_HEIGHT)
    except Exception as e:
        # Keep building the deck, but report why the file could not be embedded.
        print(f"Could not embed {img_path}: {e}")
        fail_box = slide.shapes.add_textbox(IMG_LEFT, IMG_TOP, Inches(9), Inches(0.9))
        fail_box.text_frame.text = f"[Failed to embed file: {img_path}]"

    # Empty note fields to be filled in by hand, stacked 0.5in apart.
    notes_left = Inches(0.4)
    notes_top = NOTES_TOP
    notes_w = Inches(9)
    for offset_in, label in ((0.0, "Observation: "), (0.5, "Significance: "), (1.0, "Suggested Next Steps: ")):
        note = slide.shapes.add_textbox(notes_left, notes_top + Inches(offset_in), notes_w, Inches(0.45))
        note.text_frame.text = label
        note.text_frame.paragraphs[0].font.size = Pt(12)

pptx_path = OUT_DIR / PPTX_FILENAME
prs.save(str(pptx_path))
print(f"Saved PPTX ({len(saved_images)} items) to: {pptx_path}")
print("PNG/CSV/HTML files saved to:", FIG_DIR)
if not saved_images:
    print("No images were created — check df and columns.")


Installing python-pptx ...
Installing Pillow ...
Mounted at /content/drive
Using Google Drive OUT_DIR: /content/drive/My Drive/capstone/Final Data
Loading Phoenix-Mesa-Chandler, AZ from: /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_phoenix_final_actual.csv
Loading Memphis, TN-MS-AR from: /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_memphis_final_actual.csv
Loading Los Angeles-Long Beach-Anaheim, CA from: /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_los_angeles_final_actual.csv
Loading Dallas-Fort Worth-Arlington, TX from: /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_dallas_final_actual.csv
Combined df shape: (670, 31)
 - phoenix /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_phoenix_final_actual.csv
 - memphis /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_memphis_final_actual.csv
 - los_angeles /content/drive/My Drive/capstone/Final Data/final_zcta_dataset_los_angeles_final_actual.c

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/31 [00:00<?, ?it/s][A
 35%|███▌      | 11/31 [00:00<00:00, 103.42it/s][A
100%|██████████| 31/31 [00:00<00:00, 96.59it/s] 


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved sampled profile HTML: /content/drive/My Drive/capstone/Final Data/Affordability_EDA_Profile_sampled.html
Saved per-metro slopes CSV: /content/drive/My Drive/capstone/Final Data/figures_with_metrics/per_metro_slopes.csv
Saved PPTX (17 items) to: /content/drive/My Drive/capstone/Final Data/EDA_figures_with_heatmap_metrics_and_profile_and_slopes.pptx
PNG/CSV/HTML files saved to: /content/drive/My Drive/capstone/Final Data/figures_with_metrics
