# EDA Final — Fixed: Auto-install missing packages & robust imports

This notebook starts with a bootstrap cell that attempts to install any required Python packages that are missing (this only works when the notebook kernel has internet and pip access). It then performs the same advanced EDA steps. Run the bootstrap cell first to make sure all dependencies are installed.

In [None]:
# Bootstrap: check & install required packages (only if missing).
# This will attempt to install packages into the current Python environment
# (the interpreter running this kernel, via `sys.executable -m pip`).
# If you are in a restricted environment without internet or pip, skip this cell and install manually:
# pip install numpy pandas matplotlib seaborn scikit-learn umap-learn plotly

import importlib
import subprocess
import sys

# (import name, pip distribution name) pairs. The two can differ: for example
# the module `sklearn` is installed with `pip install scikit-learn`.
required = [
    ("numpy", "numpy"),
    ("pandas", "pandas"),
    ("matplotlib", "matplotlib"),
    ("seaborn", "seaborn"),
    ("sklearn", "scikit-learn"),
    ("umap", "umap-learn"),
    ("plotly", "plotly")
]

# Collect the pip name of every package whose module cannot be imported.
to_install = []
for module_name, pkg_name in required:
    try:
        importlib.import_module(module_name)
    except Exception:
        # Any import failure (module absent or installation broken) is
        # treated as "needs installing".
        to_install.append(pkg_name)

if to_install:
    print("Missing packages detected:", to_install)
    print("Attempting to install with pip. This may take a few minutes...")
    try:
        # Use the kernel's own interpreter so packages land in the environment
        # this notebook actually runs in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", *to_install])
    except Exception as e:
        print("Automatic installation failed:", e)
        print("Please install the missing packages manually, e.g.:")
        print("pip install " + " ".join(to_install))
    else:
        # The packages were written to disk after this interpreter started, so
        # the import system's finder caches may not know about them yet.
        # Clearing the caches lets later imports see the new modules.
        importlib.invalidate_caches()
        print("Installation finished. You may need to restart the kernel before re-running imports.")
else:
    print("All required packages appear to be installed.")


In [None]:
# Safe imports and warnings
import os
import sys
import warnings

# Installs a blanket "ignore" filter: from this point on every warning
# category is suppressed (until the warning filters are changed again).
warnings.filterwarnings('ignore')

# All third-party imports are attempted inside one `try`. The first import
# that fails aborts the rest of the body, so every name that would have been
# imported after it stays undefined for later cells.
try:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from IPython.display import display
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import IsolationForest, RandomForestClassifier
    import umap
    import plotly.express as px
    import plotly.graph_objects as go
    print("All imports successful.")
except Exception as e:
    # The failure is reported but not re-raised, so this cell finishes
    # without an error even when an import did not succeed.
    print("Import error:", e)
    print("If this mentions a missing module, run the bootstrap cell above or install manually.")


In [None]:
# Load datasets (adjust paths if necessary)
def load_df(path):
    """Read the CSV file at ``path`` and report the outcome with ``print``.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file on success. If ``path`` does not exist, or
        ``pd.read_csv`` raises, a message is printed and an empty
        ``DataFrame`` is returned instead, so callers can test ``.empty``.
    """
    # Guard clause: nothing to read.
    if not os.path.exists(path):
        print("File not found:", path)
        return pd.DataFrame()

    try:
        frame = pd.read_csv(path)
        print(f"Loaded {path} -> {frame.shape}")
    except Exception as err:
        # Best-effort loader: report the problem and fall back to an empty frame.
        print("Error reading", path, err)
        return pd.DataFrame()
    return frame

# Load the three CSV files in order. Each name is bound to a DataFrame, which
# is empty when the corresponding file could not be loaded.
df1, df3, df_pairs = (
    load_df(csv_path)
    for csv_path in (
        'data/processed/sapimouse_1min_features_v4.csv',
        'data/processed/sapimouse_3min_features_v4.csv',
        'data/processed/pairs_train_v1.csv',
    )
)


## Quick descriptive checks

In [None]:
# One summary line per dataset, followed by a two-row preview when it has data.
for name, df in zip(('1min', '3min', 'pairs'), (df1, df3, df_pairs)):
    if df is not None and not df.empty:
        print(f"{name}: shape={df.shape}, columns={len(df.columns)}")
        display(df.head(2))
        continue
    # Reached only for frames that are missing or have no data.
    print(f"{name}: empty or not loaded")


## Time Series plots (if timestamp-like column found)

In [None]:
if not df1.empty:
    # A column counts as time-like when its name contains "time"
    # (case-insensitive); this also matches names containing "timestamp".
    # str() keeps the check safe for non-string column labels.
    time_cols = [c for c in df1.columns if 'time' in str(c).lower()]
    if time_cols:
        # The first matching column is used as the x axis.
        t = time_cols[0]
        # Plot up to five numeric columns, skipping the time column itself so
        # it is never plotted against itself.
        numeric = [
            c for c in df1.select_dtypes(include=[np.number]).columns if c != t
        ][:5]
        # Rows without a time value are dropped; only the first 1000 rows in
        # time order are drawn.
        df_t = df1.dropna(subset=[t]).sort_values(t).head(1000)
        for col in numeric:
            fig, ax = plt.subplots(figsize=(8, 3))
            ax.plot(df_t[t].values, df_t[col].values, lw=0.8)
            ax.set_title(f"{col} vs {t}")
            ax.set_xlabel(t)
            ax.set_ylabel(col)
            fig.tight_layout()
            plt.show()
            # Release the figure so repeated plotting does not accumulate open figures.
            plt.close(fig)
    else:
        print("No timestamp-like column detected in df1; skipping time series.")


## PCA (2D)

In [None]:
if not df1.empty:
    # Work on numeric columns only, keeping rows with no missing values.
    numeric = df1.select_dtypes(include=[np.number]).dropna()
    n_rows, n_cols = numeric.shape
    if n_rows < 10 or n_cols < 2:
        print("Not enough numeric data for PCA.")
    else:
        # At most 2000 rows are used; the fixed seed makes the draw repeatable.
        sample = numeric.sample(n=min(2000, n_rows), random_state=42)
        # Features are standardized before the projection is fitted.
        scaler = StandardScaler()
        Xs = scaler.fit_transform(sample)
        pca = PCA(n_components=2, random_state=42)
        pcs = pca.fit_transform(Xs)
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.scatter(pcs[:, 0], pcs[:, 1], s=6, alpha=0.7)
        ax.set_title("PCA 2D Projection (df1)")
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        fig.tight_layout()
        plt.show()


## UMAP (2D)

In [None]:
if not df1.empty:
    try:
        # Numeric columns only, restricted to rows without missing values.
        numeric = df1.select_dtypes(include=[np.number]).dropna()
        n_points = min(2000, numeric.shape[0])
        sample = numeric.sample(n=n_points, random_state=42)
        # Two-dimensional embedding with a fixed seed.
        reducer = umap.UMAP(n_components=2, random_state=42)
        emb = reducer.fit_transform(sample)
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.scatter(emb[:, 0], emb[:, 1], s=6, alpha=0.7)
        ax.set_title("UMAP 2D Embedding (df1)")
        fig.tight_layout()
        plt.show()
    except Exception as e:
        # Any failure in sampling, fitting or plotting is reported, not raised.
        print("UMAP error:", e)


## Anomaly Detection (IsolationForest)

In [None]:
if not df1.empty:
    try:
        # Numeric columns only; rows containing missing values are dropped.
        numeric = df1.select_dtypes(include=[np.number]).dropna()
        if numeric.empty:
            print("No complete numeric rows available for anomaly detection.")
        else:
            # Size the sample from the filtered frame, not from df1: dropna()
            # may have removed rows, and sample() raises when n exceeds the
            # number of rows available.
            subset = numeric.sample(n=min(3000, numeric.shape[0]), random_state=42)
            # contamination=0.05 asks the model to flag about 5% of rows.
            iso = IsolationForest(contamination=0.05, random_state=42)
            iso_pred = iso.fit_predict(subset)
            # plot first two numeric dims if available
            if subset.shape[1] >= 2:
                plt.figure(figsize=(6,5))
                # Rows predicted as -1 are coloured as anomalies.
                plt.scatter(subset.iloc[:,0], subset.iloc[:,1], c=(iso_pred==-1), cmap='coolwarm', s=6)
                plt.title("IsolationForest anomalies (red=anomaly)")
                plt.tight_layout(); plt.show()
            else:
                print("Not enough dims to plot anomalies.")
    except Exception as e:
        print("Anomaly detection error:", e)


## Feature importance (RandomForest) — using pairs dataset

In [None]:
if not df_pairs.empty and 'y' in df_pairs.columns:
    try:
        # Candidate features: numeric columns, complete rows only. If 'y' is
        # numeric it takes part in dropna() and is then removed from the features.
        num = df_pairs.select_dtypes(include=[np.number]).dropna()
        X = num.loc[:, num.columns != 'y']
        # Coerce labels to numbers; values that cannot be parsed become NaN.
        y = pd.to_numeric(df_pairs.loc[X.index, 'y'], errors='coerce')
        # Drop rows whose label could not be parsed instead of relabelling
        # them as class 0, which would silently corrupt the training target.
        labelled = y.notna()
        X = X.loc[labelled]
        y = y.loc[labelled].astype(int)
        if X.empty:
            print("No rows with a usable numeric 'y' label; skipping feature importance.")
        else:
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X, y)
            # Keep the 30 largest importances, highest first.
            imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False).head(30)
            plt.figure(figsize=(8,6))
            imp.plot(kind='barh')
            # Flip the axis so the most important feature is drawn at the top.
            plt.gca().invert_yaxis()
            plt.title("Top feature importances (pairs)")
            plt.tight_layout(); plt.show()
    except Exception as e:
        print("Feature importance error:", e)
else:
    print("Pairs dataset missing or no 'y' label.")


## Class separation (PCA on pairs)

In [None]:
if df_pairs.empty or 'y' not in df_pairs.columns:
    print("Pairs dataset missing or no 'y' label.")
else:
    try:
        # Numeric columns with complete rows; the label column is excluded
        # from the features when it is among them.
        num = df_pairs.select_dtypes(include=[np.number]).dropna()
        num = num.loc[:, num.columns != 'y']
        n_points = min(2000, num.shape[0])
        sample = num.sample(n=n_points, random_state=42)
        # Labels for exactly the sampled rows, looked up by index.
        y = df_pairs.loc[sample.index, 'y']
        scaler = StandardScaler()
        scaled = scaler.fit_transform(sample)
        pcs = PCA(n_components=2, random_state=42).fit_transform(scaled)
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.scatter(pcs[:, 0], pcs[:, 1], c=y, cmap='coolwarm', s=6)
        ax.set_title("PCA on pair features colored by class")
        fig.tight_layout()
        plt.show()
    except Exception as e:
        print("Class separation error:", e)


## Interactive Plotly: Scatter Matrix

In [None]:
if df1.empty:
    print("df1 empty; skipping Plotly.")
else:
    try:
        # Only the first six numeric columns go into the matrix.
        numcols = list(df1.select_dtypes(include=[np.number]).columns[:6])
        # At most 1000 rows are plotted, drawn with a fixed seed.
        n_rows = min(1000, df1.shape[0])
        plot_data = df1[numcols].sample(n=n_rows, random_state=42)
        fig = px.scatter_matrix(plot_data)
        fig.update_layout(height=800, width=1000, title="Interactive Scatter Matrix (Plotly)")
        fig.show()
    except Exception as e:
        print("Plotly scatter matrix error:", e)


## Notes
- If you ran the bootstrap installer, **restart the kernel** after installation before re-running imports.
- If your environment lacks internet or permission to install packages, install required packages manually.