<a href="https://colab.research.google.com/github/Vyshnavi2k5/2k25/blob/main/copy_of_unsupervised_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Unsupervised Learning Suite for Healthcare Dataset
# Colab/Jupyter ready. Copy-paste entire cell and run.
# Produces outputs in "./unsupervised_reports/" by default.

import os
import zipfile
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
from sklearn.svm import OneClassSVM
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Optional libraries (UMAP, mlxtend) - install if missing (Colab)
try:
    import umap
except Exception:
    try:
        !pip install --quiet umap-learn
        import umap
    except Exception:
        umap = None

try:
    from mlxtend.frequent_patterns import apriori, association_rules
except Exception:
    try:
        !pip install --quiet mlxtend
        from mlxtend.frequent_patterns import apriori, association_rules
    except Exception:
        apriori = None
        association_rules = None

# ---------------- CONFIG ----------------
# Seed passed as random_state to the row sampling, PCA, KMeans,
# GaussianMixture and IsolationForest steps further down.
RANDOM_STATE = 42
USE_SAMPLE = True          # Set False to run on the full dataset (can be slow)
SAMPLE_SIZE = 4000         # rows drawn when USE_SAMPLE=True and the dataset has more rows than this
TOP_K_CAT = 50             # keep the top-K categories of each categorical column, rest -> 'Other'
# Folder (relative to the working directory) that receives every CSV/PNG output.
OUTPUT_DIR = "unsupervised_reports"
SEARCH_PATHS = [".", "/content", "/mnt/data"]  # roots searched recursively for the uploaded file
# ----------------------------------------

# Create the output folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Helpers ----------
def find_input_file(search_paths=None, exclude_dirs=None):
    """Locate the dataset file to load.

    Every root is searched recursively. A file whose lower-cased name contains
    "health" and ends in ".zip" or ".csv" is preferred; only when no such file
    exists under any root is an arbitrary "*.csv" accepted instead.

    Args:
        search_paths: Directories to search, in priority order. Defaults to
            the module-level SEARCH_PATHS.
        exclude_dirs: Directories whose contents are never returned. Defaults
            to (OUTPUT_DIR,) so report CSVs written by an earlier run are not
            mistaken for the input dataset.

    Returns:
        The path of the selected file as a string, or None if nothing matches.
    """
    roots = SEARCH_PATHS if search_paths is None else search_paths
    skipped = (OUTPUT_DIR,) if exclude_dirs is None else exclude_dirs
    excluded = [Path(d).resolve() for d in skipped]

    def _is_excluded(path):
        # Compare resolved paths so "." and absolute roots are treated alike.
        parents = path.resolve().parents
        return any(d in parents for d in excluded)

    def _is_health_file(name):
        # "healthcare" already contains "health": one substring test is enough.
        return "health" in name and name.endswith((".zip", ".csv"))

    def _first_match(pattern, accept):
        for root in roots:
            base = Path(root)
            if not base.is_dir():
                continue
            # Sort so the pick does not depend on filesystem listing order;
            # shallower paths come first, favouring files uploaded at the root.
            hits = sorted(base.rglob(pattern), key=lambda p: (len(p.parts), str(p)))
            for p in hits:
                if p.is_file() and accept(p.name.lower()) and not _is_excluded(p):
                    return str(p)
        return None

    preferred = _first_match("*", _is_health_file)
    if preferred is not None:
        return preferred
    # fallback: any csv
    return _first_match("*.csv", lambda name: True)

def safe_ohe(**kwargs):
    """Build a dense, unknown-tolerant OneHotEncoder on any sklearn version.

    The keyword for dense output is tried as ``sparse_output`` first; if the
    constructor rejects the call with a TypeError, ``sparse`` is used instead.
    Extra keyword arguments are forwarded to the encoder unchanged.
    """
    options = dict(handle_unknown="ignore", **kwargs)
    try:
        encoder = OneHotEncoder(sparse_output=False, **options)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **options)
    return encoder

def reduce_cardinality(df, col, top_k=TOP_K_CAT):
    """Collapse the rare values of ``df[col]`` into a single 'Other' bucket.

    The column is converted to strings (so mixed types count consistently),
    the ``top_k`` most frequent values are kept and every other value is
    replaced by "Other". ``df`` is modified in place and also returned.
    """
    as_text = df[col].astype(str)
    frequent = as_text.value_counts().nlargest(top_k).index
    is_frequent = as_text.isin(frequent)
    df[col] = as_text.where(is_frequent, other="Other")
    return df

def save_scatter(X2, labels, title, fname):
    """Save a 2-D scatter plot, one colour per label, as OUTPUT_DIR/fname.

    Args:
        X2: Array indexed as X2[mask, 0] / X2[mask, 1]; the first two columns
            are used as x and y coordinates.
        labels: One label per row of X2; each distinct value becomes a legend
            entry.
        title: Plot title.
        fname: File name of the image inside OUTPUT_DIR.
    """
    labs = np.asarray(labels)
    fig = plt.figure(figsize=(7, 5))
    try:
        # One scatter call per label: each gets its own legend entry and the
        # next colour of Matplotlib's default cycle. The former
        # plt.cm.get_cmap(...) lookup was never used and that function no
        # longer exists in Matplotlib >= 3.9, so it is gone.
        for u in np.unique(labs):
            mask = labs == u
            plt.scatter(X2[mask, 0], X2[mask, 1], s=12, alpha=0.7, label=str(u))
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small', markerscale=2)
        plt.title(title)
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, fname), dpi=150)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

# ---------- Load dataset ----------
input_file = find_input_file()
if input_file is None:
    raise FileNotFoundError("No input CSV/ZIP found. Upload 'Healthcare dataset.zip' or 'healthcare_dataset.csv' to Colab/Jupyter.")
print("Input file:", input_file)

if input_file.lower().endswith(".zip"):
    extract_dir = "/tmp/healthcare_extract"
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(input_file, "r") as z:
        # Pick the CSV from this archive's own member list rather than from
        # whatever is lying in extract_dir: that avoids stale files left by
        # earlier runs and the "__MACOSX/._name.csv" stubs macOS adds to zips.
        # Sorting makes the choice independent of archive/filesystem order.
        csv_members = sorted(
            member
            for member in z.namelist()
            if member.lower().endswith(".csv")
            and not member.startswith("__MACOSX/")
            and not Path(member).name.startswith(".")
        )
        if not csv_members:
            raise FileNotFoundError("No CSV found inside the ZIP.")
        # extract() returns the path of the file it created under extract_dir.
        csv_path = z.extract(csv_members[0], extract_dir)
else:
    csv_path = input_file

print("Loading CSV:", csv_path)
# low_memory=False reads the file in one pass so column dtypes are inferred
# from the whole column instead of chunk by chunk.
df = pd.read_csv(csv_path, low_memory=False)
print("Rows loaded:", len(df))

# ---------- Optional sampling (speed) ----------
# Work on a reproducible random subset when the dataset exceeds SAMPLE_SIZE.
needs_sampling = USE_SAMPLE and len(df) > SAMPLE_SIZE
if needs_sampling:
    sampled = df.sample(SAMPLE_SIZE, random_state=RANDOM_STATE)
    df = sampled.reset_index(drop=True)
    print("Sampled rows:", len(df))

# ---------- Feature engineering (safe) ----------
# Parse whichever of the date columns exist; unparseable values become NaT.
date_cols_to_process = ["Date of Admission", "Discharge Date"]
present_date_cols = [name for name in date_cols_to_process if name in df.columns]
for name in present_date_cols:
    df[name] = pd.to_datetime(df[name], errors="coerce")

has_admission = "Date of Admission" in df.columns
has_discharge = "Discharge Date" in df.columns

if has_admission and has_discharge:
    # Length of stay in whole days; negative spans are clipped to 0 and
    # missing dates count as 0.
    stay = df["Discharge Date"] - df["Date of Admission"]
    df["LOS_Days"] = stay.dt.days.clip(lower=0).fillna(0)

if has_admission:
    # The column is already datetime here, so its parts can be read directly;
    # rows with a missing date get 0 for every part.
    admitted = df["Date of Admission"].dt
    calendar_parts = (
        ("Admission_Year", admitted.year),
        ("Admission_Month", admitted.month),
        ("Admission_DOW", admitted.dayofweek),
    )
    for new_col, part in calendar_parts:
        df[new_col] = part.fillna(0).astype(int)

# Drop obvious identifiers if present.
id_cols = [name for name in ("Name", "Patient ID", "Doctor", "Room Number") if name in df.columns]
if id_cols:
    df.drop(columns=id_cols, inplace=True)

# Cast object columns to str and strip surrounding whitespace.
text_cols = df.select_dtypes(include=["object"]).columns
for name in text_cols:
    df[name] = df[name].astype(str).str.strip()

# ---------- Detect numeric & categorical ----------
def _split_feature_columns(frame):
    """Return (numeric, categorical) column-name lists for *frame*.

    Columns are classified by dtype (NumPy numeric vs. everything else); the
    raw date columns listed in date_cols_to_process are left out of both.
    """
    numeric = frame.select_dtypes(include=[np.number]).columns.tolist()
    non_numeric = frame.select_dtypes(exclude=[np.number]).columns.tolist()
    return (
        [name for name in numeric if name not in date_cols_to_process],
        [name for name in non_numeric if name not in date_cols_to_process],
    )


numeric_cols, categorical_cols = _split_feature_columns(df)

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

# ---------- Reduce cardinality for high-cardinality categoricals ----------
for col in tuple(categorical_cols):
    if df[col].nunique() <= TOP_K_CAT:
        continue
    df = reduce_cardinality(df, col, top_k=TOP_K_CAT)
    print(f"Reduced cardinality for {col}: now {df[col].nunique()} uniques")

# Recompute both lists now that the reduction has rewritten columns of df.
numeric_cols, categorical_cols = _split_feature_columns(df)


if not (numeric_cols or categorical_cols):
    raise ValueError("No usable numeric or categorical columns found in the dataset.")

# ---------- Preprocessing ----------
# Scale numeric columns and one-hot encode categorical ones; a step is only
# registered when it has at least one column to work on.
ohe = safe_ohe()
scaler = StandardScaler()
candidate_steps = [
    ("num", scaler, numeric_cols),
    ("cat", ohe, categorical_cols),
]
transformers = [step for step in candidate_steps if step[2]]

preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")
X = preprocessor.fit_transform(df)
print("Feature matrix shape (after encoding):", X.shape)

# Human-readable names for the encoded columns: numeric names first, then one
# "<column>__<category>" entry per one-hot column.
feature_names = list(numeric_cols)
if categorical_cols:
    try:
        fitted_ohe = preprocessor.named_transformers_["cat"]
        for col, levels in zip(categorical_cols, fitted_ohe.categories_):
            feature_names.extend(f"{col}__{str(level)}" for level in levels)
    except Exception:
        # Fallback: generic names for whatever encoded columns are unnamed.
        n_unnamed = X.shape[1] - len(feature_names)
        feature_names.extend(f"cat_{i}" for i in range(n_unnamed))

# ---------- Dimensionality reduction ----------
# PCA cannot extract more components than min(n_samples, n_features), so the
# cap of 30 is bounded by both dimensions of X (small datasets used to crash).
n_pca = min(30, X.shape[0], X.shape[1])
if n_pca <= 0:
    raise ValueError("No features after preprocessing.")
pca = PCA(n_components=n_pca, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X)
print("PCA explained variance (cumulative first 5):", np.cumsum(pca.explained_variance_ratio_)[:5])
# The plots and CSVs below always expect two embedding columns; when only one
# component exists, pad the second axis with zeros instead of failing.
if X_pca.shape[1] >= 2:
    X_2d = X_pca[:, :2]
else:
    X_2d = np.column_stack([X_pca[:, 0], np.zeros(X_pca.shape[0])])
pd.DataFrame(X_2d, columns=["pca1", "pca2"]).to_csv(os.path.join(OUTPUT_DIR, "embed_pca2.csv"), index=False)

# ---------- Clustering ----------
# One dict per clustering run, appended by add_cluster_result().
cluster_summary = []

def add_cluster_result(name, labels, X_for_metrics):
    """Append one row of quality metrics for a clustering to cluster_summary.

    Args:
        name: Identifier of the run, stored under "method".
        labels: One cluster label per sample; -1 is treated as noise (the
            label DBSCAN gives to unclustered points).
        X_for_metrics: Feature matrix the labels refer to; indexed with a
            boolean mask, so it must support NumPy-style row selection.

    The row holds the number of real clusters (noise excluded) plus the
    silhouette and Davies-Bouldin scores computed on non-noise samples. Both
    scores are None when fewer than two clusters or two non-noise samples
    exist, or when the metric computation fails.
    """
    labels_arr = np.asarray(labels)
    clustered = labels_arr != -1
    # Noise (-1) is not a cluster, so it is excluded from the count as well as
    # from the metrics.
    n_clusters = len(np.unique(labels_arr[clustered]))
    sil = None
    db = None
    if clustered.sum() > 1 and n_clusters > 1:
        try:
            sil = float(silhouette_score(X_for_metrics[clustered], labels_arr[clustered]))
            db = float(davies_bouldin_score(X_for_metrics[clustered], labels_arr[clustered]))
        except Exception as exc:
            # Metrics are best-effort: record the run without scores, but say
            # why instead of failing silently.
            sil = None
            db = None
            print(f"Cluster metrics unavailable for {name}: {exc}")
    cluster_summary.append({"method": name, "n_clusters": int(n_clusters), "silhouette": sil, "davies_bouldin": db})

def _record_clustering(method, column, labels, title, png_name):
    """Log metrics, store labels in df[column] and save the scatter plot."""
    add_cluster_result(method, labels, X_pca)
    df[column] = labels
    save_scatter(X_2d, labels, title, png_name)


# KMeans (several k)
for k in (3, 4, 5, 6, 8):
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    _record_clustering(
        f"KMeans_k{k}", f"kmeans_k{k}_cluster", km.fit_predict(X_pca),
        f"KMeans k={k}", f"kmeans_k{k}.png",
    )

# Agglomerative
for k in (3, 4, 5, 8):
    agg = AgglomerativeClustering(n_clusters=k)
    _record_clustering(
        f"Agglomerative_k{k}", f"agg_k{k}_cluster", agg.fit_predict(X_pca),
        f"Agglomerative k={k}", f"agg_k{k}.png",
    )

# DBSCAN - eps heuristic from k-distance: take the 90th percentile of each
# point's distance to the last of its 6 queried neighbours, falling back to
# 0.5 when that value is not positive.
nn = NearestNeighbors(n_neighbors=6).fit(X_pca)
distances, _ = nn.kneighbors(X_pca)
kdist = np.sort(distances[:, -1])
eps = float(np.percentile(kdist, 90))
if not eps > 0:
    eps = 0.5
db = DBSCAN(eps=eps, min_samples=5)
_record_clustering(
    f"DBSCAN_eps{eps:.3f}", "dbscan_cluster", db.fit_predict(X_pca),
    f"DBSCAN eps={eps:.3f}", "dbscan.png",
)

# Gaussian Mixture
for k in (3, 4, 5, 8):
    gm = GaussianMixture(n_components=k, random_state=RANDOM_STATE, n_init=3)
    _record_clustering(
        f"GMM_k{k}", f"gmm_k{k}_cluster", gm.fit_predict(X_pca),
        f"GMM k={k}", f"gmm_k{k}.png",
    )

# ---------- Outlier / Anomaly detection ----------
def _store_predictions(column, header, preds, csv_name):
    """Keep detector output in df[column] and write it to OUTPUT_DIR/csv_name."""
    df[column] = preds
    pd.DataFrame({header: preds}).to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)


# IsolationForest: -1 anomaly, 1 normal
iso = IsolationForest(n_estimators=200, contamination=0.01, random_state=RANDOM_STATE)
iso_preds = iso.fit_predict(X_pca)
_store_predictions("iso_anomaly", "iso_pred", iso_preds, "isolation_preds.csv")

# LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01, novelty=False)
lof_preds = lof.fit_predict(X_pca)
_store_predictions("lof_anomaly", "lof_pred", lof_preds, "lof_preds.csv")

# One-Class SVM (may be slow); a failure is reported and the run continues.
try:
    ocsvm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.01)
    oc_preds = ocsvm.fit(X_pca).predict(X_pca)
    _store_predictions("ocsvm_anomaly", "ocsvm_pred", oc_preds, "ocsvm_preds.csv")
except Exception as e:
    print("OneClassSVM failed:", e)

# ---------- Association rules (optional) ----------
if apriori is not None and association_rules is not None:
    # Pick a few categorical columns for association mining, if available
    # (checked against the current categorical_cols list).
    ar_candidates = ["Medication", "Medical Condition", "Hospital", "Insurance Provider", "Blood Type"]
    ar_cols = [c for c in ar_candidates if c in df.columns and c in categorical_cols]
    if ar_cols:
        # One indicator column per (column, value) pair. Build all pieces and
        # concatenate once; cast to bool, the dtype apriori works with
        # natively (older pandas returns 0/1 integers from get_dummies).
        indicator_parts = [
            pd.get_dummies(df[c].fillna("NA").astype(str), prefix=c) for c in ar_cols
        ]
        trans = pd.concat(indicator_parts, axis=1).astype(bool)
        # Limit width so the itemset search stays tractable.
        if trans.shape[1] > 500:
            trans = trans.iloc[:, :500]
        freq_items = apriori(trans, min_support=0.01, use_colnames=True)
        freq_items.to_csv(os.path.join(OUTPUT_DIR, "apriori_itemsets.csv"), index=False)
        if freq_items.empty:
            # association_rules() raises on an empty itemset table, so stop
            # here instead of crashing the whole run.
            print("No frequent itemsets found at min_support=0.01 - skipping association rules.")
        else:
            rules = association_rules(freq_items, metric="lift", min_threshold=1.2)
            rules.to_csv(os.path.join(OUTPUT_DIR, "apriori_rules.csv"), index=False)
            print("Saved apriori outputs.")
    else:
        print("No suitable categorical columns for association rules - skipping.")
else:
    print("mlxtend not available - skipping association rules (you can install mlxtend).")

# ---------- Save summaries & outputs ----------
overview_path = os.path.join(OUTPUT_DIR, "cluster_summary_overview.csv")
pd.DataFrame(cluster_summary).to_csv(overview_path, index=False)
print("Saved cluster_summary_overview.csv")

# Choose the clustering to explain: the first available preferred KMeans run,
# otherwise the first "*_cluster" column, otherwise none.
cluster_cols = [c for c in df.columns if c.endswith("_cluster")]
preferred_cols = ("kmeans_k4_cluster", "kmeans_k5_cluster", "kmeans_k3_cluster")
chosen_col = next((name for name in preferred_cols if name in df.columns), None)
if chosen_col is None and cluster_cols:
    chosen_col = cluster_cols[0]

if chosen_col:
    print("Explaining clusters using:", chosen_col)
    cluster_feature_summary = []
    for cl in sorted(df[chosen_col].unique()):
        sub = df[df[chosen_col] == cl]
        row = {"cluster": int(cl) if pd.notna(cl) else "nan", "size": len(sub)}
        # Mean of every numeric feature within the cluster.
        row.update({f"mean__{n}": sub[n].mean() for n in numeric_cols if n in sub.columns})
        # Most frequent value of every categorical feature within the cluster.
        for c in categorical_cols:
            if c not in sub.columns:
                continue
            counts = sub[c].value_counts()
            row[f"top__{c}"] = None if counts.empty else counts.idxmax()
        cluster_feature_summary.append(row)
    feature_summary_path = os.path.join(OUTPUT_DIR, "cluster_feature_summary.csv")
    pd.DataFrame(cluster_feature_summary).to_csv(feature_summary_path, index=False)
    print("Saved cluster_feature_summary.csv")

# Save the dataframe together with its cluster labels and 2-D embedding.
out_df = df.copy()
out_df["pca_dim1"], out_df["pca_dim2"] = X_2d[:, 0], X_2d[:, 1]
out_df.to_csv(os.path.join(OUTPUT_DIR, "df_with_clusters_and_embeddings.csv"), index=False)
print("Saved df_with_clusters_and_embeddings.csv and many PNGs in", OUTPUT_DIR)

print("DONE. Check the folder", OUTPUT_DIR, "for outputs (CSVs + PNGs).")

Input file: Healthcare dataset.zip
Loading CSV: /tmp/healthcare_extract/healthcare_dataset.csv
Rows loaded: 55500
Sampled rows: 4000
Numeric cols: ['Age', 'Billing Amount', 'LOS_Days', 'Admission_Year', 'Admission_Month', 'Admission_DOW']
Categorical cols: ['Gender', 'Blood Type', 'Medical Condition', 'Hospital', 'Insurance Provider', 'Admission Type', 'Medication', 'Test Results']
Reduced cardinality for Hospital: now 51 uniques
Feature matrix shape (after encoding): (4000, 89)
PCA explained variance (cumulative first 5): [0.11510559 0.20632348 0.29667743 0.38537328 0.47255495]
Saved apriori outputs.
Saved cluster_summary_overview.csv
Explaining clusters using: kmeans_k4_cluster
Saved cluster_feature_summary.csv
Saved df_with_clusters_and_embeddings.csv and many PNGs in unsupervised_reports
DONE. Check the folder unsupervised_reports for outputs (CSVs + PNGs).
