In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel, not whichever `pip` happens to be first on the shell PATH.
%pip install imbalanced-learn


In [53]:
import pandas as pd

# --- Sampling configuration: edit here rather than in the statements below ---
NEG_POOL_CSV = 'uncleaned_ml_neg_1.csv'        # full pool of candidate negative rows
NEG_SAMPLE_CSV = 'uncleaned_ml_neg_1_1st.csv'  # down-sampled negatives written by this cell
N_NEG_SAMPLE = 241   # rows to draw; presumably chosen to match the positives file -- confirm
SAMPLE_SEED = 42     # fixed seed so the same rows are drawn on every run

# Read the source CSV file; its first column is used as the row index
df = pd.read_csv(NEG_POOL_CSV, index_col=0)

# Draw the rows without replacement (pandas raises ValueError when the pool
# holds fewer than N_NEG_SAMPLE rows) and renumber them 0..n-1
random_sample = df.sample(n=N_NEG_SAMPLE, random_state=SAMPLE_SEED).reset_index(drop=True)

# Save the random sample to a new CSV file, without the renumbered index
random_sample.to_csv(NEG_SAMPLE_CSV, index=False)

print(f"Successfully copied {len(random_sample)} random rows to {NEG_SAMPLE_CSV}")
print(f"Original dataset had {len(df)} rows")

Successfully copied 241 random rows to uncleaned_ml_neg_1_1st.csv
Original dataset had 2411 rows


In [52]:
# Preview the sampled negatives: a bare last expression is rendered with the
# DataFrame's rich display instead of plain print() text.
random_sample

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF,lake_area_filled_from_db
2010,95.887,29.294,2019,20.361991,3517,other,0,32.199462,0.592680,0,0,1999.391814,4702.0,0.360256,3.022965,0,False
2083,96.193,29.030,2019,75.760025,4235,other,0,87.304046,0.000000,1,3,0.000000,4646.0,6.374962,9.279712,0,False
2398,96.938,27.712,2019,19.319626,3751,other,0,,,0,0,,,1.844508,3.673105,0,False
2302,97.130,28.846,2019,38.855133,4446,other,0,7.365499,0.044741,0,0,2235.084628,4568.0,0.408642,1.682899,0,False
792,90.436,29.484,2019,15.713443,5332,other,0,3.210925,0.090760,0,0,4991.166149,5779.0,0.797383,1.767181,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1221,90.328,28.005,2019,34.884857,5376,moraine,0,139.599206,0.000000,1,1,0.000000,5526.0,1.141684,0.253706,0,False
1624,92.783,29.656,2019,34.332590,5123,other,0,19.409695,0.660315,0,0,729.954619,5608.0,0.436968,-0.187271,0,False
342,81.399,30.302,2019,169.589590,5580,moraine,0,3179.339043,0.000000,1,19,0.000000,6337.0,0.542784,10.499812,0,False
1170,90.843,28.018,2019,20.214728,4957,moraine,0,2420.372785,0.000000,1,3,0.000000,5188.0,0.792718,1.204929,0,False


Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF,lake_area_filled_from_db
0,95.887,29.294,2019,20.361991,3517,other,0,32.199462,0.592680,0,0,1999.391814,4702.0,0.360256,3.022965,0,False
1,96.193,29.030,2019,75.760025,4235,other,0,87.304046,0.000000,1,3,0.000000,4646.0,6.374962,9.279712,0,False
2,96.938,27.712,2019,19.319626,3751,other,0,,,0,0,,,1.844508,3.673105,0,False
3,97.130,28.846,2019,38.855133,4446,other,0,7.365499,0.044741,0,0,2235.084628,4568.0,0.408642,1.682899,0,False
4,90.436,29.484,2019,15.713443,5332,other,0,3.210925,0.090760,0,0,4991.166149,5779.0,0.797383,1.767181,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,90.328,28.005,2019,34.884857,5376,moraine,0,139.599206,0.000000,1,1,0.000000,5526.0,1.141684,0.253706,0,False
237,92.783,29.656,2019,34.332590,5123,other,0,19.409695,0.660315,0,0,729.954619,5608.0,0.436968,-0.187271,0,False
238,81.399,30.302,2019,169.589590,5580,moraine,0,3179.339043,0.000000,1,19,0.000000,6337.0,0.542784,10.499812,0,False
239,90.843,28.018,2019,20.214728,4957,moraine,0,2420.372785,0.000000,1,3,0.000000,5188.0,0.792718,1.204929,0,False


In [56]:
# === GLOF ML: combine CSVs, preprocess, train LR / RF / XGBoost, report metrics ===
# Expects these files in the SAME FOLDER as this notebook:
#   uncleaned_ml_pos_1.csv        (positives, 241 rows)
#   uncleaned_ml_neg_1_1st.csv    (negatives, 241 rows)
#
# Outputs (same folder):
#   uncleaned_ml_1st.csv
#   preds_LogisticRegression.csv
#   preds_RandomForest.csv
#   preds_XGBoost.csv   (if xgboost installed)
#   model_metrics.csv

from pathlib import Path
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)

# Optional dependency: XGBoost. Any exception raised while importing it
# disables that model instead of aborting the cell.
try:
    from xgboost import XGBClassifier
except Exception:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

# NOTE: this silences every warning raised after this point, not only the
# ones coming from the modelling code below.
warnings.filterwarnings("ignore")

# Inputs are read from, and outputs written to, the current working directory.
HERE = Path(".").resolve()
POS_PATH = HERE.joinpath("uncleaned_ml_pos_1.csv")
NEG_PATH = HERE.joinpath("uncleaned_ml_neg_1_1st.csv")
COMBINED_OUT = HERE.joinpath("uncleaned_ml_1st.csv")

def _read_labelled_source(path: Path, default_label: int, source_tag: str) -> pd.DataFrame:
    """Read one source CSV and attach a binary ``GLOF`` label and a ``__source__`` tag."""
    frame = pd.read_csv(path)

    # Drop accidental index cols like "Unnamed: 0". The .copy() makes the
    # column assignments below write to an independent frame rather than to a
    # slice of the frame returned by read_csv.
    frame = frame.loc[:, ~frame.columns.str.contains(r"^Unnamed")].copy()

    # Create/normalize target column
    if "GLOF" not in frame.columns:
        frame["GLOF"] = default_label
    else:
        raw = frame["GLOF"]
        labels = (raw > 0).astype(int)
        if raw.isna().any():
            # `NaN > 0` evaluates to False, which would silently relabel rows
            # with an unknown outcome as class 0. Keep them missing so they
            # show up in the class counts and can be dropped before training.
            labels = labels.astype(float).mask(raw.isna())
        frame["GLOF"] = labels

    # Debug source (removed from features later)
    frame["__source__"] = source_tag
    return frame


def load_and_prepare(pos_path: Path, neg_path: Path) -> pd.DataFrame:
    """Combine the positive and negative CSVs into one labelled DataFrame.

    Each file is read, columns whose name starts with ``Unnamed`` are dropped,
    and a ``GLOF`` target is ensured: when the column is absent it is filled
    with 1 for the positives file and 0 for the negatives file; when present,
    values ``> 0`` become 1, other non-missing values become 0, and missing
    values stay missing. A ``__source__`` column ("pos" / "neg") records which
    file each row came from. Both frames are aligned on the sorted union of
    their columns (columns absent from one file are filled with NaN) and
    stacked with positives first.

    Parameters
    ----------
    pos_path : Path
        CSV holding the positive examples.
    neg_path : Path
        CSV holding the negative examples.

    Returns
    -------
    pd.DataFrame
        The combined frame with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If either input file does not exist; the message lists the missing
        file(s) only.

    Side effects
    ------------
    Writes the combined frame to ``COMBINED_OUT`` and prints its shape and the
    class counts (including missing labels).
    """
    missing = [p for p in (pos_path, neg_path) if not p.exists()]
    if missing:
        listing = "\n".join(f"- {p}" for p in missing)
        raise FileNotFoundError(f"Missing CSV(s):\n{listing}")

    pos = _read_labelled_source(pos_path, default_label=1, source_tag="pos")
    neg = _read_labelled_source(neg_path, default_label=0, source_tag="neg")

    # Column-union alignment (handles schema mismatches)
    all_cols = sorted(set(pos.columns) | set(neg.columns))
    df = pd.concat(
        [pos.reindex(columns=all_cols), neg.reindex(columns=all_cols)],
        ignore_index=True,
    )

    df.to_csv(COMBINED_OUT, index=False)
    print(f"✅ Combined saved → {COMBINED_OUT}  shape={df.shape}")
    print("Class counts:\n", df["GLOF"].value_counts(dropna=False))
    return df

def build_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer for the feature columns of ``X``.

    The ``__source__`` column, if present, is excluded. Columns are routed by
    dtype as observed in ``X``:

    * numeric columns: median imputation (with missing-value indicator
      columns) followed by ``RobustScaler``;
    * object / category / bool columns: most-frequent imputation followed by
      one-hot encoding with the first level dropped and unknown levels ignored.

    Columns of any other dtype are dropped by the transformer.
    """
    feature_names = [name for name in X.columns if name != "__source__"]
    features = X[feature_names]

    numeric_features = list(features.select_dtypes(include=["number"]).columns)
    categorical_features = list(
        features.select_dtypes(include=["object", "category", "bool"]).columns
    )

    # Numeric branch: impute with the median, flag what was missing, then scale.
    numeric_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scale", RobustScaler()),
    ])

    # Categorical branch: impute with the mode, then one-hot encode.
    categorical_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ])

    branches = [
        ("num", numeric_branch, numeric_features),
        ("cat", categorical_branch, categorical_features),
    ]
    return ColumnTransformer(
        transformers=branches,
        remainder="drop",
        verbose_feature_names_out=False,
    )

def train_and_eval(df: pd.DataFrame, target="GLOF", test_size=0.2, seed=42):
    """Fit every available classifier on a stratified split and report test metrics.

    Rows whose ``target`` is missing are dropped. Every remaining column other
    than ``target`` and ``__source__`` is used as a feature.
    NOTE(review): confirm that none of those columns encodes the label or the
    file a row came from, otherwise the scores are inflated by leakage.

    Parameters
    ----------
    df : pd.DataFrame
        Combined data set containing the ``target`` column.
    target : str
        Name of the label column; values are cast to int and treated as
        0 = negative, 1 = positive.
    test_size : float
        Fraction of rows held out for evaluation.
    seed : int
        Random state for the split and for every model.

    Returns
    -------
    pd.DataFrame
        One row per model with columns ``model``, ``precision``, ``recall``,
        ``f1``, ``roc_auc`` and ``test_n``, sorted by recall, then f1, then
        precision (all descending).

    Side effects
    ------------
    Prints a classification report and confusion matrix per model, and writes
    ``preds_<model>.csv`` and ``model_metrics.csv`` into ``HERE``.
    """
    df = df.dropna(subset=[target]).copy()
    X = df.drop(columns=[target, "__source__"], errors="ignore")
    y = df[target].astype(int)

    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=seed
    )

    def _as_pipeline(clf):
        # Pipeline does not clone its steps, so every model gets its own
        # unfitted preprocessor; fitting one pipeline can then never refit
        # the transformer that another pipeline relies on.
        return Pipeline([("pre", build_preprocessor(Xtr)), ("clf", clf)])

    models = {}

    # Logistic Regression (class_weight to be safe)
    models["LogisticRegression"] = _as_pipeline(LogisticRegression(
        solver="liblinear", penalty="l2",
        class_weight="balanced",
        max_iter=2000, random_state=seed
    ))

    # Random Forest
    models["RandomForest"] = _as_pipeline(RandomForestClassifier(
        n_estimators=500,
        class_weight="balanced_subsample",
        random_state=seed, n_jobs=-1
    ))

    # XGBoost (if available)
    if XGB_AVAILABLE:
        n_pos = int((ytr == 1).sum())
        n_neg = int((ytr == 0).sum())
        # Up-weight positives by the negative/positive ratio, never below 1.
        spw = max(1.0, n_neg / max(1, n_pos))
        models["XGBoost"] = _as_pipeline(XGBClassifier(
            n_estimators=500, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            eval_metric="logloss", random_state=seed, n_jobs=-1,
            scale_pos_weight=spw
        ))
    else:
        print("ℹ️ xgboost not installed; skipping.")

    rows = []
    for name, pipe in models.items():
        print(f"\n▶️ Training {name} ...")
        pipe.fit(Xtr, ytr)
        yhat = pipe.predict(Xte)

        # Positive-class probabilities, when the model exposes them. IndexError
        # covers a model that only saw one class and returns a single column.
        try:
            yproba = pipe.predict_proba(Xte)[:, 1]
        except (AttributeError, IndexError):
            yproba = None

        # ROC-AUC is handled separately so that an undefined score (e.g. only
        # one class in the test split) does not discard the probabilities.
        roc = np.nan
        if yproba is not None:
            try:
                roc = roc_auc_score(yte, yproba)
            except ValueError as exc:
                print(f"{name} — ROC-AUC not computed: {exc}")

        prec = precision_score(yte, yhat, zero_division=0)
        rec  = recall_score(yte, yhat, zero_division=0)
        f1   = f1_score(yte, yhat, zero_division=0)
        # Fixing the label order guarantees a 2x2 matrix, so the unpacking
        # below works even when a class is absent from yte and yhat.
        cm   = confusion_matrix(yte, yhat, labels=[0, 1])

        print(f"{name} — classification report:\n{classification_report(yte, yhat, digits=3, zero_division=0)}")
        tn, fp, fn, tp = cm.ravel()
        print(f"{name} — confusion matrix (rows=true, cols=pred):\n[[TN={tn}, FP={fp}],\n [FN={fn}, TP={tp}]]")

        # Save predictions
        out = pd.DataFrame({"y_true": yte.values, "y_pred": yhat})
        if yproba is not None:
            out["y_proba"] = yproba
        out_path = HERE / f"preds_{name}.csv"
        out.to_csv(out_path, index=False)
        print(f"💾 Saved predictions → {out_path}")

        rows.append({"model": name, "precision": prec, "recall": rec, "f1": f1, "roc_auc": roc, "test_n": len(yte)})

    metrics = pd.DataFrame(rows).sort_values(["recall","f1","precision"], ascending=False).reset_index(drop=True)
    metrics_path = HERE / "model_metrics.csv"
    metrics.to_csv(metrics_path, index=False)
    print(f"\n✅ Metrics saved → {metrics_path}\n{metrics}")
    return metrics

# Run: merge the two CSVs, then fit and score every available model.
df = load_and_prepare(pos_path=POS_PATH, neg_path=NEG_PATH)
# Assigning to `_` keeps the returned metrics frame from being echoed as the
# cell output; train_and_eval already prints and saves it.
_ = train_and_eval(df, target="GLOF", test_size=0.2, seed=42)


✅ Combined saved → /Users/nimajsherpa/JupyterNotebook_GLOF/uncleaned_ml_1st.csv  shape=(482, 18)
Class counts:
 GLOF
1    241
0    241
Name: count, dtype: int64
ℹ️ xgboost not installed; skipping.

▶️ Training LogisticRegression ...
LogisticRegression — classification report:
              precision    recall  f1-score   support

           0      0.891     1.000     0.942        49
           1      1.000     0.875     0.933        48

    accuracy                          0.938        97
   macro avg      0.945     0.938     0.938        97
weighted avg      0.945     0.938     0.938        97

LogisticRegression — confusion matrix (rows=true, cols=pred):
[[TN=49, FP=0],
 [FN=6, TP=42]]
💾 Saved predictions → /Users/nimajsherpa/JupyterNotebook_GLOF/preds_LogisticRegression.csv

▶️ Training RandomForest ...
RandomForest — classification report:
              precision    recall  f1-score   support

           0      0.942     1.000     0.970        49
           1      1.000     0.938 