# In [None]:
import sys
import warnings
import joblib
import numpy as np
import pandas as pd

from typing import Dict, Tuple
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# --- Classification models
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid

# --- Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, BayesianRidge, LassoLars, PassiveAggressiveRegressor, SGDRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge

# Silence every warning category for the rest of the session so the
# per-model output lines stay readable.
warnings.filterwarnings("ignore")

# ---- Optional external libs (safe fallbacks if missing)
# Each back-end is probed on its own: any failure while importing it simply
# switches the matching availability flag off instead of aborting the script.
try:
    from xgboost import XGBClassifier, XGBRegressor
except Exception:
    have_xgb = False
else:
    have_xgb = True

try:
    from lightgbm import LGBMClassifier, LGBMRegressor
except Exception:
    have_lgbm = False
else:
    have_lgbm = True

try:
    from catboost import CatBoostClassifier, CatBoostRegressor
except Exception:
    have_cat = False
else:
    have_cat = True


# ==========================
# Step 1: Load dataset
# ==========================
def ask_path_and_target() -> Tuple[pd.DataFrame, str]:
    """Interactively load a CSV file and pick its target column.

    The user is prompted on stdin for the file path (surrounding quotes are
    removed) and then for the target column name. A blank answer selects the
    last column.

    Column names are stripped of surrounding whitespace right after loading,
    so the returned target name keeps matching the frame even after later
    cleaning steps normalise the headers in the same way.

    Returns:
        A ``(df, target)`` tuple: the loaded frame and the target column name.

    Raises:
        ValueError: if the requested target is not a column of the dataset.
    """
    print(">>> Enter the path to your CSV dataset (e.g., C:/path/file.csv):")
    file_path = input().strip().strip('"').strip("'")
    df = pd.read_csv(file_path)

    # Normalise headers up front: "a, b" style CSVs otherwise yield " b",
    # which no longer matches once the column names are stripped later on.
    df.columns = [str(c).strip() for c in df.columns]

    print("\n‚úÖ Loaded dataset:", file_path)
    print("Shape:", df.shape)
    print("Columns:", list(df.columns))

    print("\n>>> Enter target (label) column name (leave blank to use LAST column):")
    tgt = input().strip()
    if not tgt:
        tgt = df.columns[-1]
        print(f"Using last column as target: '{tgt}'")
    elif tgt not in df.columns:
        # Fail here with a readable message instead of a bare KeyError later.
        raise ValueError(
            f"Target column '{tgt}' not found. Available columns: {list(df.columns)}"
        )

    return df, tgt


# ==========================
# Step 2: Clean & split
# ==========================
def basic_clean(df: pd.DataFrame, target: str, task_hint: str) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Cleaning steps:
      * column names are converted to ``str`` and stripped of whitespace;
      * ``+inf`` / ``-inf`` are replaced by NaN everywhere;
      * missing target values are filled with the most frequent value when
        ``task_hint == "classification"``, otherwise with the mean of the
        target cast to float. Feature NaNs are left for the pipeline imputers.

    Args:
        df: Raw dataset.
        target: Target column name; surrounding whitespace is ignored so it
            matches the stripped column names.
        task_hint: ``"classification"`` selects mode imputation; any other
            value selects mean imputation.

    Returns:
        The cleaned copy. If the target column is absent the frame is
        returned without target imputation.

    Raises:
        ValueError: if the target column contains no usable value at all.
    """
    df = df.copy()
    df.columns = [str(c).strip() for c in df.columns]
    df = df.replace([np.inf, -np.inf], np.nan)

    # Column names were just stripped, so the lookup key must be stripped too;
    # otherwise a target such as " y " would silently skip imputation.
    target = str(target).strip()
    if target not in df.columns:
        return df

    # Fill target NaNs (features are imputed in pipeline)
    missing = int(df[target].isna().sum())
    if missing == 0:
        return df
    if missing == len(df):
        # No observed value: mode() would be empty and mean() would be NaN.
        raise ValueError(f"Target column '{target}' has no non-missing values.")

    if task_hint == "classification":
        fill_val = df[target].mode(dropna=True).iloc[0]
    else:
        fill_val = df[target].astype(float).mean()
    df[target] = df[target].fillna(fill_val)
    print(f"‚ö†Ô∏è Filled {missing} missing target values with {fill_val}.")
    return df


def split_X_y(df: pd.DataFrame, target: str):
    """Separate *df* into a feature frame and the target series.

    Returns:
        ``(X, y)`` where ``X`` is *df* without the target column and ``y`` is
        the target column itself.

    Raises:
        ValueError: if *target* is not one of the columns of *df*.
    """
    if target in df.columns:
        features = df.drop(columns=[target])
        labels = df[target]
        return features, labels
    raise ValueError(f"Target column '{target}' not found.")


# ==========================
# Step 3: Task detection (FIXED)
# ==========================
def _is_integer_like_series(y: pd.Series, tol=1e-9) -> bool:
    """Tell whether *y* holds integer-valued data.

    Args:
        y: Series to inspect.
        tol: Largest allowed absolute distance between a float value and its
            nearest integer.

    Returns:
        ``True`` for integer dtypes, and for float dtypes whose non-null
        values all lie within *tol* of an integer. ``False`` for every other
        dtype and for float series with no non-null value. The result is
        always a plain Python ``bool``.
    """
    if pd.api.types.is_integer_dtype(y):
        return True
    if not pd.api.types.is_float_dtype(y):
        return False

    values = y.dropna().to_numpy(dtype=float)
    if values.size == 0:
        return False
    # np.all yields numpy.bool_; convert so the declared return type holds.
    return bool(np.all(np.abs(values - np.round(values)) <= tol))


def problem_type(y: pd.Series) -> str:
    """Guess whether target *y* describes a classification or regression task.

    Rules, applied in order:
      1. object, string, categorical and boolean targets -> classification;
      2. integer-like targets with few distinct values (at most 20, or at
         most 5% of the rows) -> classification;
      3. everything else -> regression.

    Returns:
        ``"classification"`` or ``"regression"``.
    """
    dtypes = pd.api.types

    # Explicit label-like dtypes -> classification. Booleans and pandas'
    # dedicated string dtype are included: neither compares equal to "object".
    if (
        dtypes.is_object_dtype(y)
        or isinstance(y.dtype, pd.CategoricalDtype)
        or dtypes.is_string_dtype(y)
        or dtypes.is_bool_dtype(y)
    ):
        return "classification"

    # Count unique non-null values
    unique = int(y.nunique(dropna=True))
    n = int(len(y))

    # Integers with small cardinality -> classification (labels like 0/1/2...)
    if _is_integer_like_series(y) and (unique <= 20 or unique <= max(2, int(0.05 * n))):
        return "classification"

    # Otherwise -> regression (non-integer floats stay regression even when
    # they have only a few distinct values)
    return "regression"


# ==========================
# Step 4: Preprocessor
# ==========================
def make_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Build the (unfitted) feature preprocessor for *X*.

    Columns of dtype ``object`` / ``category`` are treated as categorical:
    most-frequent imputation followed by dense one-hot encoding that ignores
    categories unseen during fit. Every other column is treated as numeric:
    median imputation followed by standard scaling.

    Returns:
        A ``ColumnTransformer`` combining both sub-pipelines. Columns not
        listed are dropped and output feature names carry no prefix.
    """
    cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    cat_lookup = set(cat_cols)
    num_cols = [c for c in X.columns if c not in cat_lookup]

    # OneHotEncoder(sparse_output=False) requires sklearn >= 1.2; older
    # releases spell the same option `sparse`. An unknown keyword raises
    # TypeError at construction time, so fall back automatically.
    try:
        onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)

    cat_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", onehot),
    ])
    num_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True)),
    ])

    return ColumnTransformer(
        transformers=[
            ("num", num_tf, num_cols),
            ("cat", cat_tf, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )


# ==========================
# Step 5: Model Zoos
# ==========================
def classification_models() -> Dict[str, object]:
    """Return fresh, unfitted classifiers keyed by display name.

    Every estimator that accepts a seed is created with ``random_state=42``
    so repeated runs, and the final refit of the winning model, reproduce the
    cross-validated behaviour. XGBoost, LightGBM and CatBoost entries are
    added only when the corresponding ``have_*`` flag is true.
    """
    models = {
        "LogisticRegression": LogisticRegression(max_iter=5000),
        # Seeded like the other stochastic estimators for reproducibility.
        "LinearSVC": LinearSVC(random_state=42),
        "SVC_RBF": SVC(kernel="rbf"),
        "SVC_Poly": SVC(kernel="poly", degree=3),
        "KNN_Classifier": KNeighborsClassifier(),
        "GaussianNB": GaussianNB(),
        "MultinomialNB": MultinomialNB(),
        "BernoulliNB": BernoulliNB(),
        "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
        "RandomForestClassifier": RandomForestClassifier(n_estimators=300, random_state=42),
        "ExtraTreesClassifier": ExtraTreesClassifier(n_estimators=300, random_state=42),
        "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
        "AdaBoostClassifier": AdaBoostClassifier(random_state=42),
        "BaggingClassifier": BaggingClassifier(random_state=42),
        "LDA": LinearDiscriminantAnalysis(),
        "QDA": QuadraticDiscriminantAnalysis(),
        # Weight initialisation is random; seed it so scores are repeatable.
        "MLPClassifier": MLPClassifier(
            hidden_layer_sizes=(128, 64), max_iter=300, random_state=42
        ),
        "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=42),
        "Perceptron": Perceptron(random_state=42),
        "RidgeClassifier": RidgeClassifier(),
        "SGDClassifier": SGDClassifier(random_state=42),
        "NearestCentroid": NearestCentroid(),
    }
    if have_xgb:
        models["XGBClassifier"] = XGBClassifier(
            n_estimators=300, max_depth=6, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9,
            random_state=42, n_jobs=-1, eval_metric="mlogloss", tree_method="hist"
        )
    if have_lgbm:
        models["LGBMClassifier"] = LGBMClassifier(
            n_estimators=500, learning_rate=0.05, random_state=42
        )
    if have_cat:
        models["CatBoostClassifier"] = CatBoostClassifier(
            iterations=500, depth=6, learning_rate=0.05, random_state=42, verbose=False
        )
    return models


def regression_models() -> Dict[str, object]:
    """Return fresh, unfitted regressors keyed by display name.

    Every estimator that accepts a seed is created with ``random_state=42``
    so repeated runs, and the final refit of the winning model, reproduce the
    cross-validated behaviour. XGBoost, LightGBM and CatBoost entries are
    added only when the corresponding ``have_*`` flag is true.
    """
    models = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "Lasso": Lasso(alpha=0.001, max_iter=10000),
        "ElasticNet": ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=10000),
        "KernelRidge": KernelRidge(kernel="rbf"),
        "SVR_RBF": SVR(kernel="rbf"),
        # Seeded like the other stochastic estimators for reproducibility.
        "LinearSVR": LinearSVR(random_state=42),
        "KNN_Regressor": KNeighborsRegressor(),
        "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
        "RandomForestRegressor": RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
        "ExtraTreesRegressor": ExtraTreesRegressor(n_estimators=400, random_state=42, n_jobs=-1),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
        "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
        "HuberRegressor": HuberRegressor(),
        "BayesianRidge": BayesianRidge(),
        "LassoLars": LassoLars(alpha=0.001),
        "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=42),
        "SGDRegressor": SGDRegressor(random_state=42, max_iter=2000),
        "RANSACRegressor": RANSACRegressor(random_state=42),
        "TheilSenRegressor": TheilSenRegressor(random_state=42),
        # Weight initialisation is random; seed it so scores are repeatable.
        "MLPRegressor": MLPRegressor(
            hidden_layer_sizes=(128, 64), max_iter=600, random_state=42
        ),
    }
    if have_xgb:
        models["XGBRegressor"] = XGBRegressor(
            n_estimators=400, max_depth=6, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9,
            random_state=42, n_jobs=-1, tree_method="hist"
        )
    if have_lgbm:
        models["LGBMRegressor"] = LGBMRegressor(
            n_estimators=600, learning_rate=0.05, random_state=42
        )
    if have_cat:
        models["CatBoostRegressor"] = CatBoostRegressor(
            iterations=600, depth=6, learning_rate=0.05, random_state=42, verbose=False
        )
    return models


# ==========================
# Step 6: Evaluation
# ==========================
def safe_cv(y: pd.Series, task: str):
    """Pick a shuffled, seeded cross-validation splitter that fits the data.

    Args:
        y: Target values.
        task: ``"classification"`` selects stratified folds; any other value
            selects plain K-fold.

    Returns:
        For classification, a ``StratifiedKFold`` whose fold count is the
        size of the rarest class clamped to the range 2..5. Otherwise a
        ``KFold`` whose fold count is the number of rows clamped to 2..5.

    Raises:
        ValueError: for classification when *y* has no non-missing value.
    """
    if task == "classification":
        class_sizes = pd.Series(y).value_counts()
        if class_sizes.empty:
            # min() of an empty count table is NaN and cannot become an int.
            raise ValueError("Cannot build CV folds: target has no non-missing values.")
        n_splits = max(2, min(5, int(class_sizes.min())))
        return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # A fixed 5 folds fails on datasets with fewer than 5 rows; shrink to fit.
    n_splits = max(2, min(5, len(y)))
    return KFold(n_splits=n_splits, shuffle=True, random_state=42)


def evaluate_models(X: pd.DataFrame, y: pd.Series, task: str, debug: bool = False):
    """Cross-validate every model of the zoo selected by *task*.

    Each model is wrapped in a pipeline together with the preprocessor and
    scored with ``f1_macro`` (classification) or ``r2`` (anything else). One
    progress line is printed per model.

    Args:
        X: Feature frame.
        y: Target values.
        task: ``"classification"`` or ``"regression"``.
        debug: When true, fold errors are raised by the scorer instead of
            being converted to NaN; the exception is still caught per model
            and recorded as a failure.

    Returns:
        ``(results, failures)``. ``results`` is a list of ``(name, mean)``
        tuples sorted by score, best first. ``failures`` is a list of
        ``(name, reason)`` tuples for models that were skipped, raised, or
        produced no valid fold score.
    """
    pre = make_preprocessor(X)
    cv = safe_cv(y, task)

    is_classification = task == "classification"
    scoring = "f1_macro" if is_classification else "r2"
    models = classification_models() if is_classification else regression_models()
    results, failures = [], []

    # Detect negative features to safely skip MultinomialNB. This is a
    # best-effort probe on the first 100 rows: if it cannot run, assume
    # negatives are present.
    try:
        X_preview = pre.fit_transform(X.iloc[:100, :])
        has_negative = np.nanmin(X_preview) < 0
    except Exception:
        has_negative = True

    for name, model in models.items():
        if is_classification and name == "MultinomialNB" and has_negative:
            failures.append((name, "Skipped: requires non-negative features"))
            print(f"{name:28s} -> Skipped: requires non-negative features")
            continue

        pipe = Pipeline(steps=[("pre", pre), ("model", model)])

        try:
            scores = cross_val_score(
                pipe, X, y, cv=cv, scoring=scoring,
                n_jobs=None, error_score=("raise" if debug else np.nan)
            )
        except Exception as e:
            failures.append((name, str(e)))
            print(f"{name:28s} -> ERROR: {e}")
            continue

        scores = np.asarray(scores, dtype=float)
        if np.isnan(scores).all():
            # Every fold failed: a NaN mean is not a result. Keeping it would
            # break the ordering below and could crown a model with no score.
            reason = "All cross-validation folds produced NaN scores"
            failures.append((name, reason))
            print(f"{name:28s} -> ERROR: {reason}")
            continue

        mean_score = float(np.nanmean(scores))
        results.append((name, mean_score))
        print(f"{name:28s} -> {scoring}: {mean_score:.4f}")

    # Sort by score descending (all recorded scores are real numbers here)
    results.sort(key=lambda item: item[1], reverse=True)
    return results, failures


def save_results_to_csv(results, failures, task: str, filename="model_results.csv"):
    """Write the leaderboard, successes first and then failures, to a CSV file.

    Args:
        results: Iterable of ``(model_name, score)`` pairs.
        failures: Iterable of ``(model_name, reason)`` pairs.
        task: ``"classification"`` labels the metric as F1-macro; any other
            value uses the R-squared label.
        filename: Destination path of the CSV file.
    """
    if task == "classification":
        metric = "F1-macro"
    else:
        metric = "R¬≤"

    rows = []
    for model_name, score in results:
        rows.append(
            {"Model": model_name, "Score": score, "Metric": metric, "Status": "Success"}
        )
    for model_name, reason in failures:
        rows.append(
            {"Model": model_name, "Score": None, "Metric": metric, "Status": f"Failed: {reason}"}
        )

    leaderboard = pd.DataFrame(rows)
    leaderboard.to_csv(filename, index=False)
    print(f"\nüíæ Saved results to {filename}")


# ==========================
# Step 7: Main
# ==========================
def main():
    """Run the interactive end-to-end workflow.

    Steps: load the CSV and target name, detect the task type (with an
    optional manual override), evaluate every candidate model with
    cross-validation, print and save the leaderboard to ``model_results.csv``,
    then refit the best model on the full data and persist it to
    ``best_model.pkl``.

    Raises:
        ValueError: if the chosen target is not a column of the dataset.
        SystemExit: with status 1 when no model could be evaluated.
    """
    df, target = ask_path_and_target()

    # basic_clean() strips whitespace from column names. Apply the same
    # normalisation to the frame and the target name up front so both keep
    # matching through every later step, and fail early with a clear message.
    df.columns = [str(c).strip() for c in df.columns]
    target = str(target).strip()
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found.")

    # Initial guess from raw column
    raw_task = problem_type(df[target])

    # Clean with hint (so target imputation uses a sensible strategy)
    df = basic_clean(df, target, raw_task)

    # Final detection on cleaned target
    X, y = split_X_y(df, target)
    task = problem_type(y)

    # Optional manual override
    print(f"\nüîé Detected task: {task.upper()}")
    print("Press Enter to accept, or type 'classification' / 'regression' to override:")
    override = input().strip().lower()
    if override in {"classification", "regression"}:
        task = override
        print(f"‚û°Ô∏è  Overridden task: {task.upper()}")

    # Show quick target summary
    if task == "classification":
        vc = pd.Series(y).value_counts(dropna=False)
        print("\nClass distribution:")
        print(vc.to_string())
    else:
        # Coerce once; values that are not numeric become NaN and are ignored
        # by mean()/std().
        y_numeric = pd.to_numeric(y, errors='coerce')
        print(f"\nTarget summary (numeric): count={y.shape[0]}, "
              f"mean={y_numeric.mean():.4f}, "
              f"std={y_numeric.std():.4f}")

    # Evaluate
    results, failures = evaluate_models(X, y, task, debug=False)

    if not results:
        print("\n‚ùå No model succeeded!")
        if failures:
            print("Failures:")
            for n, m in failures:
                print(f"- {n}: {m}")
        sys.exit(1)

    best_name, best_score = results[0]
    metric = "F1-macro" if task == "classification" else "R¬≤"

    print("\n==============================")
    print("üèÜ BEST MODEL")
    print("==============================")
    print(f"Model : {best_name}")
    print(f"Score : {best_score:.4f} ({metric} via cross-validation)")

    print("\nüìä Top 5 models:")
    for i, (n, s) in enumerate(results[:5], 1):
        print(f"{i}. {n:28s} {s:.4f}")

    if failures:
        print("\n‚ÑπÔ∏è Models skipped/failed:")
        # Only the first 12 entries are shown on screen; the CSV has them all.
        for name, msg in failures[:12]:
            print(f"- {name}: {msg}")

    # Save leaderboard
    save_results_to_csv(results, failures, task)

    # Train best model on full data & persist. A fresh, unfitted estimator is
    # taken from a newly built zoo rather than reusing the evaluated instance.
    zoo = classification_models() if task == "classification" else regression_models()
    best_model = zoo[best_name]
    final_pipe = Pipeline(steps=[("pre", make_preprocessor(X)), ("model", best_model)])
    final_pipe.fit(X, y)
    joblib.dump(final_pipe, "best_model.pkl")
    print("üíæ Saved best model to best_model.pkl")


# Entry point: run the interactive workflow only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()