In [None]:
# Feature subset used by the modeling cell below. `columns` and `selected_rfe`
# are two names for the same list object (presumably the output of an RFE
# feature-selection step, judging by the name -- confirm against the earlier
# part of the notebook).
selected_rfe = [
    'C8', 'C1', 'C5', 'C4', 'N7',
    'C9', 'C3', 'C10', 'C2', 'N10',
    'O7', 'E6', 'A10',
    'age_group_45+', 'age_group_18-30',
]
columns = selected_rfe

In [None]:
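# I want to build advanced classification models using these features. The best model to use would be a gradient-boosted tree classifier (LightGBM, falling back to scikit-learn's HistGradientBoostingClassifier when LightGBM is unavailable).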


In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# LightGBM is an optional dependency. The import itself is the only statement
# that can fail when the package is missing (or its native library cannot be
# loaded), so it has to live inside the try block for the fallback to work.
# On failure the name is bound to None so that later `ModelClass is
# LGBMClassifier` checks evaluate to False instead of raising NameError.
try:
    from lightgbm import LGBMClassifier
except (ImportError, OSError):
    LGBMClassifier = None

# Advanced multiclass modeling cell
# Chooses a robust tree-based multiclass model (LightGBM if available, otherwise sklearn's HistGradientBoosting)
# Reason: gradient-boosted trees handle heterogeneous tabular features well, are robust to scaling,
# provide feature importances, and perform strongly on multiclass problems with moderate tuning.


# Use LightGBM if installed, otherwise fall back to sklearn's HistGradientBoostingClassifier.
# `ModelClass` is the estimator class; `default_params` are the constructor
# keyword arguments shared by every candidate in the hyperparameter search.
if LGBMClassifier is not None:
    ModelClass = LGBMClassifier
    default_params = {
        "objective": "multiclass",
        "num_class": 3,  # matches the 3-level target (low / medium / high) built below
        "n_jobs": -1,
        "random_state": 42
    }
else:
    ModelClass = HistGradientBoostingClassifier
    default_params = {"random_state": 42}

# Build the 3-level target `grit_class` from the original 0-5 grit score.
#
# Reads : global `df` (must already be loaded by an earlier cell).
# Writes: rebinds `df` to a copy that (a) contains only rows with a known grit
#         score and (b) has a new integer column `grit_class` in {1, 2, 3}.
# Raises: RuntimeError if `df` is missing, if no grit column can be found, or
#         if the grit column holds values outside the mapping keys.

# Ensure dataset 'df' exists in the notebook environment
if 'df' not in globals():
    raise RuntimeError("DataFrame 'df' not found. Please load your dataset into a variable named `df` before running this cell.")

# Determine original grit scores (0-5) column: first candidate name present wins
possible_original_cols = ['grit_original', 'grit_score', 'grit_0_5', 'original_grit', 'grit']
original_col = next((c for c in possible_original_cols if c in df.columns), None)

if original_col is None:
    # If only a binary 'highgrit' exists, cannot reconstruct 0-5 without additional data/mapping
    if 'highgrit' in df.columns and set(df['highgrit'].dropna().unique()).issubset({0,1}):
        raise RuntimeError(
            "Only binary 'highgrit' found. To map to 3 classes you need the original 0-5 grit scores "
            "or a mapping rule. Please provide the original column name or a mapping."
        )
    else:
        raise RuntimeError(
            "Could not find original grit column. Expected one of "
            f"{possible_original_cols}. Please provide the original 0-5 grit column in `df`."
        )

# Map original 0-5 grit to 3 classes: 1=low, 2=medium, 3=high
# Default mapping (adjust thresholds if you prefer different bins):
# low: 0-1, medium: 2-3, high: 4-5
mapping = {0:1, 1:1, 2:2, 3:2, 4:3, 5:3}
if not set(df[original_col].dropna().unique()).issubset(set(mapping.keys())):
    raise RuntimeError(f"Values in {original_col} exceed expected 0-5 range. Inspect data before proceeding.")

# The range check above deliberately ignores missing scores, but a missing
# score maps to NaN and `astype(int)` cannot represent NaN. Rows without a
# score have no label to learn from, so drop them before building the target.
n_missing_target = int(df[original_col].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} row(s) with missing '{original_col}' (no target label).")

df = df.dropna(subset=[original_col]).copy()
df['grit_class'] = df[original_col].map(mapping).astype(int)

# Assemble the feature matrix `X` and target `y`, then split the feature
# columns into numeric and categorical groups for preprocessing.

# Use the selected features defined in earlier cell (variable `columns`)
if 'columns' not in globals():
    raise RuntimeError("Variable `columns` (selected features) not found. Ensure it is defined in a previous cell.")
X = df[columns].copy()
y = df['grit_class']

# Boolean columns (e.g. dummy indicators, which some pandas versions emit as
# bool) are not matched by `np.number`, so they would be routed to the
# categorical branch, where SimpleImputer rejects bool dtype. Represent them
# as 0/1 integers so they are handled as ordinary numeric features.
bool_cols = X.select_dtypes(include=["bool"]).columns.tolist()
if bool_cols:
    X = X.astype({col: "int8" for col in bool_cols})

# Column typing: anything with a numeric dtype is "numeric"; every other
# column is treated as categorical.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X.columns if c not in numeric_cols]


def _make_dense_one_hot_encoder():
    """Return a OneHotEncoder that ignores unseen categories and emits dense output.

    The dense-output keyword was renamed from `sparse` to `sparse_output`
    across scikit-learn releases; try the newer spelling first and fall back
    to the older one so the cell runs on either side of the rename.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older scikit-learn: constructor does not know `sparse_output`.
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


# Preprocessing, built only for the column groups that are actually present:
#   numeric     -> median imputation, then standardisation
#   categorical -> most-frequent imputation, then one-hot encoding
# The "num" entry is appended before "cat", so transformed numeric features
# come first in the output matrix.
preprocessors = []
if numeric_cols:
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    preprocessors.append(("num", numeric_pipeline, numeric_cols))
if categorical_cols:
    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", _make_dense_one_hot_encoder())
    ])
    preprocessors.append(("cat", cat_pipeline, categorical_cols))

# Columns not listed in either group are dropped.
col_transformer = ColumnTransformer(preprocessors, remainder="drop")

# "Balanced" per-class weights derived from the label frequencies in `y`,
# kept both as an array (`class_weights`) and as a {label: weight} dict.
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

# Estimator keyword arguments: start from the shared defaults and add the
# class-weight dict only on the LightGBM path; the fallback model is built
# from the defaults alone.
model_kwargs = dict(default_params)
if ModelClass is LGBMClassifier:
    model_kwargs.update(class_weight=class_weight_dict)

# Full model: preprocessing ("pre") followed by the classifier ("clf").
# The step names are relied on by the `clf__*` search-space keys and by the
# later `named_steps[...]` lookups.
pipeline = Pipeline(
    steps=[
        ("pre", col_transformer),
        ("clf", ModelClass(**model_kwargs)),
    ]
)

# Quick hyperparameter search space (RandomizedSearchCV).
# Each model gets its own explicit grid because the two estimators do not
# share parameter names: LightGBM uses n_estimators / num_leaves, while
# HistGradientBoostingClassifier uses max_iter / max_leaf_nodes, and neither
# accepts `None` as a learning rate.
if ModelClass is LGBMClassifier:
    param_dist = {
        "clf__n_estimators": [100, 300, 800],
        "clf__learning_rate": [0.01, 0.05, 0.1],
        "clf__max_depth": [3, 6, 12, -1],  # -1 = no depth limit in LightGBM
        "clf__num_leaves": [31, 63, 127],
    }
else:
    param_dist = {
        "clf__max_iter": [100, 300, 800],  # number of boosting iterations
        "clf__learning_rate": [0.01, 0.05, 0.1],
        "clf__max_depth": [3, 6, None],  # None = no depth limit
        "clf__max_leaf_nodes": [31, 63, 127],
    }

# Keep only keys the pipeline really exposes, so an estimator/version mismatch
# surfaces as a smaller search space rather than a failure inside the search.
valid_param_names = set(pipeline.get_params())
clean_param_dist = {name: values for name, values in param_dist.items() if name in valid_param_names}

# Stratified 5-fold CV with a fixed seed; macro-F1 weights all classes equally.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(pipeline, clean_param_dist, n_iter=12, cv=cv, scoring="f1_macro", n_jobs=-1, random_state=42, verbose=1)

# Fit search
search.fit(X, y)

# Out-of-fold predictions from the tuned pipeline, followed by a text summary.
# NOTE(review): the same `cv` splitter was used to pick the hyperparameters,
# so these scores are likely somewhat optimistic; a held-out set or nested CV
# would give a cleaner estimate.
y_pred = cross_val_predict(
    search.best_estimator_,
    X,
    y,
    cv=cv,
    method="predict",
    n_jobs=-1,
)
print("Best params:", search.best_params_)
print(
    "\nClassification report (cross-validated):",
    classification_report(y, y_pred, digits=4),
    "Confusion matrix:",
    confusion_matrix(y, y_pred),
    sep="\n",
)

# Feature importance (if available).
# Prints the 30 largest `feature_importances_` values of the tuned classifier,
# labelled with the post-preprocessing feature names. Estimators that do not
# expose `feature_importances_` are skipped. This step is best-effort: a
# failure is reported but does not abort the cell.
best_clf = search.best_estimator_.named_steps['clf']
importances = None
if hasattr(best_clf, "feature_importances_"):
    try:
        fitted_pre = search.best_estimator_.named_steps['pre']

        # Transformed columns appear in transformer order: numeric first,
        # then the one-hot expansion of the categorical columns.
        feature_names = list(numeric_cols)
        if categorical_cols:
            # Look the categorical pipeline up by name rather than by
            # position: its index changes when there are no numeric columns.
            ohe = fitted_pre.named_transformers_['cat'].named_steps['ohe']
            feature_names.extend(ohe.get_feature_names_out(categorical_cols).tolist())

        importances = best_clf.feature_importances_
        feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(30)
        print("\nTop feature importances:")
        print(feat_imp)
    except Exception as exc:
        # Surface the problem instead of silently printing nothing.
        print(f"\nCould not compute feature importances: {exc!r}")

# End of cell.
# NOTES / ACTIONS NEEDED FROM YOU:
# - Confirm the mapping thresholds (0-1 -> low, 2-3 -> medium, 4-5 -> high) or provide your preferred bins.
# - If the original 0-5 grit column has a different name, tell me the column name or supply it in `df`.
# - Provide any additional preprocessing choices (e.g., handling specific categorical encodings).