In [None]:
# ==========================================================
# IMPORTS
# ==========================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV

# ==========================================================
# CONFIG
# ==========================================================
# Label column, row-identifier column, and the seed used for reproducibility.
TARGET_COL, ID_COL = "quality_grade", "id"
RANDOM_STATE = 42

# ==========================================================
# LOAD DATA
# ==========================================================
# Read, in order: the training set, the test set, and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mle-ese-mock/train (5).csv",
        "/kaggle/input/mle-ese-mock/test (4).csv",
        "/kaggle/input/mle-ese-mock/submission (6).csv",
    )
)

# ==========================================================
# TARGET SAFETY (CRITICAL)
# ==========================================================
# Keep only the rows that actually carry a label, then renumber them 0..n-1.
train = train[train[TARGET_COL].notna()].reset_index(drop=True)

# ==========================================================
# SPLIT FEATURES & TARGET
# ==========================================================
# errors="ignore" makes the drop a no-op for an absent ID column, so no
# explicit membership test is required.
y = train[TARGET_COL]
X = train.drop(columns=[TARGET_COL, ID_COL], errors="ignore")

X_test = test.drop(columns=[ID_COL], errors="ignore")

# ==========================================================
# COLUMN TYPES
# ==========================================================
# ColumnTransformer drops every column it is not explicitly given (its
# default is remainder="drop"). Listing only int64/float64 would therefore
# silently discard numeric features stored as int32, float32, uint8, ...
# "number" selects every numeric dtype instead.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

# ==========================================================
# PREPROCESSING (LEAKAGE-SAFE)
# ==========================================================
# Numeric features: fill gaps with the median, then standardize.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: fill gaps with the mode, then one-hot encode.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(
        handle_unknown="ignore",  # unseen test categories encode as all zeros
        min_frequency=5,          # prevents rare-category overfit
        sparse_output=False
    ))
])

# Imputation/scaling/encoding statistics are learned inside whichever
# estimator pipeline this transformer is placed in, i.e. only from the data
# that pipeline is fitted on.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

# ==========================================================
# BASE MODELS (BIAS–VARIANCE BALANCED)
# ==========================================================

# Strong linear baseline (low variance).
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a
# multinomial (softmax) model for targets with three or more classes.
# NOTE: for a two-class target the default fits an ordinary binary logistic
# model instead of a two-column softmax.
log_reg = LogisticRegression(
    max_iter=3000,
    solver="lbfgs",
    C=1.0,                       # regularization
    class_weight="balanced",
    n_jobs=-1
)

# Non-linear model (low bias)
hgb = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.05,
    max_iter=300,
    l2_regularization=0.1,
    random_state=RANDOM_STATE
)

# ==========================================================
# CALIBRATION (IMPROVES LOG LOSS)
# ==========================================================
# Wrap the linear model so its probabilities are recalibrated with isotonic
# regression using 3 internal cross-validation folds.
log_reg_cal = CalibratedClassifierCV(
    estimator=log_reg,
    method="isotonic",
    cv=3
)

# ==========================================================
# PIPELINES
# ==========================================================
# Every candidate gets the same two steps: the shared preprocessor followed
# by its own estimator. Insertion order of the candidates is preserved.
pipelines = {
    label: Pipeline([("preprocessor", preprocessor), ("model", estimator)])
    for label, estimator in (
        ("LogisticRegression", log_reg_cal),
        ("HistGradientBoosting", hgb),
    )
}

# ==========================================================
# CROSS-VALIDATION (ROBUST)
# ==========================================================
# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Metrics reported for every candidate pipeline.
scoring = {"log_loss": "neg_log_loss", "accuracy": "accuracy"}


def _cv_summary(model_name, estimator):
    """Return ``[name, mean CV log loss, mean CV accuracy]`` for one pipeline."""
    fold_scores = cross_validate(
        estimator, X, y, cv=cv, scoring=scoring, n_jobs=-1
    )
    # The "neg_log_loss" scorer reports the negated loss; flip the sign back.
    mean_log_loss = -fold_scores["test_log_loss"].mean()
    mean_accuracy = fold_scores["test_accuracy"].mean()
    return [model_name, mean_log_loss, mean_accuracy]


results = [
    _cv_summary(model_name, estimator)
    for model_name, estimator in pipelines.items()
]

# Best (lowest) log loss ends up in the first row.
results_df = pd.DataFrame(
    results, columns=["Model", "CV Log Loss", "CV Accuracy"]
)
results_df = results_df.sort_values("CV Log Loss")

print("\nMODEL COMPARISON")
print(results_df)

# ==========================================================
# SELECT BEST MODEL
# ==========================================================
# results_df is sorted by ascending log loss, so the winner is the first row.
best_model_name = results_df["Model"].iloc[0]
best_pipeline = pipelines[best_model_name]

print(f"\nBest Model Selected: {best_model_name}")

# ==========================================================
# FINAL TRAINING
# ==========================================================
# Refit the chosen pipeline on the full labelled training set.
best_pipeline.fit(X, y)

# ==========================================================
# PROBABILITY PREDICTIONS
# ==========================================================
# Class-probability matrix for the test rows: one row per sample, one column
# per class.
test_proba = best_pipeline.predict_proba(X_test)

# ==========================================================
# SUBMISSION
# ==========================================================
# One probability column per class, in the order the fitted model reports
# them. Labels are cast to str because column names parsed from a CSV header
# are strings: numeric class labels would otherwise never match the template
# columns and the selection below would raise KeyError.
class_labels = [
    str(label) for label in best_pipeline.named_steps["model"].classes_
]
submission = pd.DataFrame(test_proba, columns=class_labels)

# Use the configured ID column name rather than a repeated "id" literal so
# this step stays in sync with the feature/target split.
if ID_COL in sample_submission.columns:
    submission.insert(0, ID_COL, test[ID_COL].values)

# Reorder (and restrict) the columns to exactly match the template.
submission = submission[sample_submission.columns]
submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully")
