In [None]:
# ==========================================================
# IMPORTS
# ==========================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# ==========================================================
# CONFIG
# ==========================================================
# Label column in train.csv; rows without it are dropped and it becomes `y`.
TARGET_COL = "Class"
# Row identifier; removed from the test features and echoed into the submission.
ID_COL = "id"
# Seed passed to the CV splitter and the tree-based models.
RANDOM_STATE = 42
# Training columns whose missing-value fraction exceeds this are dropped.
MISSING_THRESHOLD = 0.6

# ==========================================================
# LOAD DATA
# ==========================================================
train = pd.read_csv('/kaggle/input/mse-2-ai-201-b-ai-d/train.csv')
test = pd.read_csv('/kaggle/input/mse-2-ai-201-b-ai-d/test.csv')

# Clean the training frame in one pass: remove exact duplicate rows, discard
# rows that have no label (they cannot be used for supervised fitting), and
# renumber the index from 0 so it is contiguous again.
train = (
    train
    .drop_duplicates()
    .dropna(subset=[TARGET_COL])
    .reset_index(drop=True)
)

# ==========================================================
# DROP HIGH MISSING COLUMNS
# ==========================================================
# Fraction of missing values per training column (0.0 - 1.0).
missing_ratio = train.isna().mean()
# Columns above the threshold; decided on train only and mirrored on test.
drop_cols = missing_ratio.index[(missing_ratio > MISSING_THRESHOLD).to_numpy()]

train = train.drop(columns=drop_cols)
# errors="ignore": test is not required to contain every column listed here.
test = test.drop(columns=drop_cols, errors="ignore")

# ==========================================================
# SPLIT FEATURES & TARGET
# ==========================================================
# Separate predictors from the label.
#
# The identifier column is removed from the training features too. The test
# matrix built below has no ID_COL, and the ColumnTransformer selects its
# columns by name, so leaving ID_COL in X would make predict() on the test
# frame fail with a "columns are missing" error. (A row identifier is
# presumably not a meaningful predictor either.) errors="ignore" keeps this
# working when train.csv ships without an ID_COL column.
X = train.drop(columns=[TARGET_COL]).drop(columns=[ID_COL], errors="ignore")
y = train[TARGET_COL]

# ID_COL is kept on `test` itself because the submission needs it later.
X_test_final = test.drop(columns=[ID_COL])

# ==========================================================
# COLUMN TYPES
# ==========================================================
# Partition the feature columns by dtype so that each group can be sent to its
# own preprocessing pipeline in the ColumnTransformer.
# NOTE(review): only int64/float64 and object dtypes are matched. A column of
# any other dtype (bool, category, int32, ...) lands in neither list and is
# therefore not routed to a transformer — confirm the data has no such columns.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# ==========================================================
# PREPROCESSING
# ==========================================================
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time instead
# of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# ==========================================================
# MODELS
# ==========================================================
# Candidate estimators. The two that accept class_weight use "balanced";
# the two tree ensembles are seeded with RANDOM_STATE.
logistic_regression = LogisticRegression(class_weight="balanced", max_iter=1000)
random_forest = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=300,
    random_state=RANDOM_STATE,
)
gradient_boosting = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Display name -> unfitted estimator, in the order they are evaluated.
models = {
    "Logistic Regression": logistic_regression,
    "Random Forest": random_forest,
    "Gradient Boosting": gradient_boosting,
}

# ==========================================================
# CROSS-VALIDATION
# ==========================================================
# Stratified 5-fold split, shuffled with a fixed seed.
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=5)

# Metric label -> scikit-learn scorer name. cross_validate reports each one
# under the key "test_<label>".
scoring = {"accuracy": "accuracy", "roc_auc": "roc_auc_ovr"}

# One [model name, mean accuracy, mean ROC-AUC] row per candidate.
results = []

for name, model in models.items():
    # Preprocessing is part of the pipeline, so it is fitted on the training
    # portion of each fold only.
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    scores = cross_validate(pipe, X, y, scoring=scoring, cv=cv, n_jobs=-1)

    # Average each metric over the folds.
    mean_accuracy = scores["test_accuracy"].mean()
    mean_roc_auc = scores["test_roc_auc"].mean()
    results.append([name, mean_accuracy, mean_roc_auc])

# ==========================================================
# RESULTS
# ==========================================================
# Tabulate the per-model CV means, then rank with the highest ROC-AUC first.
results_df = pd.DataFrame(results, columns=["Model", "CV Accuracy", "CV ROC-AUC"])
results_df = results_df.sort_values(by="CV ROC-AUC", ascending=False)

print("\n MODEL COMPARISON")
print(results_df)

# ==========================================================
# BEST MODEL
# ==========================================================
# results_df is sorted by ROC-AUC descending, so its first row is the winner.
best_model_name = results_df["Model"].iloc[0]
best_model = models[best_model_name]

print(f"\n Best Model Selected: {best_model_name}")

# ==========================================================
# FINAL TRAINING
# ==========================================================
# Refit preprocessing plus the selected estimator on the full training set.
final_steps = [
    ("preprocessor", preprocessor),
    ("model", best_model),
]
final_pipeline = Pipeline(steps=final_steps)

final_pipeline.fit(X, y)

# ==========================================================
# SUBMISSION
# ==========================================================
# Predict labels for the test features with the fitted pipeline.
test_pred = final_pipeline.predict(X_test_final)

# Two-column submission: the identifier from test, then the predicted label.
submission = test[[ID_COL]].copy()
submission[TARGET_COL] = test_pred

submission.to_csv("submission.csv", index=False)
print("submission.csv saved ")
