In [None]:
# ==========================================================
# IMPORTS
# ==========================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import make_scorer, f1_score, hamming_loss

# ==========================================================
# CONFIG
# ==========================================================
# Scalar settings shared by the rest of the script.
ID_COL = "id"              # row-identifier column; dropped from the feature frames
RANDOM_STATE = 42          # seed passed to the CV splitter and the random forest
MISSING_THRESHOLD = 0.6    # columns with a larger missing fraction are dropped

# Target columns of the multi-label problem (one binary column per topic).
# Kept as a list because it is concatenated with `[ID_COL]` further down.
LABEL_COLS = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
]

# ==========================================================
# LOAD DATA
# ==========================================================
# Read the raw train/test files from the fixed /content directory.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# ==========================================================
# CLEAN TRAINING ROWS
# ==========================================================
# 1) Remove exact duplicate rows.
# 2) Discard rows in which *every* label column is missing, since they carry
#    no supervision signal.
# 3) Renumber the index so it is contiguous again.
# NOTE(review): rows with only some labels missing are kept by how="all";
# confirm the label columns contain no partial NaNs before fitting.
train = train.drop_duplicates()
train = train.dropna(subset=LABEL_COLS, how="all")
train = train.reset_index(drop=True)

# ==========================================================
# DROP HIGH MISSING COLUMNS
# ==========================================================
# Fraction of missing values per column, measured on the training frame.
missing_ratio = train.isnull().mean()

# Only feature columns are candidates for removal. The label columns and the
# ID column are excluded: dropping a sparse label here would make the later
# `train[LABEL_COLS]` lookup fail with a KeyError.
protected_cols = LABEL_COLS + [ID_COL]
drop_cols = missing_ratio[missing_ratio > MISSING_THRESHOLD].index.difference(
    protected_cols, sort=False
)

# Apply the same column removal to both frames so their features stay aligned.
train.drop(columns=drop_cols, inplace=True)
# Columns absent from the test frame are skipped instead of raising.
test.drop(columns=drop_cols, inplace=True, errors="ignore")

# ==========================
# SPLIT FEATURES & TARGET
# ==========================
# Columns that must not be fed to the model as inputs: the targets and the ID.
non_feature_cols = LABEL_COLS + [ID_COL]

# Multi-label target matrix: one column per entry of LABEL_COLS.
y = train[LABEL_COLS]

# Training features: everything that is neither a label nor the ID
# (errors="ignore" tolerates a missing ID column).
X = train.drop(columns=non_feature_cols, errors="ignore")

# Test features: only the ID column is removed, if it is present.
X_test_final = test.drop(columns=[ID_COL], errors="ignore")


# ==========================================================
# COLUMN TYPES
# ==========================================================
# Split the feature columns by dtype so each group gets its own treatment.
# NOTE(review): columns whose dtype matches neither selector (e.g. bool,
# category, int32) end up in no list -- confirm how ColumnTransformer's
# `remainder` setting should treat them.
numeric_frame = X.select_dtypes(include=["int64", "float64"])
object_frame = X.select_dtypes(include=["object"])
num_cols = numeric_frame.columns
cat_cols = object_frame.columns

# ==========================================================
# PREPROCESSING
# ==========================================================
# Numeric features: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Object features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps categories unseen during fit from
# raising at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its pipeline and concatenate the outputs.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# ==========================================================
# MODELS (MULTI-LABEL WRAPPED)
# ==========================================================
# Single-label base learners, keyed by the display name used in the results
# table. Insertion order is the order in which they are evaluated.
base_estimators = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        random_state=RANDOM_STATE,
    ),
}

# MultiOutputClassifier wraps each base learner so that one independent copy
# is fitted per label column.
models = {
    display_name: MultiOutputClassifier(estimator)
    for display_name, estimator in base_estimators.items()
}

# ==========================================================
# CROSS-VALIDATION (NO STRATIFICATION)
# ==========================================================
# Plain shuffled 5-fold split; the seed makes the folds reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

scoring = {
    "micro_f1": make_scorer(f1_score, average="micro"),
    # greater_is_better=False makes the scorer return the NEGATED loss, so
    # cross_validate reports values <= 0 under "test_hamming".
    "hamming": make_scorer(hamming_loss, greater_is_better=False),
}

# One row per model: [name, mean micro-F1, 1 - mean Hamming loss].
results = []

for name, model in models.items():

    # Preprocessing lives inside the pipeline so it is re-fitted on each
    # training fold only.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model),
    ])

    scores = cross_validate(
        pipe,
        X,
        y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )

    # Undo the scorer's sign flip to recover the actual (non-negative) loss.
    mean_hamming_loss = -scores["test_hamming"].mean()

    results.append([
        name,
        scores["test_micro_f1"].mean(),
        # The column is labelled "1 - Hamming Loss", so store exactly that
        # (previously the raw loss was stored under this label).
        1.0 - mean_hamming_loss,
    ])

# ==========================================================
# RESULTS
# ==========================================================
# Best model first: rows are ordered by descending CV micro-F1.
results_df = pd.DataFrame(
    results,
    columns=["Model", "CV Micro-F1", "CV (1 - Hamming Loss)"]
).sort_values("CV Micro-F1", ascending=False)

# The emoji was stored as mojibake (UTF-8 bytes decoded as cp1252); restored.
print("\n📊 MODEL COMPARISON (MULTI-LABEL)")
print(results_df)

# ==========================================================
# BEST MODEL
# ==========================================================
# results_df is sorted by descending CV micro-F1, so its first row holds the
# winning model's display name, which is also its key in `models`.
best_model_name = results_df.iloc[0]["Model"]
best_model = models[best_model_name]

# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print(f"\n✅ Best Model Selected: {best_model_name}")

# ==========================================================
# FINAL TRAINING
# ==========================================================
# Refit the selected model, together with the preprocessing, on the full
# training data.
final_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model),
])

final_pipeline.fit(X, y)

# ==========================================================
# SUBMISSION
# ==========================================================
# One predicted value per label column for every test row.
test_pred = final_pipeline.predict(X_test_final)

submission = pd.DataFrame(test_pred, columns=LABEL_COLS)

# Carry the row identifier into the submission so each prediction can be
# matched to its test row. Values are inserted positionally (to_numpy) to
# avoid index alignment; the step is skipped when the test frame has no
# ID column.
if ID_COL in test.columns:
    submission.insert(0, ID_COL, test[ID_COL].to_numpy())

submission.to_csv("submission.csv", index=False)

# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print("submission.csv saved ✅")
