# In [None]:  (Jupyter cell marker left over from the notebook export)
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Load the labelled training rows and the unlabelled test rows from the
# absolute Kaggle-style input paths below.
train = pd.read_csv("/kaggle/input/multilabel/train (2).csv")
test = pd.read_csv("/kaggle/input/multilabel/test (2).csv")

# Label columns to predict; every other column of `train` is used as a feature.
TARGET_COLS = ["Label_A", "Label_B", "Label_C"]

X = train.drop(columns=TARGET_COLS)
y = train[TARGET_COLS]

# Group the feature columns by dtype so each group can get its own
# preprocessing. "number" selects every numeric dtype (int32, float32,
# unsigned ints, ...), not only int64/float64: a numeric column that matched
# neither group would otherwise never reach a transformer and would be lost
# without any warning.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns
# Numeric branch: replace missing values with the column median, then
# standardize each column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: replace missing values with the most frequent value,
# then one-hot encode. handle_unknown="ignore" keeps transform from failing
# on categories that were not present during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
# Settings shared by both tree ensembles.
_tree_ensemble_params = {
    "n_estimators": 200,
    "random_state": 42,
    "class_weight": "balanced",
}

# Candidate single-label base estimators, keyed by the name used in the
# printed report and in `scores`.
models = {
    "RandomForest": RandomForestClassifier(**_tree_ensemble_params),
    "ExtraTrees": ExtraTreesClassifier(**_tree_ensemble_params),
    "LogisticRegression": LogisticRegression(solver="lbfgs", max_iter=2000),
}
# Fit every candidate on the training split and record its micro-averaged F1
# on the validation split, keyed by model name.
scores = {}

for name, base_model in models.items():
    print(f"\nTraining {name}...")

    # One independent classifier is trained per target column.
    model = MultiOutputClassifier(base_model)

    # Pipeline fits its steps in place rather than copying them, so each
    # candidate gets its own unfitted copy of the preprocessor. Sharing one
    # instance would let a later fit overwrite the preprocessing state of
    # every pipeline built earlier.
    pipeline = Pipeline([
        ("preprocessing", clone(preprocessor)),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    val_preds = pipeline.predict(X_val)

    # Micro averaging pools the decisions of all target columns before
    # computing F1.
    score = f1_score(y_val, val_preds, average="micro")
    scores[name] = score

    print(f"{name} Micro-F1 Score: {score:.4f}")
# Pick the candidate with the highest validation score (on a tie, the first
# one in `scores` wins).
best_model_name = max(scores, key=scores.get)
print("\nBest Model:", best_model_name)
best_base_model = models[best_model_name]
best_model = MultiOutputClassifier(best_base_model)

# Refit the winner on all labelled rows. The preprocessor is cloned because
# Pipeline fits its steps in place: reusing the shared instance would also
# re-fit the preprocessing step inside the validation pipeline on data that
# includes the validation rows.
final_pipeline = Pipeline([
    ("preprocessing", clone(preprocessor)),
    ("model", best_model)
])

final_pipeline.fit(X, y)
test_preds = final_pipeline.predict(test)

# One prediction column per target, in TARGET_COLS order.
submission = pd.DataFrame(
    test_preds,
    columns=TARGET_COLS
)

submission.to_csv("submission.csv", index=False)
print("submission.csv created with best model!")
