In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
# Load the labelled training table and the unlabelled test table.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Names of the label columns to predict -- change these to match the dataset.
TARGET_COLS = ["Label_A", "Label_B", "Label_C"]

# Targets keep the TARGET_COLS order; every other column is a feature.
y = train[TARGET_COLS]
X = train.drop(columns=TARGET_COLS)
# Numeric features: "number" matches every numeric dtype (e.g. int32,
# float32, unsigned ints), not only the int64/float64 defaults, so such
# columns are no longer silently left out of the model.
num_cols = X.select_dtypes(include="number").columns
# Categorical features: string (object) columns and pandas categoricals.
cat_cols = X.select_dtypes(include=["object", "category"]).columns

# Numeric branch: fill missing values with the column median, then
# standardise each column.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),   # handles nulls
    ("scaler", StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at predict
# time instead of raising.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # handles nulls
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own branch. Columns in neither list
# (e.g. bool or datetime) are dropped: ColumnTransformer discards unlisted
# columns by default.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])
# Base estimator: a class-balanced random forest with a fixed seed.
# MultiOutputClassifier fits one such forest per target column.
base_model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_estimators=300,
)
model = MultiOutputClassifier(estimator=base_model)

# Full pipeline: preprocessing first, then the multi-output classifier, so
# raw DataFrames can be passed straight to fit() and predict().
pipeline_steps = [
    ("preprocessing", preprocessor),
    ("model", model),
]
pipeline = Pipeline(steps=pipeline_steps)
# Hold out 20% of the labelled rows so the model is scored on unseen data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)

# predict() returns one column per target, in the same order as y's columns.
val_preds = pipeline.predict(X_val)

print("Sample multilabel prediction:")
print(val_preds[:5])

# Score each target separately. accuracy_score is applied per column so it
# works for binary and multiclass targets alike.
for col_idx, target_name in enumerate(TARGET_COLS):
    target_acc = accuracy_score(y_val[target_name], val_preds[:, col_idx])
    print(f"Validation accuracy [{target_name}]: {target_acc:.4f}")

# Exact-match rate: share of rows where every target is predicted correctly.
exact_match = (val_preds == y_val.to_numpy()).all(axis=1).mean()
print(f"Validation exact-match accuracy: {exact_match:.4f}")
# Refit on every labelled row (training + validation) before predicting the
# test set.
pipeline.fit(X, y)
test_preds = pipeline.predict(test)

# One output column per target, in TARGET_COLS order; the row index is not
# written to the file.
submission = pd.DataFrame(data=test_preds, columns=TARGET_COLS)
submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully!")
