In [None]:
# 1 importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# 2. Load the data and separate features from the target
train = pd.read_csv("/kaggle/input/mse-2-ai-201-b-ai-d/train.csv")
test = pd.read_csv("/kaggle/input/mse-2-ai-201-b-ai-d/test.csv")

TARGET = "Class"
# Presumably a plain row identifier: the submission step copies test["id"]
# straight into the output file. An identifier carries no generalisable
# signal, so it is kept out of the feature matrix.
# NOTE(review): confirm "id" is not a meaningful feature in this dataset.
ID_COL = "id"

# Rows without a label cannot be used for supervised training.
train = train.dropna(subset=[TARGET])

# Drop the target and, when the training file has one, the identifier column.
non_feature_cols = [TARGET]
if ID_COL in train.columns:
    non_feature_cols.append(ID_COL)

X = train.drop(columns=non_feature_cols)
y = train[TARGET]


# 3. Identify column types
# "number" matches every numeric dtype (int8..int64, uint*, float16..float64
# and pandas nullable Int*/Float*), not just int64/float64. This matters
# because the ColumnTransformer below uses the default remainder="drop":
# any column matched by neither selector is silently discarded.
# NOTE: bool and datetime columns are still matched by neither selector.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

# 4. Preprocessing and model pipeline
# Numeric columns: fill missing values with the column median.
numeric_branch = ("num", SimpleImputer(strategy="median"), num_cols)

# Categorical columns: replace missing values with the explicit "_MISS_"
# level, then one-hot encode. handle_unknown="ignore" keeps transform from
# failing on categories that were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="_MISS_")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
categorical_branch = ("cat", Pipeline(categorical_steps), cat_cols)

preprocess = ColumnTransformer(
    transformers=[numeric_branch, categorical_branch]
)

# Fixed random_state keeps the forest reproducible between runs.
model = RandomForestClassifier(n_estimators=300, random_state=42)

pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])


# 5. Train / validation split, fit, and validation predictions
# Hold out 20% of the rows; stratify on the target so both parts keep the
# same class proportions, and fix the seed so the split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Pipeline.fit returns the fitted pipeline, so predict can be chained on it.
y_pred = pipe.fit(X_train, y_train).predict(X_val)

# 6. Evaluation (Accuracy + F1 + Confusion Matrix)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1-score (macro):", f1_score(y_val, y_pred, average="macro"))

# Pin the matrix rows/columns to the classes the model was fitted on and use
# them as tick labels; otherwise the heatmap axes only show 0..k-1 positions
# and the reader has to guess which class each cell refers to.
class_labels = pipe.classes_
cm = confusion_matrix(y_val, y_pred, labels=class_labels)

plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # cells are integer counts
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# 7. Predict Test & Save Submission
from sklearn.base import clone  # only this step needs it

# The pipeline fitted above saw only the 80% training split. For the final
# predictions, fit an unfitted copy (same hyper-parameters and seed) on every
# labelled row. `pipe` itself is left untouched, so the validation metrics
# printed earlier still describe the object they were computed from.
final_pipe = clone(pipe).fit(X, y)

# The ColumnTransformer selects its input columns by name, so extra columns
# in `test` are ignored.
pred_test = final_pipe.predict(test)

# Include the identifier column in the submission only when the test file
# actually provides one.
if "id" in test.columns:
    submission = pd.DataFrame({"id": test["id"], TARGET: pred_test})
else:
    submission = pd.DataFrame({TARGET: pred_test})

submission.to_csv("submit.csv", index=False)
print("submit.csv created successfully!")


