In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# =====================
# LOAD DATA
# =====================
# NOTE(review): both paths look truncated ("tra", "test.") — confirm the
# actual file names in the dataset directory before running.
train = pd.read_csv("/kaggle/input/mock-test-2-mse-2/tra")
test = pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.")

TARGET = "Status"   # change if needed

# Training features are every column except the row id and the label.
y = train[TARGET]
X = train.drop(columns=["id", TARGET])

# Test ids are set aside for the submission file; the remaining columns
# are the test features.
test_id = test["id"]
X_test = test.drop(columns="id")

# =====================
# HANDLE MISSING
# =====================
# Numeric and non-numeric columns are filled differently. Writing the string
# "missing" into a numeric column would turn it into an object column, and
# get_dummies would then one-hot encode every distinct number instead of
# keeping a single numeric feature.
numeric_cols = X.select_dtypes(include="number").columns

# Default fill for every training column is the "missing" category ...
fill_values = {col: "missing" for col in X.columns}
# ... overridden for numeric columns by the TRAINING median, so the test set
# is imputed with the same statistics the model was fitted on. A column that
# is entirely NaN in train has no median; fall back to 0 for it.
fill_values.update(X[numeric_cols].median().fillna(0).to_dict())

X = X.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)

# =====================
# SAFE ENCODING
# =====================
# One-hot encode the remaining object/category columns independently, then
# force the test frame onto the training columns: categories seen only in
# test are dropped, categories seen only in train become all-zero columns.
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

# =====================
# ENCODE TARGET
# =====================
# Fit the encoder on the raw labels, then map them to integer codes.
# `le` is kept so the codes can be translated back to class names later.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# =====================
# TRAIN / VALIDATION SPLIT
# =====================
# Hold out 20% of the rows for validation; stratifying on the encoded label
# keeps the class proportions the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    test_size=0.2,
    random_state=42,
)

# =====================
# RANDOM FOREST MODEL
# =====================
# Build the forest and fit it on the training split in one expression
# (`fit` returns the estimator itself).
model = RandomForestClassifier(
    n_estimators=600,
    max_depth=16,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# =====================
# ACCURACY
# =====================
# Predictions on the held-out split are kept in `y_val_pred` because the
# confusion matrix section reuses them.
y_val_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {acc:.4f}")

# =====================
# CONFUSION MATRIX (COLORFUL)
# =====================
# Pass every encoded class id explicitly so the matrix always has one
# row/column per entry of le.classes_, even if a class happens to be absent
# from the validation split.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_val, y_val_pred, labels=class_ids)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.classes_
)

# Draw on an explicitly created axes. Without `ax=`, ConfusionMatrixDisplay
# opens its own figure, which leaves a separately created figure empty and
# ignores its figsize.
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap=plt.cm.viridis, values_format="d")
ax.set_title("Confusion Matrix (Validation)")
plt.show()

# =====================
# FEATURE IMPORTANCE PLOT (TOP 15)
# =====================
importances = model.feature_importances_
# argsort is ascending, so the last 15 positions are the most important
# features (fewer if the model has fewer than 15 features).
indices = np.argsort(importances)[-15:]

# Horizontal bars, one per selected feature, labelled with the column name.
fig, ax = plt.subplots(figsize=(8, 6))
bar_positions = range(len(indices))
ax.barh(bar_positions, importances[indices], color="orange")
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns[indices])
ax.set_xlabel("Importance")
ax.set_title("Top 15 Feature Importances (Random Forest)")
plt.show()

# =====================
# PREDICT ON TEST
# =====================
# One probability column per class the model was trained on.
proba = model.predict_proba(X_test)

# =====================
# SUBMISSION CSV
# =====================
submission = pd.DataFrame({"id": test_id})
# predict_proba columns are ordered like model.classes_ (encoded labels), so
# translate those codes back to the original class names instead of assuming
# they line up position-for-position with le.classes_.
for col_idx, cls in enumerate(le.inverse_transform(model.classes_)):
    submission[f"{TARGET}_{cls}"] = proba[:, col_idx]

# Use one path for both writing and reporting so the message cannot drift
# from the file that was actually written.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)
print(f"{submission_path} saved!")
