In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    log_loss,
    roc_curve
)

In [None]:
# CONFIG
# Input CSV files, read relative to the notebook's working directory.
TRAIN_PATH, TEST_PATH = "train.csv", "test.csv"

# Column holding the row identifier and column holding the label to predict.
ID_COL = "id"
TARGET_COL = "Status"

# Behaviour switches:
#   USE_LABEL_ENCODER_ON_Y - integer-encode the target with LabelEncoder before fitting
#   MAKE_PROB_SUBMISSION   - write the per-class probability CSV
#   MAKE_LABEL_SUBMISSION  - write the predicted-label CSV
USE_LABEL_ENCODER_ON_Y = True
MAKE_PROB_SUBMISSION = True
MAKE_LABEL_SUBMISSION = True

In [None]:
# LOAD DATA
# Read both CSVs up front so the shape/column summary below covers the same
# frames every later cell works with.
train_df, test_df = (pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH))

for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)

column_names = train_df.columns.tolist()
print("\nColumns:", column_names)

# Per-column count of missing cells in the training frame.
missing_per_column = train_df.isna().sum()
print("\nMissing values:\n", missing_per_column)

# Row count per target class (most frequent first).
target_counts = train_df[TARGET_COL].value_counts()
print("\nTarget distribution:\n", target_counts)

In [None]:
# EDA

# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_df[TARGET_COL], ax=ax)
ax.set_title("Target Count")
plt.show()

# Heatmap of the boolean missing-value mask (one cell per row/column value).
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(train_df.isnull(), cbar=False, ax=ax)
ax.set_title("Missing Data")
plt.show()

# Detect types
# Everything except the label and the id is a candidate feature; split the
# candidates into numeric and non-numeric by pandas dtype.
non_feature_cols = {TARGET_COL, ID_COL}
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
feature_frame = train_df[feature_cols]
num_cols = feature_frame.select_dtypes(include=["number"]).columns.tolist()
cat_cols = feature_frame.select_dtypes(exclude=["number"]).columns.tolist()

print("\nNumeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Simple Distribution Plots
# One box plot per numeric feature.
for num_col in num_cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxplot(x=train_df[num_col], ax=ax)
    ax.set_title(f"Box Plot: {num_col}")
    plt.show()

# Correlation
# A correlation matrix needs at least two numeric columns to be informative.
if len(num_cols) > 1:
    corr_matrix = train_df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(corr_matrix, cmap="coolwarm", ax=ax)
    ax.set_title("Correlation Heatmap")
    plt.show()

In [None]:
# Cleaning
# Remove fully duplicated rows, then rows without a label (they cannot be used
# for supervised training).
train_df = train_df.drop_duplicates()
train_df = train_df.dropna(subset=[TARGET_COL])

# Outlier Capping
# Cap every numeric feature to the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The fences are computed on the training frame only and then reused for the
# test frame, so both sets go through the same transformation.
# NOTE(review): the fences are computed before the train/validation split, so
# the validation rows contribute to them.
for col in num_cols:
    q1, q3 = train_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1

    # A zero IQR means at least half of the values are identical (0/1 flags,
    # sparse counts); both fences would equal Q1 and capping would flatten the
    # whole column to a constant. A NaN IQR (all-missing column) gives no usable
    # fences either. Leave such columns untouched.
    if not iqr > 0:
        continue

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # clip() caps both sides in one pass and leaves NaN values as NaN.
    train_df[col] = train_df[col].clip(lower=lower, upper=upper)

    # Apply the train-derived fences to the test features when the column is
    # present there and numeric.
    if col in test_df.columns and pd.api.types.is_numeric_dtype(test_df[col]):
        test_df[col] = test_df[col].clip(lower=lower, upper=upper)


# Split
# Label vector and feature matrix from the cleaned training frame; the id is
# not a feature, so it is dropped together with the label.
y = train_df[TARGET_COL]
X = train_df.drop(columns=[TARGET_COL, ID_COL])

# Test features, with the ids kept aside for the submission files.
X_test = test_df.drop(columns=ID_COL)
test_ids = test_df[ID_COL]

# Column-align the test matrix to the training matrix (left join on columns):
# X keeps its columns and X_test is reindexed to exactly those columns.
X, X_test = X.align(X_test, join="left", axis=1)

# Recompute the numeric / non-numeric column groups from the final feature matrix.
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(exclude=["number"]).columns.tolist()


# Encode Target
# le_y is the fitted encoder when encoding is enabled and None otherwise, so
# later cells can map integer predictions back to the original labels.
le_y = LabelEncoder() if USE_LABEL_ENCODER_ON_Y else None
if le_y is not None:
    y = le_y.fit_transform(y)

# Train Split
# 80/20 hold-out, stratified on the target so both parts keep the class
# proportions; fixed seed for reproducibility.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts


# Preprocessing
# Numeric features: fill missing values with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [None]:
# Model + Tuning
# Random forest with class-balanced sample weights, chained after the
# preprocessor so every CV fold fits its own imputers/scaler/encoder.
rf = RandomForestClassifier(class_weight="balanced", random_state=42)
model = Pipeline(steps=[("preprocessor", preprocessor), ("clf", rf)])

# Hyperparameter grid; the "clf__" prefix addresses the forest step of the pipeline.
param_grid = dict(
    clf__n_estimators=[200, 500],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
    clf__max_features=["sqrt", "log2"],
)

# Exhaustive search, 3-fold CV, scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
best_model = grid_search.best_estimator_

print("\nBest hyperparameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)


# Validation
# Hard predictions and per-class probabilities on the hold-out split.
y_pred = best_model.predict(X_val)
y_prob = best_model.predict_proba(X_val)

# Report label-based metrics on the original class labels when the target was
# integer-encoded, so the confusion matrix and report are readable.
if USE_LABEL_ENCODER_ON_Y:
    y_pred = le_y.inverse_transform(y_pred)
    y_true = le_y.inverse_transform(y_val)
else:
    y_true = y_val

print("\nAccuracy:", accuracy_score(y_true, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))

# ROC-AUC
# roc_auc_score expects a 1-D score vector for a two-class target and raises
# when given the full (n_samples, 2) probability matrix, so pass only the
# probability of the second class in that case. With three or more classes the
# full matrix is scored one-vs-rest.
if y_prob.shape[1] == 2:
    roc = roc_auc_score(y_val, y_prob[:, 1])
else:
    roc = roc_auc_score(y_val, y_prob, multi_class="ovr")
print("\nROC-AUC:", roc)

In [None]:
# ROC CURVE (one-vs-rest, one curve per class)
# Classes in the order the classifier uses for the columns of predict_proba.
classes = best_model.named_steps["clf"].classes_

# Build the indicator matrix from the classifier's own class values so it
# lines up with y_val whether or not the target was integer-encoded.
y_bin = label_binarize(y_val, classes=classes)
if y_bin.shape[1] == 1:
    # For a two-class target label_binarize returns a single column (the
    # indicator of the second class); expand it to one column per class.
    y_bin = np.hstack([1 - y_bin, y_bin])

# Show the original class names in the legend instead of encoded integers.
class_names = le_y.inverse_transform(classes) if USE_LABEL_ENCODER_ON_Y else classes

for i in range(len(classes)):
    fpr, tpr, _ = roc_curve(y_bin[:, i], y_prob[:, i])
    plt.plot(fpr, tpr, label=f"{class_names[i]}")

plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

In [None]:
# Train on Full Data
# Refit the tuned pipeline on all labelled rows (train + validation parts).
# From here on best_model is the refitted pipeline, not the one that was
# scored on the validation split.
best_model.fit(X, y)


# Test Predictions
# Hard labels and per-class probabilities for the test features.
test_pred = best_model.predict(X_test)
test_prob = best_model.predict_proba(X_test)

# Map integer predictions back to the original class labels.
if USE_LABEL_ENCODER_ON_Y:
    test_pred = le_y.inverse_transform(test_pred)

In [None]:
# Submission Files
# Probability Submission
# One row per test id and one "<TARGET_COL>_<class>" column per class.
if MAKE_PROB_SUBMISSION:
    sub_prob = pd.DataFrame({ID_COL: test_ids})

    # Take the class order from the fitted classifier itself: that is the
    # order of the columns in test_prob, and it avoids depending on a variable
    # defined only in the ROC plotting cell.
    fitted_classes = best_model.named_steps["clf"].classes_
    if USE_LABEL_ENCODER_ON_Y:
        class_labels = le_y.inverse_transform(fitted_classes)
    else:
        class_labels = fitted_classes

    for i, c in enumerate(class_labels):
        sub_prob[f"{TARGET_COL}_{c}"] = test_prob[:, i]

    sub_prob.to_csv("submission_probabilities.csv", index=False)
    print("\nSaved submission_probabilities.csv")

# Label Submission
# One row per test id with the predicted class label.
if MAKE_LABEL_SUBMISSION:
    sub_label = pd.DataFrame({
        ID_COL: test_ids,
        TARGET_COL: test_pred
    })
    sub_label.to_csv("submission_labels.csv", index=False)
    print("Saved submission_labels.csv")