# In [None]:  (Jupyter cell marker left over from the notebook export)
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# Config
# Input CSV locations (relative paths).
TRAIN_PATH, TEST_PATH = "train.csv", "test.csv"
# Column holding the class label, and column holding the row identifier.
TARGET, ID_COL = "Status", "id"


# Load Data
# Training rows without a label cannot be used for fitting, so they are
# dropped as part of the read.
train = pd.read_csv(TRAIN_PATH).dropna(subset=[TARGET])
test = pd.read_csv(TEST_PATH)

# Quick sanity check of the (rows, columns) of each frame.
print("Train shape:", train.shape)
print("Test shape :", test.shape)


# EDA
# Class balance of the target: absolute counts first, then fractions.
target_values = train[TARGET]
print("\nTarget distribution:")
print(target_values.value_counts())
print("\nTarget distribution (ratio):")
print(target_values.value_counts(normalize=True))

# Bar chart of the class counts.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(x=target_values, ax=ax)
ax.set_title("Target Distribution")
plt.show()

# Heatmap of the missing-value mask of the training frame (no colour bar).
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(train.isnull(), cbar=False, ax=ax)
ax.set_title("Missing Values Overview")
plt.show()


# Visualisation
# Split the feature columns by dtype.  The target and the id column are
# removed from BOTH lists: these lists are later handed to the
# ColumnTransformer, which is applied to a frame that no longer contains
# either column.  Dropping only one name per list (as before) left a numeric
# target inside num_cols, or a non-numeric id inside cat_cols.
non_feature_cols = [ID_COL, TARGET]
num_cols = train.select_dtypes(include="number").columns.drop(non_feature_cols, errors="ignore")
cat_cols = train.select_dtypes(exclude="number").columns.drop(non_feature_cols, errors="ignore")

print("\nNumeric Feature Summary:")
# NOTE(review): `display` is not imported in this file; it is presumably the
# IPython/Jupyter builtin, so this line needs a notebook-style session.
display(train[num_cols].describe().T)

# Histogram (with KDE overlay) for the first five numeric features.
for col in num_cols[:5]:
    plt.figure(figsize=(4,3))
    sns.histplot(train[col], kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

# Per-class box plots for the first three numeric features.
for col in num_cols[:3]:
    plt.figure(figsize=(4,3))
    sns.boxplot(x=train[TARGET], y=train[col])
    plt.title(f"{col} vs {TARGET}")
    plt.show()

# A correlation matrix only makes sense with at least two numeric features.
if len(num_cols) > 1:
    plt.figure(figsize=(8,6))
    sns.heatmap(train[num_cols].corr(), cmap="coolwarm", center=0)
    plt.title("Numeric Feature Correlation")
    plt.show()

print("\nCategorical Cardinality:")
for col in cat_cols:
    print(f"{col}: {train[col].nunique()} unique values")

# Class breakdown for the first categorical feature, when there is one.
if len(cat_cols) > 0:
    plt.figure(figsize=(6,3))
    sns.countplot(data=train, x=cat_cols[0], hue=TARGET)
    plt.xticks(rotation=30)
    plt.title(f"{cat_cols[0]} vs {TARGET}")
    plt.show()

# Training features/labels: every column except the target and the id.
y = train[TARGET]
X = train.drop(columns=[TARGET, ID_COL])
# Test ids are kept aside for the submission files; the id is not a feature.
test_ids = test[ID_COL]
X_test = test.drop(columns=ID_COL)


# Encoding
# Replace the target labels with integer codes.  The fitted encoder `le` is
# kept so predictions can be mapped back to the original labels.
le = LabelEncoder().fit(y)
y = le.transform(y)

n_classes = len(le.classes_)
print("\nClasses:", list(le.classes_))
print("Number of classes:", n_classes)


# Preprocessing
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode; unknown categories are ignored (handle_unknown="ignore").
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_steps, num_cols),
    ("cat", categorical_steps, cat_cols),
])


# Models
# Candidate classifiers, keyed by display name.  Insertion order matters:
# it is the order in which they are trained and the tie-break order when the
# best score is picked.
models = {
    # `multi_class` is intentionally not passed: "auto" was already the
    # default, and the parameter is deprecated in scikit-learn >= 1.5, so
    # passing it only produces a FutureWarning (and an error once removed).
    "LogisticRegression": LogisticRegression(
        max_iter=1000
    ),
    "GradientBoosting": GradientBoostingClassifier(
        random_state=42
    ),
    # class_weight="balanced" reweights classes inversely to their frequency.
    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight="balanced"
    )
}


# Train/Validate
# Hold out 20% of the rows for validation.  The split is stratified on the
# encoded target and seeded so it is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Validation accuracy per model name; read afterwards to pick the best model.
scores = {}

for name, model in models.items():
    # Preprocessing is fitted inside the pipeline, i.e. on X_train only, so
    # nothing from the validation rows leaks into imputation or scaling.
    pipe = Pipeline([
        ("prep", preprocess),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_val)
    probs = pipe.predict_proba(X_val)

    acc = accuracy_score(y_val, preds)
    scores[name] = acc

    print(f"\n{name}")
    print(f"Accuracy: {acc:.4f}")

    # ----- Binary classification -----
    if n_classes == 2:
        # Column 1 holds the probability of the second (encoded 1) class.
        auc = roc_auc_score(y_val, probs[:, 1])
        print(f"ROC-AUC : {auc:.4f}")

        plt.figure(figsize=(4,3))
        sns.histplot(probs[:, 1], bins=30)
        plt.title(f"{name} Validation Probabilities")
        plt.show()

    # ----- Multi-class classification -----
    else:
        # Pass the fitted class list explicitly so the label set always
        # matches the columns of `probs`; without it log_loss infers the
        # classes from y_val and fails if a rare class is absent from it.
        ll = log_loss(y_val, probs, labels=pipe.classes_)
        print(f"Log Loss: {ll:.4f}")


# Best Model
# Highest validation accuracy wins; on a tie the model listed first is kept.
best_name, _ = max(scores.items(), key=lambda entry: entry[1])
best_model = models[best_name]

print(f"\nBest model: {best_name}")

# Refit preprocessing + winning estimator on all labelled rows.
final_pipe = Pipeline(steps=[("prep", preprocess), ("model", best_model)])
final_pipe.fit(X, y)


# Test Predictions
# Per-class probabilities for every test row.
test_probs = final_pipe.predict_proba(X_test)

# Hard predictions come back as integer codes; decode them to the original
# target labels with the fitted label encoder.
test_preds = final_pipe.predict(X_test)
test_preds_label = le.inverse_transform(test_preds)


# SUBMISSION (CLASS LABELS)
# Two columns: the test id and the predicted (decoded) class label.
submission = pd.DataFrame({ID_COL: test_ids}).assign(**{TARGET: test_preds_label})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


# SUBMISSION (PROBABILITIES)
# One probability column per class, named "<target>_<class label>".
prob_cols = [f"{TARGET}_{class_label}" for class_label in le.classes_]

prob_submission = pd.DataFrame(test_probs, columns=prob_cols)
# The id goes first, ahead of the probability columns.
prob_submission.insert(0, ID_COL, test_ids)

prob_submission.to_csv("submission_proba.csv", index=False)
print("Saved submission_proba.csv")
