In [None]:
# =========================
# 1. IMPORT LIBRARIES
# =========================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# =========================
# 2. LOAD DATA
# =========================
import os

from dotenv import dotenv_values, load_dotenv

# Still load the .env file into the process environment for any other settings.
load_dotenv()

# The dataset location is stored under the key "PATH" in the .env file.
# "PATH" is also the operating system's executable search path, and
# load_dotenv() does not override variables that already exist in the
# environment, so os.getenv("PATH") returns the system search path instead
# of the CSV location. Read the value straight from the .env file instead.
# A non-colliding DATA_PATH variable, when set, takes precedence.
data_path = os.getenv("DATA_PATH") or dotenv_values().get("PATH")
if not data_path:
    raise ValueError(
        "Dataset location not configured: set DATA_PATH, or define PATH in the .env file."
    )

# PassengerId becomes the row index of the raw frame.
df = pd.read_csv(data_path, index_col="PassengerId")

In [None]:
# =========================
# 3. HANDLE MISSING VALUES
# =========================
# NOTE(review): the fill statistics below are computed on the whole dataset,
# i.e. before the train/test split in section 8, so held-out rows influence
# the imputed values.

# Age → mean
df["Age"] = df["Age"].fillna(df["Age"].mean())

# Fare → convert to numeric + median (unparseable entries become NaN first)
df["Fare"] = pd.to_numeric(df["Fare"], errors="coerce")
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# Sex → mode
df["Sex"] = df["Sex"].fillna(df["Sex"].mode()[0])

# Embarked → mode
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# Cabin → Deck extraction (first character of the cabin string)
# Guarded so the cell can be executed again: after the first run "Cabin" has
# been removed, and an unguarded second run would raise KeyError.
if "Cabin" in df.columns:
    df["Deck"] = df["Cabin"].str[0].fillna("Missing")
    df = df.drop(columns=["Cabin"])

# Ticket → drop rows with missing ticket
df = df.dropna(subset=["Ticket"])

In [None]:
# =========================
# 4. ENCODING
# =========================
# Encode on a copy so the cleaned frame `df` is left untouched.
df_copy = df.copy()

# Sex → integer codes from LabelEncoder; the original text column is removed
le = LabelEncoder()
df_copy = df_copy.assign(Sex_encoded=le.fit_transform(df_copy["Sex"])).drop(
    columns=["Sex"]
)

# Embarked → one-hot indicator columns, with the first category dropped
df_copy = pd.get_dummies(df_copy, columns=["Embarked"], drop_first=True)

# Any boolean column (such as indicator columns emitted as bool) → int
bool_cols = df_copy.select_dtypes(include="bool").columns
df_copy = df_copy.astype({name: int for name in bool_cols})


In [None]:
# =========================
# 5. DROP STRING COLUMNS
# =========================
# Name and Ticket are not part of the feature list used for the model.
# Rebinding (rather than inplace=True) together with errors="ignore" keeps
# this cell re-runnable: executing it a second time no longer raises KeyError
# for columns that are already gone.
df_copy = df_copy.drop(columns=["Name", "Ticket"], errors="ignore")


In [None]:
# =========================
# 6. FORCE NUMERIC TYPES (CRITICAL FIX)
# =========================
# Coerce both columns to numeric; entries that cannot be parsed become NaN.
df_copy = df_copy.assign(
    **{
        name: pd.to_numeric(df_copy[name], errors="coerce")
        for name in ("Pclass", "Parch")
    }
)

In [None]:
# =========================
# 7. DEFINE FEATURES & TARGET
# =========================
X = df_copy[
    ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex_encoded", "Embarked_Q", "Embarked_S"]
]

y = df_copy["Survived"]
X = X.dropna()
y = y.loc[X.index]



In [None]:
# =========================
# 8. TRAIN TEST SPLIT
# =========================
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# =========================
# 9. FEATURE SCALING
# =========================
scaler = StandardScaler()

# Columns passed through the scaler; the other features are left as they are.
scaled_columns = ["Age", "Fare", "Pclass", "SibSp", "Parch"]

# Fit on the training rows only, then apply the same fitted scaler to the test rows.
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

In [None]:

# =========================
# 10. TRAIN K-NEAREST NEIGHBORS
# =========================
# A large K tends towards underfitting; a very small K tends towards overfitting.
n_neighbors = 9
model = KNeighborsClassifier(n_neighbors=n_neighbors)
model.fit(X_train, y_train)

In [None]:
# =========================
# 11. PREDICTIONS
# =========================
# Predict a label for every row of the held-out test features using the fitted model.
model_predictions = model.predict(X_test)

In [None]:
# =========================
# 12. METRICS
# =========================
# Score the test-set predictions with each metric, in reporting order.
acc, f1, precision, recall = (
    metric(y_test, model_predictions)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# One line per metric; labels are padded so the colons line up.
for label, value in (
    ("Accuracy :", acc),
    ("F1 Score :", f1),
    ("Precision:", precision),
    ("Recall   :", recall),
):
    print(label, value)


In [None]:

# =========================
# 13. ACTUAL vs PREDICTED
# =========================
# Pair each true label with its prediction, row by row; the original index is
# discarded because only the underlying values of y_test are used.
actual_values = y_test.values
comparison_df = pd.DataFrame(
    {"Actual": actual_values, "Predicted": model_predictions}
)

# Show the first ten pairs.
preview = comparison_df.head(10)
print(preview)



In [None]:
# =========================
# 14.  CONFUSION MATRIX
# =========================
# confusion_matrix is already imported in the imports cell at the top of the notebook.
cm = confusion_matrix(y_test, model_predictions)

# Draw the annotated heatmap on an explicit Axes and label it through that Axes.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()