In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    average_precision_score
)

# Directory that receives every figure saved by this notebook.
RESULTS_DIR = "results"
os.makedirs(RESULTS_DIR, exist_ok=True)  # exist_ok: re-running the cell is safe

# Load dataset.
# The CSV location can be overridden with the TITANIC_CSV environment variable
# so the notebook is not tied to one machine's absolute path; the original
# location is kept as the default so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "TITANIC_CSV",
    r"C:\Users\Dell\Downloads\archive (5)\Titanic-Dataset.csv",
)
df = pd.read_csv(DATA_PATH)


# Clean and encode in a single chain:
#   1. discard rows where Age or Embarked is missing
#   2. replace the Sex and Embarked labels with integer codes
# Labels absent from a mapping become NaN (pandas Series.map behaviour).
df = (
    df.dropna(subset=["Age", "Embarked"])
    .assign(
        Sex=lambda frame: frame["Sex"].map({"male": 0, "female": 1}),
        Embarked=lambda frame: frame["Embarked"].map({"C": 0, "Q": 1, "S": 2}),
    )
)

# Model inputs (feature columns) and the target column
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
y = df["Survived"]
X = df[features]

# One seed shared by the split and the random forest so runs are repeatable
SEED = 42

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Construct and fit each classifier (scikit-learn's fit() returns the estimator)
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
rf = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_lr, y_pred_rf = lr.predict(X_test), rf.predict(X_test)

# --- Confusion Matrices (Logistic Regression and Random Forest) ---
def _save_confusion_matrix(cm, title, path):
    """Render a confusion matrix, save it as an image, and close the figure.

    Parameters
    ----------
    cm : array-like
        Confusion matrix as returned by ``confusion_matrix``.
    title : str
        Title drawn above the matrix.
    path : str
        Destination file for the saved figure.

    Returns
    -------
    ConfusionMatrixDisplay
        The display object that was plotted.
    """
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    # Use the display's own Axes/Figure handles instead of pyplot's implicit
    # "current figure", so the title and the saved file always belong to
    # this plot.
    disp.ax_.set_title(title)
    disp.figure_.savefig(path)
    plt.close(disp.figure_)  # release the figure once it is on disk
    return disp


cm_lr = confusion_matrix(y_test, y_pred_lr)
disp_lr = _save_confusion_matrix(
    cm_lr,
    "Confusion Matrix - Logistic Regression",
    "results/confusion_matrix_lr.png",
)

cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = _save_confusion_matrix(
    cm_rf,
    "Confusion Matrix - Random Forest",
    "results/confusion_matrix_rf.png",
)

# --- Precision-Recall Curve (Logistic Regression) ---
# Column 1 of predict_proba is taken as the score for the second class
# (Survived == 1, assuming a 0/1 target — confirm against lr.classes_).
y_scores_lr = lr.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_scores_lr)
avg_precision = average_precision_score(y_test, y_scores_lr)

# Draw on explicit figure/axes handles, save, then release the figure
fig, ax = plt.subplots()
ax.plot(recall, precision, label=f"Avg Precision = {avg_precision:.2f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve - Logistic Regression")
ax.legend()
fig.savefig("results/precision_recall_curve_lr.png")
plt.close(fig)

# --- Model Comparison ---
# Accuracy of each fitted model on the held-out rows
acc_lr = lr.score(X_test, y_test)
acc_rf = rf.score(X_test, y_test)

for model_name, accuracy in (
    ("Logistic Regression", acc_lr),
    ("Random Forest", acc_rf),
):
    print(f"{model_name} Accuracy:", accuracy)

# Pick the verdict line; comparisons are evaluated in the same order as before
verdict = (
    "✅ Logistic Regression performed better."
    if acc_lr > acc_rf
    else "✅ Random Forest performed better."
    if acc_rf > acc_lr
    else "⚖️ Both models performed equally well."
)
print(verdict)


# NOTE: the confusion-matrix figures for both models
# (results/confusion_matrix_lr.png and results/confusion_matrix_rf.png)
# are already rendered and saved earlier in this cell, right after the
# predictions are made. The duplicate plotting code that used to live here
# re-created the identical figures and overwrote the same files, so it has
# been removed.



Logistic Regression Accuracy: 0.7972027972027972
Random Forest Accuracy: 0.7692307692307693
✅ Logistic Regression performed better.
