In [2]:
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report
)

In [3]:
# Folder that holds the processed train/test CSV files.
PROCESSED_DIR = Path("../data/processed")

# Feature tables stay as DataFrames so their column names remain available
# to later cells.
x_train = pd.read_csv(PROCESSED_DIR / "X_train.csv")
x_test = pd.read_csv(PROCESSED_DIR / "X_test.csv")

# Target files are flattened into 1-D arrays.
y_train = pd.read_csv(PROCESSED_DIR / "y_train.csv").to_numpy().ravel()
y_test = pd.read_csv(PROCESSED_DIR / "y_test.csv").to_numpy().ravel()

# Cell output: (rows, columns) of the train and test feature tables.
(x_train.shape, x_test.shape)


((712, 8), (179, 8))

In [4]:
# Logistic-regression classifier: "lbfgs" solver requested explicitly and the
# iteration cap set to 1000.
model = LogisticRegression(solver="lbfgs", max_iter=1000)

In [5]:
# Train the classifier on the training features and labels; the fitted
# estimator is the value displayed by this cell.
model.fit(X=x_train, y=y_train)

LogisticRegression(max_iter=1000)

In [6]:
# predict -> hard class label (0 or 1) for every test row.
y_pred = model.predict(x_test)

# predict_proba -> one probability column per class; the column at index 1 is
# kept and read as the probability of survival.
proba_by_class = model.predict_proba(x_test)
y_prob = proba_by_class[:, 1]

In [7]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8044692737430168

In [8]:
# Counts of (true label, predicted label) pairs on the test set:
# rows follow the true labels, columns follow the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[98, 12],
       [23, 46]], dtype=int64)

In [9]:
# The report is a multi-line string, so it is printed rather than left as a
# bare expression (which would show the escaped repr).
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# Pair each fitted weight (first row of model.coef_) with the name of its
# training column, then order from the most negative to the most positive.
coefficients = pd.Series(data=model.coef_[0], index=x_train.columns)
coefficients = coefficients.sort_values()

# How to read the result:
#   * positive weight -> a higher feature value raises the predicted survival
#     probability
#   * negative weight -> a higher feature value lowers it
#   * larger magnitude -> stronger influence; comparing magnitudes across
#     features assumes they are on comparable scales (TODO: confirm the
#     preprocessing step scales them)
coefficients

Sex_male     -2.558590
Pclass       -1.090321
Age          -0.496372
Embarked_S   -0.382368
SibSp        -0.243041
Parch        -0.070582
Fare          0.105642
Embarked_Q    0.277650
dtype: float64

In [12]:
# Side-by-side view of the first 10 test rows: true label, predicted label and
# the survival probability stored in y_prob.
#
# Intuition this table is meant to build:
#   * wrong prediction with a probability far from 0.5 -> confident mistake,
#     an interesting error to inspect
#   * probability close to 0.5 -> the model is uncertain, even when the
#     predicted label turns out to be correct
pd.DataFrame(
    {
        "true": y_test,
        "predicted": y_pred,
        "probability": y_prob,
    }
).head(10)

Unnamed: 0,true,predicted,probability
0,0,0,0.070093
1,0,0,0.049754
2,1,0,0.157383
3,0,0,0.037141
4,1,1,0.669279
5,1,0,0.436402
6,1,1,0.744199
7,0,0,0.32882
8,0,0,0.35167
9,0,0,0.16398
