In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import joblib

# Load dataset
df = pd.read_csv('Titanic_train.csv')

# Data Preprocessing
# Separate the target from the features; Name, Ticket and Cabin are dropped.
# PassengerId is deliberately kept so the saved scaler/model keep the same
# feature layout the Streamlit app feeds them.
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
y = df['Survived']

# Train-test split FIRST, so that every statistic learned below (imputation
# values, scaler mean/std) comes from training rows only. Fitting them on the
# full frame leaks information from the test rows into preprocessing and makes
# the test metrics optimistic. Same random_state/stratify as before, so the
# same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_idx, test_idx = X_train_raw.index, X_test_raw.index

# Handle missing values (fill values are computed on the training rows,
# then applied to every row)
X["Embarked"] = X["Embarked"].fillna(X.loc[train_idx, "Embarked"].mode()[0])
X["Age"] = X["Age"].fillna(X.loc[train_idx, "Age"].median())
if "Fare" in X.columns:
    X["Fare"] = X["Fare"].fillna(X.loc[train_idx, "Fare"].median())

# One-hot encode the remaining categorical columns on the full frame so the
# train and test splits are guaranteed to share the same columns.
X = pd.get_dummies(X, drop_first=True)

# Fit the scaler on the training rows only. It is fitted on a DataFrame so the
# feature names and their order are stored on the scaler itself.
scaler = StandardScaler()
X_train = scaler.fit_transform(X.loc[train_idx])
X_test = scaler.transform(X.loc[test_idx])

# Scaled copy of the full feature frame, kept under its original name for any
# later cell that still refers to it.
X_scaled = scaler.transform(X)

# Train Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Save trained model and scaler
joblib.dump(model, 'logistic_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

## Train and Save the model

In [None]:
# Score the held-out split: hard class labels feed the threshold-based
# metrics, positive-class probabilities feed ROC-AUC.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Metrics computed from the predicted labels, in the order they are reported.
label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for metric_name, metric_fn in label_metrics:
    print(f"{metric_name}:", metric_fn(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Per-class breakdown followed by the raw confusion counts.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion)

Accuracy: 0.8044692737430168
Precision: 0.7833333333333333
Recall: 0.6811594202898551
F1 Score: 0.7286821705426356
ROC-AUC: 0.841106719367589

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       110
           1       0.78      0.68      0.73        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
 [[97 13]
 [22 47]]


## Creating the Streamlit app




In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m109.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m169.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import joblib

# Load saved model and scaler
# NOTE: joblib.load unpickles arbitrary objects; only load artifact files that
# were produced by the training cell of this notebook.
model = joblib.load('logistic_model.pkl')
scaler = joblib.load('scaler.pkl')

st.title('Titanic Survival Prediction')

# Collect user inputs
pclass = st.selectbox('Passenger Class (1 = 1st, 2 = 2nd, 3 = 3rd)', [1, 2, 3])
age = st.number_input('Age', min_value=0, max_value=100, value=30)
sibsp = st.number_input('Number of Siblings/Spouses Aboard', min_value=0, max_value=10, value=0)
parch = st.number_input('Number of Parents/Children Aboard', min_value=0, max_value=10, value=0)
fare = st.number_input('Fare', min_value=0.0, max_value=500.0, value=32.20)
sex = st.selectbox('Sex', ['male', 'female'])
embarked = st.selectbox('Embarked', ['S', 'C', 'Q'])

# Create a DataFrame with the input data, including all columns present during training.
# With drop_first=True the baseline categories ('female', 'C') are encoded as all zeros.
input_data = {
    'PassengerId': [0],  # Provisional value; replaced by a neutral value below
    'Pclass': [pclass],
    'Age': [age],
    'SibSp': [sibsp],
    'Parch': [parch],
    'Fare': [fare],
    'Sex_male': [1 if sex == 'male' else 0],
    'Embarked_Q': [1 if embarked == 'Q' else 0],
    'Embarked_S': [1 if embarked == 'S' else 0]
}

input_df = pd.DataFrame(input_data)

# Reorder columns to match the training data.
# The scaler was fitted on a DataFrame, so it carries the training column names
# and order itself; this avoids re-reading and re-encoding the training CSV on
# every rerun and removes the app's dependency on the training data file.
# If the scaler carries no names, fall back to the order of the dict above.
train_cols = list(getattr(scaler, 'feature_names_in_', input_df.columns))

input_df = input_df[train_cols]

# The model was trained with PassengerId as a feature, so a placeholder of 0
# would scale to an out-of-range value and shift the prediction. Using the
# scaler's stored mean makes the column scale to 0, so it contributes nothing.
if 'PassengerId' in train_cols and getattr(scaler, 'mean_', None) is not None:
    input_df['PassengerId'] = float(scaler.mean_[train_cols.index('PassengerId')])


# Apply scaler and predict
input_scaled = scaler.transform(input_df)
prediction = model.predict(input_scaled)[0]

# Show prediction
st.subheader('Prediction')
st.write('Survived' if prediction == 1 else 'Not Survived')

2025-10-26 11:53:05.868 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-10-26 11:53:05.877 Session state does not function when running a script without `streamlit run`


# Interview questions
1. Difference between Precision and Recall:

>Precision measures how many predicted positives are actually correct (TP / (TP + FP)).

>Recall measures how many actual positives are correctly identified (TP / (TP + FN)).

>Precision focuses on accuracy of positive predictions.

>Recall focuses on completeness of positive predictions.

>Use precision when false positives are costly, recall when false negatives are costly.

2. Cross-Validation and Its Importance:

>Cross-validation splits data into multiple train-test sets to evaluate model stability.

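>In K-fold, the dataset is divided into K folds; the model is trained on K−1 folds and tested on the remaining fold, repeated K times so each fold serves as the test set once.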

>It gives a more reliable estimate of model performance.

>Helps detect overfitting and guides model and hyperparameter selection toward models that generalize better.

>In binary classification, stratified K-fold keeps the class ratio the same in every fold, which gives a fairer, less biased evaluation when the classes are imbalanced.