# Titanic Survival Prediction Model - Logistic Regression

This notebook builds a machine learning model to predict whether a passenger survived the Titanic disaster.

## 1. Load and Explore the Data

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Fetch the Titanic passenger table from the public Data Science Dojo mirror.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

print("Dataset shape:", df.shape)

# Each overview section is a heading plus a callable that is evaluated only
# when its turn comes, so the output order matches a plain sequence of prints
# (df.info() writes its report itself and returns None, which is printed too).
overview_sections = (
    ("\nFirst few rows:", df.head),
    ("\nDataset Info:", df.info),
    ("\nMissing Values:", lambda: df.isnull().sum()),
    ("\nStatistical Summary:", df.describe),
)
for heading, produce in overview_sections:
    print(heading)
    print(produce())

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Model inputs (5 features as required) and the label column.
features_to_use = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
target = 'Survived'

# Work on an independent copy so the raw frame `df` stays untouched.
selected_columns = [*features_to_use, target]
df_working = df.loc[:, selected_columns].copy()

print("Working dataset shape:", df_working.shape)
print("\nMissing values in selected features:")
print(df_working.isna().sum())

In [None]:
# Handle missing values.
#
# The fills are applied through DataFrame.fillna with a per-column mapping and
# the result is assigned back. The previous form,
# `df_working['Age'].fillna(..., inplace=True)`, mutates a temporary column
# selection: it emits a FutureWarning on pandas 2.x (hidden here because
# warnings are globally ignored) and is a silent no-op under Copy-on-Write,
# which would leave the NaNs in place and let the dropna below discard rows.
median_fill = {
    'Age': df_working['Age'].median(),    # Age: fill with median
    'Fare': df_working['Fare'].median(),  # Fare: fill with median
}

# Fill first, then drop any rows that still contain a missing value.
# Re-assigning (instead of inplace=True) keeps the cell safe to re-run.
df_working = df_working.fillna(median_fill).dropna()

print("Missing values after handling:")
print(df_working.isnull().sum())
print("\nDataset shape after cleaning:", df_working.shape)

In [None]:
# Encode categorical variable (Sex)
# Male = 1, Female = 0
sex_codes = {'male': 1, 'female': 0}

# Only map while the column still holds the raw labels. Running `.map` a second
# time on the already-encoded 0/1 values would match no key and silently turn
# the whole column into NaN, so the guard makes this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df_working['Sex']):
    df_working['Sex'] = df_working['Sex'].map(sex_codes)

# Any label outside `sex_codes` maps to NaN; fail loudly instead of training on it.
assert df_working['Sex'].notna().all(), "Sex column contains values other than 'male'/'female'"

print("Data after encoding categorical variables:")
print(df_working.head())
print("\nData types:")
print(df_working.dtypes)

In [None]:
# Split the working frame into the feature matrix (X) and the label vector (y).
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
X = df_working.loc[:, feature_columns]
y = df_working.loc[:, target]

print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Class balance, first as raw counts and then as fractions of the total.
for heading, normalize in (("\nTarget distribution:", False), ("\nTarget proportions:", True)):
    print(heading)
    print(y.value_counts(normalize=normalize))

## 3. Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

for heading, labels in (
    ("\nTraining set target distribution:", y_train),
    ("\nTesting set target distribution:", y_test),
):
    print(heading)
    print(labels.value_counts())

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# partition only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled training data shape:", X_train_scaled.shape)
print("Scaled testing data shape:", X_test_scaled.shape)

preview_rows = X_train_scaled[:5]
print("\nScaled data sample (first 5 rows):")
print(preview_rows)

## 4. Build and Train a Machine Learning Model

In [None]:
# Fit a logistic regression classifier on the standardized training data.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

print("Model trained successfully!")
print("\nModel coefficients:")

# One weight per input column, listed in the column order used for training.
feature_labels = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
weights = model.coef_[0]
for position, label in enumerate(feature_labels):
    print(f"{label}: {weights[position]:.4f}")

bias = model.intercept_[0]
print(f"\nIntercept: {bias:.4f}")

## 5. Evaluate Model Performance

In [None]:
# Predict labels for both partitions, then score each against its ground truth.
y_train_pred = model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)

y_test_pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)

banner = "=" * 50
print(banner)
print("MODEL PERFORMANCE EVALUATION")
print(banner)
print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

In [None]:
# Per-class precision / recall / F1 on the held-out test partition.
class_labels = ['Did Not Survive', 'Survived']
report_text = classification_report(y_test, y_test_pred, target_names=class_labels)

print("\nClassification Report (Test Set):")
print("=" * 50)
print(report_text)

In [None]:
# Confusion matrix for the test partition (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_test_pred)

print("\nConfusion Matrix (Test Set):")
print("=" * 50)
print(cm)

# Flattening the 2x2 matrix row by row yields [0,0], [0,1], [1,0], [1,1].
true_neg, false_pos, false_neg, true_pos = cm.ravel()

print("\nInterpretation:")
print(f"True Negatives (Did not survive, predicted did not survive): {true_neg}")
print(f"False Positives (Did not survive, predicted survived): {false_pos}")
print(f"False Negatives (Survived, predicted did not survive): {false_neg}")
print(f"True Positives (Survived, predicted survived): {true_pos}")

## 6. Save the Trained Model

In [None]:
# Persist both fitted objects with joblib. The scaler is saved next to the
# model because new data must be standardized with the same statistics before
# it can be scored.
model_path = 'titanic_survival_model.pkl'
scaler_path = 'scaler.pkl'

for label, artifact, destination in (
    ("Model", model, model_path),
    ("Scaler", scaler, scaler_path),
):
    joblib.dump(artifact, destination)
    print(f"{label} saved successfully to {destination}")

## 7. Verify Model Reloading and Prediction

In [None]:
# Read both artifacts back from disk to show they work without retraining.
# NOTE: joblib files are pickle-based; only load files you created yourself.
loaded_scaler = joblib.load(scaler_path)
loaded_model = joblib.load(model_path)

print("Models reloaded successfully!")

# Score the untouched test partition through the reloaded scaler + model pair.
X_test_rescaled = loaded_scaler.transform(X_test)
y_test_pred_reloaded = loaded_model.predict(X_test_rescaled)
reloaded_accuracy = accuracy_score(y_test, y_test_pred_reloaded)

print(f"\nAccuracy with reloaded model: {reloaded_accuracy:.4f}")
print("Model reloading verified successfully!")

## 8. Make Predictions on New Passenger Data

In [None]:
# Example: score a few hand-written passengers with the reloaded artifacts.
# Column order: Pclass, Sex (1=Male, 0=Female), Age, SibSp (siblings/spouses aboard), Fare
passenger_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
passenger_rows = [
    (1, 0, 35, 1, 512.3292),  # 1st class female, 35, SibSp=1, expensive ticket
    (3, 1, 25, 0, 7.75),      # 3rd class male, 25, SibSp=0, cheap ticket
    (2, 1, 40, 2, 21.0),      # 2nd class male, 40, SibSp=2, moderate fare
]
new_passengers = pd.DataFrame(passenger_rows, columns=passenger_columns)

# Standardize with the training-time scaler, then predict labels and probabilities.
new_passengers_scaled = loaded_scaler.transform(new_passengers)
predictions = loaded_model.predict(new_passengers_scaled)
probabilities = loaded_model.predict_proba(new_passengers_scaled)

print("\nPredictions for New Passengers:")
print("="*70)

scored = zip(new_passengers.iterrows(), predictions, probabilities)
for number, ((_, row), label, class_probs) in enumerate(scored, start=1):
    if label == 1:
        survival_status = "SURVIVED"
    else:
        survival_status = "DID NOT SURVIVE"
    # The predicted label (0 or 1) doubles as the column index of its own probability.
    survival_prob = class_probs[label] * 100
    print(f"\nPassenger {number}:")
    print(f"  Class: {int(row['Pclass'])}, Sex: {'Male' if row['Sex']==1 else 'Female'}, Age: {row['Age']}, Siblings: {int(row['SibSp'])}, Fare: {row['Fare']:.2f}")
    print(f"  Prediction: {survival_status} (Confidence: {survival_prob:.2f}%)")

# Titanic Survival Prediction System
## Model Development
**Name:** Onafowokan Testament

**Matric:** 21cg029905

**Algorithm:** Logistic Regression

**Features Used:** Pclass, Sex, Age, Fare, Embarked

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

print('All libraries imported successfully!')

## 2. Load the Titanic Dataset

In [None]:
# Fetch the Titanic passenger table from the public Data Science Dojo mirror.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

print('Dataset shape:', df.shape)

# Heading + lazily evaluated producer for each overview section. df.info()
# writes its own report and returns None, which is then printed as well.
overview_sections = (
    ('\nFirst few rows:', df.head),
    ('\nDataset Info:', df.info),
    ('\nMissing Values:', lambda: df.isnull().sum()),
)
for heading, produce in overview_sections:
    print(heading)
    print(produce())

## 3. Data Preprocessing

In [None]:
# Pick the model inputs and the label; both are copied so that later edits
# never touch the raw frame `df`.
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
X = df.loc[:, features].copy()
y = df.loc[:, 'Survived'].copy()

print('Selected Features:', features)
print('Target Variable: Survived')
print('\nDataset shape - X:', X.shape, 'y:', y.shape)

### 3a. Handle Missing Values

In [None]:
# Check for missing values
print('Missing values before handling:')
print(X.isnull().sum())

# Handle missing values.
#
# One replacement value per column, applied through DataFrame.fillna and
# assigned back. The previous `X['Age'].fillna(..., inplace=True)` form mutates
# a temporary column selection: it emits a FutureWarning on pandas 2.x (hidden
# here because warnings are globally ignored) and is a silent no-op under
# Copy-on-Write, which would leave NaNs in the features and make the scaler /
# model cells fail or misbehave.
fill_values = {
    'Age': X['Age'].median(),            # Age: fill with median
    'Embarked': X['Embarked'].mode()[0], # Embarked: fill with mode (most frequent port)
    'Fare': X['Fare'].median(),          # Fare: fill with median
}
X = X.fillna(fill_values)

print('\nMissing values after handling:')
print(X.isnull().sum())

### 3b & 3c. Encode Categorical Variables

In [None]:
# Build the encoded frame from X without modifying X itself.
# Sex becomes a 0/1 flag (Male=1, Female=0).
sex_flag = X['Sex'].eq('male').astype(int)

# Embarked is one-hot encoded; drop_first removes the first category's column,
# which then acts as the implicit baseline.
X_encoded = pd.get_dummies(
    X.assign(Sex=sex_flag),
    columns=['Embarked'],
    drop_first=True,
)

print('Features after encoding:')
print(X_encoded.head())
print('\nFeature columns:', list(X_encoded.columns))

### 3d. Feature Scaling

In [None]:
# Feature scaling without test-set leakage.
#
# Fitting the scaler on every row lets the held-out test rows influence the
# mean/std used to standardize the training data. To avoid that, the scaler is
# fit only on the rows that the train/test split cell will assign to training.
# train_test_split's row assignment depends only on the number of samples,
# `stratify` and `random_state`, so using the SAME arguments as the split cell
# (test_size=0.2, random_state=42, stratify=y) selects the same training rows.
# Keep these arguments in sync with that cell.
row_positions = np.arange(len(X_encoded))
train_rows, _ = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)

# Initialize the scaler and learn its statistics from the training rows only
scaler = StandardScaler()
scaler.fit(X_encoded.iloc[train_rows])

# Apply the training statistics to every row; keep column names and the row
# index so X_scaled stays aligned with y.
X_scaled = pd.DataFrame(
    scaler.transform(X_encoded),
    columns=X_encoded.columns,
    index=X_encoded.index,
)

print('Features after scaling:')
print(X_scaled.head())
# Over the full dataset these are close to (not exactly) 0 and 1, because the
# statistics come from the training rows only.
print('\nScaling statistics:')
print('Mean of scaled features:', X_scaled.mean().round(4).values)
print('Std of scaled features:', X_scaled.std().round(4).values)

## 4. Train-Test Split

In [None]:
# 80/20 hold-out split, stratified on the label and seeded for reproducibility.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)

print('Training set size:', X_train.shape)
print('Testing set size:', X_test.shape)

train_class_counts = y_train.value_counts()
print('\nClass distribution in training set:')
print(train_class_counts)

## 5. Train Logistic Regression Model

In [None]:
# Build the classifier and fit it on the training partition in one step.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

print('Logistic Regression model trained successfully!')

# Learned parameters: one weight per feature column plus the intercept term.
learned_weights = model.coef_
learned_bias = model.intercept_
print('Model coefficients:', learned_weights)
print('Model intercept:', learned_bias)

## 6. Model Evaluation

In [None]:
# Predict on each partition and score it against the matching ground truth.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

rule = '=' * 60
class_labels = ['Did Not Survive', 'Survived']

# Headline accuracy figures
print(rule)
print('MODEL PERFORMANCE')
print(rule)
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Testing Accuracy: {test_accuracy:.4f}')

# Per-class precision / recall / F1 on the test partition
print('\n' + rule)
print('CLASSIFICATION REPORT (Test Set)')
print(rule)
print(classification_report(y_test, y_test_pred, target_names=class_labels))

# Raw counts of actual vs. predicted classes on the test partition
print('\n' + rule)
print('CONFUSION MATRIX (Test Set)')
print(rule)
print(confusion_matrix(y_test, y_test_pred))

## 7. Save the Model

In [None]:
# Persist the fitted model and the scaler with joblib. The scaler is needed
# later to preprocess new data the same way as the training data.
artifacts = {
    'Model': (model, 'titanic_survival_model.pkl'),
    'Scaler': (scaler, 'scaler.pkl'),
}
for label, (fitted_object, filename) in artifacts.items():
    joblib.dump(fitted_object, filename)
    print(f'✓ {label} saved as: {filename}')

## 8. Load and Test the Saved Model

In [None]:
# Read both artifacts back from disk.
# NOTE: joblib files are pickle-based; only load files you created yourself.
loaded_scaler = joblib.load('scaler.pkl')
loaded_model = joblib.load('titanic_survival_model.pkl')

for label in ('Model', 'Scaler'):
    print(f'✓ {label} loaded successfully!')

# X_test is already standardized, so it can be scored by the reloaded model directly.
y_pred_loaded = loaded_model.predict(X_test)
loaded_model_accuracy = accuracy_score(y_test, y_pred_loaded)

# The reloaded model should reproduce the in-memory model's test accuracy exactly.
accuracies_match = loaded_model_accuracy == test_accuracy
print(f'\nLoaded Model Test Accuracy: {loaded_model_accuracy:.4f}')
print(f'Original Model Test Accuracy: {test_accuracy:.4f}')
print(f'Match: {accuracies_match}')

## 9. Example Prediction

In [None]:
# Example: Make a prediction with a new passenger
# Features: Pclass, Sex, Age, Fare, Embarked
new_passenger = pd.DataFrame({
    'Pclass': [3],
    'Sex': ['male'],
    'Age': [25],
    'Fare': [7.75],
    'Embarked': ['S']
})

# Apply the same preprocessing as the training data.
new_passenger_encoded = new_passenger.drop(columns=['Embarked'])
new_passenger_encoded['Sex'] = (new_passenger['Sex'] == 'male').astype(int)

# Rebuild the Embarked indicator columns from the TRAINING column names.
# Calling pd.get_dummies(..., drop_first=True) on a single row drops that row's
# only category, and zero-filling the missing columns afterwards encodes every
# passenger as the baseline port regardless of the value given (an 'S'
# passenger ended up with Embarked_S = 0). Comparing the raw value against the
# port named in each training column yields the correct indicator; a passenger
# from the baseline port correctly gets all zeros.
embarked_prefix = 'Embarked_'
for col in X_encoded.columns:
    if col.startswith(embarked_prefix):
        port = col[len(embarked_prefix):]
        new_passenger_encoded[col] = (new_passenger['Embarked'] == port).astype(int)

# Same columns, same order as the frame the scaler was fitted on
new_passenger_encoded = new_passenger_encoded[X_encoded.columns]

# Scale the features
new_passenger_scaled = loaded_scaler.transform(new_passenger_encoded)

# Make prediction
prediction = loaded_model.predict(new_passenger_scaled)[0]
probability = loaded_model.predict_proba(new_passenger_scaled)[0]

print('='*60)
print('EXAMPLE PREDICTION')
print('='*60)
print('Passenger Details:')
print(f'  Pclass: {new_passenger["Pclass"].values[0]}')
print(f'  Sex: {new_passenger["Sex"].values[0]}')
print(f'  Age: {new_passenger["Age"].values[0]}')
print(f'  Fare: {new_passenger["Fare"].values[0]}')
print(f'  Embarked: {new_passenger["Embarked"].values[0]}')
print('\nPrediction Result:')
result = 'SURVIVED' if prediction == 1 else 'DID NOT SURVIVE'
print(f'  Predicted: {result}')
# predict_proba columns follow the model's class order: index 0 = class 0, index 1 = class 1
print(f'  Confidence (Did Not Survive): {probability[0]:.4f}')
print(f'  Confidence (Survived): {probability[1]:.4f}')