# Titanic Survival Prediction - Model Development

## Project Information
- **Algorithm**: Random Forest Classifier
- **Features Used**: Pclass, Sex, Age, Fare, Embarked
- **Model Persistence**: Joblib

---

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import joblib
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings.
warnings.simplefilter('ignore')

# Plot defaults: seaborn's white grid theme and a 10x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("✓ All libraries imported successfully")

## 2. Load Dataset

In [None]:
# Load the Titanic training data into `df`.
# Note: Download from https://www.kaggle.com/c/titanic/data
try:
    # Only the read can raise FileNotFoundError, so only it is guarded.
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("❌ Error: 'train.csv' not found. Please download from Kaggle.")
    print("URL: https://www.kaggle.com/c/titanic/data")
    # Every later cell needs `df`; stop here with the real cause instead of
    # letting the next cell fail with an unrelated NameError.
    raise
else:
    print("✓ Dataset loaded successfully")
    print(f"Shape: {df.shape}")
    print("\nFirst few rows:")
    display(df.head())

## 3. Exploratory Data Analysis

In [None]:
# Overview of the raw frame: schema, summary statistics, then a table of
# the columns that contain missing values.
print("Dataset Info:", "=" * 50, sep="\n")
df.info()

print("", "=" * 50, "Statistical Summary:", "=" * 50, sep="\n")
display(df.describe())

print("", "=" * 50, "Missing Values:", "=" * 50, sep="\n")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
# One row per column, most-missing first; only columns with gaps are shown.
missing_df = pd.concat(
    [missing_data.rename('Missing Count'), missing_percent.rename('Percentage')],
    axis=1,
).sort_values('Missing Count', ascending=False)
print(missing_df.loc[missing_df['Missing Count'] > 0])

In [None]:
# Survival distribution: bar chart of counts next to a pie chart of shares.
outcome_labels = ['Did Not Survive', 'Survived']
outcome_colors = ['#FF6B6B', '#4ECDC4']

# value_counts() orders by frequency, not by class value. Reindex to [0, 1]
# so every count lines up with its label and color regardless of which
# class is larger (a class that is absent is reported as 0).
survival_counts = df['Survived'].value_counts().reindex([0, 1], fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
axes[0].bar(outcome_labels, survival_counts.values, color=outcome_colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Survival Distribution')
axes[0].grid(axis='y', alpha=0.3)

# Write each count just above its bar.
for i, v in enumerate(survival_counts.values):
    axes[0].text(i, v + 10, str(v), ha='center', fontweight='bold')

# Pie chart
axes[1].pie(survival_counts.values, labels=outcome_labels,
            autopct='%1.1f%%', startangle=90, colors=outcome_colors)
axes[1].set_title('Survival Percentage')

plt.tight_layout()
plt.show()

print(f"Survival Rate: {df['Survived'].mean()*100:.2f}%")

## 4. Feature Selection and Data Preprocessing

### Selected Features:
1. **Pclass** - Passenger Class (1st, 2nd, 3rd)
2. **Sex** - Gender
3. **Age** - Age in years
4. **Fare** - Ticket fare
5. **Embarked** - Port of Embarkation (C, Q, S)

In [None]:
# Label column and the model inputs.
TARGET = 'Survived'
FEATURES = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

# Independent copy restricted to the modeling columns, so the preprocessing
# steps that follow operate on `data` rather than on `df`.
data = df[[*FEATURES, TARGET]].copy()

print(f"Selected Features: {FEATURES}")
print(f"Target Variable: {TARGET}")
print(f"\nShape before preprocessing: {data.shape}")
print("\nMissing values in selected features:", data.isnull().sum(), sep="\n")

### 4.1 Handle Missing Values

In [None]:
# Impute missing values in the working frame `data`.
print("Handling missing values...\n")

# Age: median within each (Pclass, Sex) group. Count the gaps on `data`
# before filling so the message reports what this cell actually filled.
age_missing = data['Age'].isnull().sum()
data['Age'] = data.groupby(['Pclass', 'Sex'])['Age'].transform(
    lambda ages: ages.fillna(ages.median())
)
# A group whose ages are all missing has a NaN median; fall back to the
# overall median so no gap survives the grouped fill.
data['Age'] = data['Age'].fillna(data['Age'].median())
print(f"✓ Age: Filled {age_missing} missing values with grouped median")

# Embarked: Fill with mode (most common port).
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify `data` when pandas Copy-on-Write is active.
embarked_mode = data['Embarked'].mode()[0]
embarked_missing = data['Embarked'].isnull().sum()
data['Embarked'] = data['Embarked'].fillna(embarked_mode)
print(f"✓ Embarked: Filled {embarked_missing} missing values with mode '{embarked_mode}'")

# Fare: Fill with median (if any missing)
fare_missing = data['Fare'].isnull().sum()
if fare_missing > 0:
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    print(f"✓ Fare: Filled {fare_missing} missing values with median")

print("\n✓ Missing values handled successfully")
print(f"Remaining missing values: {data.isnull().sum().sum()}")

### 4.2 Encode Categorical Variables

In [None]:
# Turn the two string-valued features into integer codes.
print("Encoding categorical variables...\n")

# Sex -> binary indicator (male=1, female=0).
# NOTE(review): re-running this cell maps the already-encoded values to NaN.
data['Sex'] = data['Sex'].map(dict(male=1, female=0))
print("✓ Sex: Encoded (male=1, female=0)")

# Embarked -> integer codes from a LabelEncoder; the fitted encoder is kept
# so the same codes can be applied to new inputs later.
le_embarked = LabelEncoder()
data['Embarked'] = le_embarked.fit_transform(data['Embarked'])
embarked_mapping = {
    port: code
    for port, code in zip(
        le_embarked.classes_, le_embarked.transform(le_embarked.classes_)
    )
}
print(f"✓ Embarked: Encoded {embarked_mapping}")

print("\n✓ Categorical encoding completed")
display(data.head())

### 4.3 Feature Scaling

In [None]:
# Split the working frame into the feature matrix X and the label vector y.
X, y = data[FEATURES], data[TARGET]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nClass distribution:", y.value_counts(), sep="\n")
print(f"\nClass balance: {y.value_counts(normalize=True).to_dict()}")

In [None]:
# Standardize the continuous columns with StandardScaler; every other
# feature is carried over from X unchanged.
# NOTE(review): the scaler is fit on every row of X, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
numerical_features = ['Age', 'Fare']
scaler = StandardScaler().fit(X[numerical_features])

X_scaled = X.copy()
X_scaled[numerical_features] = scaler.transform(X[numerical_features])

print("✓ Feature scaling applied to numerical features")
print(f"Scaled features: {numerical_features}")
print("\nScaled data sample:")
display(X_scaled.head())

## 5. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("\nTraining set class distribution:", y_train.value_counts(), sep="\n")
print("\nTesting set class distribution:", y_test.value_counts(), sep="\n")

## 6. Model Training - Random Forest Classifier

In [None]:
# Build the Random Forest and fit it on the training partition.
print("Training Random Forest Classifier...\n")

# fit() returns the estimator itself, so construction and training chain
# into a single expression.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,  # fixed seed for repeatable forests
    n_jobs=-1,
).fit(X_train, y_train)

print("✓ Model training completed successfully")

## 7. Model Evaluation

In [None]:
# Score the fitted forest on both partitions.
y_train_pred, y_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)
# Column 1 of predict_proba is kept as the score used for the ROC analysis.
y_test_proba = rf_model.predict_proba(X_test)[:, 1]

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
accuracy_gap = abs(train_accuracy - test_accuracy)

print("=" * 70, "MODEL PERFORMANCE", "=" * 70, sep="\n")
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Difference:        {accuracy_gap:.4f}")

# Verdict: a gap under 5 points counts as good generalization; training
# accuracy more than 10 points above test accuracy is flagged as possible
# overfitting; anything else is reported as acceptable.
if accuracy_gap < 0.05:
    verdict = "\n✓ Model shows good generalization (low overfitting)"
elif train_accuracy > test_accuracy + 0.1:
    verdict = "\n⚠ Model may be overfitting"
else:
    verdict = "\n✓ Model performance is acceptable"
print(verdict)

In [None]:
# Text report of per-class metrics on the held-out rows, to 4 decimals.
print("", "=" * 70, "CLASSIFICATION REPORT (Test Set)", "=" * 70, sep="\n")
print(
    classification_report(
        y_test,
        y_test_pred,
        digits=4,
        target_names=['Did Not Survive', 'Survived'],
    )
)

In [None]:
# Confusion matrix of test-set predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_test_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar_kws={'label': 'Count'},
    xticklabels=['Did Not Survive', 'Survived'],
    yticklabels=['Did Not Survive', 'Survived'],
)
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.tight_layout()
plt.show()

# Flatten the matrix row by row into its four cells; this unpacking
# requires the matrix to be 2x2.
tn, fp, fn, tp = cm.ravel()
print(
    f"True Negatives:  {tn}",
    f"False Positives: {fp}",
    f"False Negatives: {fn}",
    f"True Positives:  {tp}",
    sep="\n",
)

In [None]:
# ROC analysis on the test set: curve points plus the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
roc_auc = roc_auc_score(y_test, y_test_proba)

plt.figure(figsize=(10, 6))
# Model curve first, then the diagonal as the chance-level reference.
plt.plot(fpr, tpr, color='#4ECDC4', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='#FF6B6B', lw=2, linestyle='--', label='Random Classifier')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=14, fontweight='bold')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.grid(alpha=0.3)
plt.legend(loc='lower right', fontsize=10)
plt.tight_layout()
plt.show()

print(f"\nAUC-ROC Score: {roc_auc:.4f}")

In [None]:
# Rank the features by the forest's importance scores, highest first.
feature_importance = pd.DataFrame(
    dict(Feature=FEATURES, Importance=rf_model.feature_importances_)
).sort_values('Importance', ascending=False)

print("", "=" * 70, "FEATURE IMPORTANCE", "=" * 70, sep="\n")
print(feature_importance.to_string(index=False))

# Horizontal bar chart, one bar per feature.
plt.figure(figsize=(10, 6))
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(FEATURES)))
bars = plt.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
plt.title('Feature Importance in Random Forest Model', fontsize=14, fontweight='bold')
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
# Flip the y axis so the first (highest-ranked) row is drawn at the top.
plt.gca().invert_yaxis()

# Print each score just to the right of the end of its bar.
for bar, importance in zip(bars, feature_importance['Importance']):
    plt.text(
        importance + 0.005,
        bar.get_y() + bar.get_height()/2,
        f'{importance:.4f}',
        va='center',
        fontsize=10,
    )

plt.tight_layout()
plt.show()

In [None]:
# 5-fold cross-validated accuracy.
# The folds are drawn from all of X_scaled / y, not only the training split.
print("", "=" * 70, "CROSS-VALIDATION RESULTS (5-Fold)", "=" * 70, sep="\n")

cv_scores = cross_val_score(rf_model, X_scaled, y, scoring='accuracy', cv=5)

print(f"Cross-validation scores: {cv_scores}")
# The spread is reported as two standard deviations around the mean.
print(f"\nMean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(
    f"Min CV Accuracy:  {cv_scores.min():.4f}",
    f"Max CV Accuracy:  {cv_scores.max():.4f}",
    sep="\n",
)

## 8. Save Model and Preprocessing Objects

In [None]:
# Persist the model together with everything needed to preprocess new
# inputs the same way, plus the headline metrics, in a single Joblib file.
model_filename = 'titanic_survival_model.pkl'

model_package = {
    'model': rf_model,
    'scaler': scaler,
    'features': FEATURES,
    'numerical_features': numerical_features,
    'embarked_encoder': le_embarked,
    'test_accuracy': test_accuracy,
    'train_accuracy': train_accuracy,
    'roc_auc': roc_auc
}
joblib.dump(model_package, model_filename)

print("=" * 70, "MODEL SAVED SUCCESSFULLY", "=" * 70, sep="\n")
print(f"✓ File: {model_filename}")
print("✓ Method: Joblib")
print("✓ Components saved:")
print(*(f"  - {key}" for key in model_package), sep="\n")

# Report the size of the written file in kilobytes.
import os
file_size = os.stat(model_filename).st_size / 1024  # KB
print(f"\n✓ File size: {file_size:.2f} KB")

## 9. Demonstrate Model Reloading and Prediction

In [None]:
# Read the package back from disk and pull out the pieces used below.
print("Loading saved model...\n")
loaded_model_package = joblib.load(model_filename)

loaded_model, loaded_scaler, loaded_features = (
    loaded_model_package[part] for part in ('model', 'scaler', 'features')
)

print(
    "✓ Model loaded successfully",
    f"✓ Model type: {type(loaded_model).__name__}",
    f"✓ Features: {loaded_features}",
    f"✓ Test accuracy from training: {loaded_model_package['test_accuracy']:.4f}",
    sep="\n",
)

In [None]:
# Verify the reloaded model reproduces the in-memory model's predictions
# without retraining.
print("\n" + "=" * 70)
print("VERIFICATION: Model Prediction Without Retraining")
print("=" * 70)

# Use test set to verify
verification_pred = loaded_model.predict(X_test)
verification_accuracy = accuracy_score(y_test, verification_pred)

print(f"\nOriginal test accuracy:  {test_accuracy:.4f}")
print(f"Reloaded model accuracy: {verification_accuracy:.4f}")

# Equal accuracy does not imply equal predictions (errors can move between
# rows), so compare the predicted labels element-wise against the ones the
# original model produced for the same rows.
if np.array_equal(verification_pred, y_test_pred):
    print("\n✓ SUCCESS: Model produces identical predictions after reloading")
else:
    print("\n⚠ WARNING: Model predictions differ after reloading")

## 10. Sample Predictions

In [None]:
# Function to make prediction for new passenger
def predict_survival(pclass, sex, age, fare, embarked):
    """
    Predict survival for a single passenger with the reloaded model.

    Reads the module-level `loaded_model_package`, `loaded_model`,
    `loaded_scaler` and `loaded_features` objects.

    Parameters:
    - pclass: int (1, 2, or 3)
    - sex: str ('male' or 'female'; case and surrounding spaces are ignored)
    - age: float
    - fare: float
    - embarked: str ('C', 'Q', or 'S'); a value the stored encoder does not
      recognise falls back to the encoder's first port (code 0)

    Returns:
    - prediction: predicted class label (0 or 1)
    - probability: array of class probabilities for this passenger; callers
      in this notebook read index 0 as death and index 1 as survival

    Raises:
    - ValueError: if `sex` is neither 'male' nor 'female'
    """
    # Encode sex with the mapping used during training. Reject anything else
    # rather than silently scoring a typo as female.
    sex_codes = {'male': 1, 'female': 0}
    sex_key = str(sex).strip().lower()
    if sex_key not in sex_codes:
        raise ValueError(f"sex must be 'male' or 'female', got {sex!r}")
    sex_encoded = sex_codes[sex_key]

    # Encode embarked with the encoder fitted during training.
    encoder = loaded_model_package['embarked_encoder']
    try:
        embarked_encoded = encoder.transform([str(embarked).strip().upper()])[0]
    except ValueError:
        # Unseen port label: keep the best-effort fallback to code 0, i.e.
        # the first entry of encoder.classes_ (a port, not a passenger class).
        embarked_encoded = 0

    # Build the single-row frame by column name, then order the columns the
    # way the model was trained, so the result does not depend on the
    # position of values in a hand-written list.
    row = {
        'Pclass': pclass,
        'Sex': sex_encoded,
        'Age': age,
        'Fare': fare,
        'Embarked': embarked_encoded,
    }
    features_df = pd.DataFrame([row], columns=loaded_features, dtype=float)

    # Scale numerical features with the scaler fitted during training.
    numerical = loaded_model_package['numerical_features']
    features_df[numerical] = loaded_scaler.transform(features_df[numerical])

    # Make prediction
    prediction = loaded_model.predict(features_df)[0]
    probability = loaded_model.predict_proba(features_df)[0]

    return prediction, probability

print("✓ Prediction function defined")

In [None]:
# Run the prediction helper on a few hand-written passenger profiles.
print("=" * 70, "SAMPLE PREDICTIONS", "=" * 70, sep="\n")

test_cases = [
    {"name": "Rich Young Woman (1st Class)", "pclass": 1, "sex": "female", "age": 25, "fare": 100, "embarked": "C"},
    {"name": "Poor Old Man (3rd Class)", "pclass": 3, "sex": "male", "age": 60, "fare": 8, "embarked": "S"},
    {"name": "Middle-aged Woman (2nd Class)", "pclass": 2, "sex": "female", "age": 35, "fare": 30, "embarked": "Q"},
    {"name": "Young Man (1st Class)", "pclass": 1, "sex": "male", "age": 30, "fare": 50, "embarked": "S"},
    {"name": "Child (3rd Class)", "pclass": 3, "sex": "female", "age": 5, "fare": 15, "embarked": "S"},
]

for i, case in enumerate(test_cases, start=1):
    # Pass the fields in the order of predict_survival's parameters.
    prediction, probability = predict_survival(
        *(case[field] for field in ('pclass', 'sex', 'age', 'fare', 'embarked'))
    )

    # ANSI escape codes: green for a predicted survivor, red otherwise.
    if prediction == 1:
        result, color = "✓ SURVIVED", "\033[92m"
    else:
        result, color = "✗ DID NOT SURVIVE", "\033[91m"
    reset = "\033[0m"

    print(
        f"\n{i}. {case['name']}",
        f"   Pclass: {case['pclass']}, Sex: {case['sex']}, Age: {case['age']}, Fare: ${case['fare']}, Embarked: {case['embarked']}",
        f"   Prediction: {color}{result}{reset}",
        f"   Probability: {probability[1]*100:.2f}% survival, {probability[0]*100:.2f}% death",
        sep="\n",
    )

print("", "=" * 70, sep="\n")

## 11. Model Summary

In [None]:
# Final recap of the configuration, data sizes, metrics and saved artifact,
# emitted as one multi-line print.
print(
    "\n" + "=" * 70,
    "MODEL SUMMARY",
    "=" * 70,
    "Algorithm:              Random Forest Classifier",
    f"Number of Trees:        {rf_model.n_estimators}",
    f"Max Depth:              {rf_model.max_depth}",
    f"Features Used:          {', '.join(FEATURES)}",
    f"Training Samples:       {len(X_train)}",
    f"Testing Samples:        {len(X_test)}",
    "\nPerformance Metrics:",
    f"  Training Accuracy:    {train_accuracy:.4f}",
    f"  Testing Accuracy:     {test_accuracy:.4f}",
    f"  Cross-Val Accuracy:   {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})",
    f"  AUC-ROC Score:        {roc_auc:.4f}",
    "\nModel Persistence:      Joblib",
    f"Model File:             {model_filename}",
    f"File Size:              {file_size:.2f} KB",
    "=" * 70,
    "\n✓ Model development completed successfully!",
    "✓ Ready for deployment in web application",
    sep="\n",
)