In [None]:
# ============================================================================
# TITANIC SURVIVAL PREDICTION SYSTEM - MODEL DEVELOPMENT
# Artificial Intelligence Course Project
# ============================================================================

# ============================================================================
# STEP 1: Import Libraries
# ============================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             ConfusionMatrixDisplay)
import joblib
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("TITANIC SURVIVAL PREDICTION SYSTEM - MODEL DEVELOPMENT")
print("="*70)
print("\n‚úÖ All libraries imported successfully!\n")

# ============================================================================
# STEP 2: Load the Dataset
# ============================================================================

print("="*70)
print("STEP 2: Loading Titanic Dataset")
print("="*70)

# Load the dataset from 'train.csv' in the current working directory.
# Download from: https://www.kaggle.com/c/titanic/data
# Or use seaborn's built-in dataset
df = pd.read_csv('train.csv')

print(f"\n‚úÖ Dataset loaded successfully!")
print(f"   Total passengers: {len(df)}")
print(f"   Total features: {df.shape[1]}")

# Display first few rows
print("\nüìä First 5 rows:")
print(df.head())

# Display dataset info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nüìã Dataset Information:")
df.info()

# ============================================================================
# STEP 3: Feature Selection (Select 5 from the 7 recommended)
# ============================================================================

print("\n" + "="*70)
print("STEP 3: Feature Selection")
print("="*70)

# Recommended features: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked.
# Five of them are used as model inputs (edit this list to try others):
#   Pclass   - ticket class (1st, 2nd, 3rd)
#   Sex      - gender
#   Age      - age in years
#   Fare     - passenger fare
#   Embarked - port of embarkation (C, Q, S)
selected_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

# Column the model learns to predict.
target = 'Survived'

# Keep only the chosen inputs plus the target; .copy() detaches the subset
# from `df` so later in-place edits do not touch the raw frame.
columns_to_keep = selected_features + [target]
df_subset = df[columns_to_keep].copy()

print(f"\n‚úÖ Selected 5 input features from the 7 recommended:")
for position, name in enumerate(selected_features, start=1):
    print(f"   {position}. {name}")
print(f"\n   Target variable: {target}")

print(f"\nüìä Subset shape: {df_subset.shape}")

# Class balance of the target, ordered by label (0 first, then 1).
print("\nüéØ Target Variable Distribution:")
survived_counts = df_subset[target].value_counts().sort_index()
print(survived_counts)

total_rows = len(df_subset)
n_died = survived_counts[0]
n_lived = survived_counts[1]
print(f"\n   Did Not Survive (0): {n_died} ({n_died/total_rows*100:.1f}%)")
print(f"   Survived (1): {n_lived} ({n_lived/total_rows*100:.1f}%)")

# ============================================================================
# STEP 4: Data Preprocessing
# ============================================================================

print("\n" + "="*70)
print("STEP 4: Data Preprocessing")
print("="*70)

# 4.1: Count missing entries per column once. The same counts feed the report
# and decide below which columns need imputation.
print("\nüîç Checking for missing values...")
missing_values = df_subset.isnull().sum()
print(missing_values)

# 4.2: Impute. Numeric columns get their median, the categorical port column
# gets its most frequent value. Statistics are taken over every row of
# df_subset as it stands at this point.
print("\nüîß Handling missing values...")

if missing_values['Age'] > 0:
    age_median = df_subset['Age'].median()
    df_subset['Age'] = df_subset['Age'].fillna(age_median)
    print(f"   ‚úì Age: Filled {missing_values['Age']} missing values with median ({age_median:.1f})")

if missing_values['Fare'] > 0:
    fare_median = df_subset['Fare'].median()
    df_subset['Fare'] = df_subset['Fare'].fillna(fare_median)
    print(f"   ‚úì Fare: Filled missing values with median")

if missing_values['Embarked'] > 0:
    # mode() returns a Series (ties are possible); take its first entry.
    most_common_port = df_subset['Embarked'].mode()[0]
    df_subset['Embarked'] = df_subset['Embarked'].fillna(most_common_port)
    print(f"   ‚úì Embarked: Filled {missing_values['Embarked']} missing values with mode ({most_common_port})")

print(f"\n‚úÖ All missing values handled. Remaining: {df_subset.isnull().sum().sum()}")

# 4.3: Turn the two text columns into integers.
print("\nüîÑ Encoding categorical variables...")

# Fixed binary coding for Sex. Any value other than these two keys maps to NaN.
sex_codes = {'male': 1, 'female': 0}
df_subset['Sex'] = df_subset['Sex'].map(sex_codes)
print("   ‚úì Sex encoded (male=1, female=0)")

# Embarked goes through a fitted LabelEncoder so the same mapping can be
# reapplied later from the saved artifact.
le_embarked = LabelEncoder()
df_subset['Embarked_Encoded'] = le_embarked.fit_transform(df_subset['Embarked'])
n_ports = df_subset['Embarked_Encoded'].nunique()
print(f"   ‚úì Embarked encoded into {n_ports} categories")
port_labels = le_embarked.classes_
port_codes = le_embarked.transform(port_labels)
print(f"      Mapping: {dict(zip(port_labels, port_codes))}")

# Persist the encoder; `os` is imported here because this is its first use
# and later steps rely on it being in scope.
import os
os.makedirs('model', exist_ok=True)
joblib.dump(le_embarked, 'model/embarked_encoder.pkl')
print("   ‚úì Embarked encoder saved to 'model/embarked_encoder.pkl'")

# The text column is redundant once its encoded twin exists.
df_subset = df_subset.drop(columns='Embarked')

# 4.4: Split the frame into the input matrix and the label vector.
y = df_subset['Survived']
X = df_subset.drop(columns='Survived')

print(f"\n‚úÖ Features (X): {X.shape}")
print(f"‚úÖ Target (y): {y.shape}")
print(f"\n   Final features: {list(X.columns)}")

# Show what the model will actually see.
print("\nüìä Preprocessed Data (first 5 rows):")
print(X.head())

# ============================================================================
# STEP 5: Exploratory Data Analysis
# ============================================================================

print("\n" + "="*70)
print("STEP 5: Exploratory Data Analysis")
print("="*70)

# Descriptive statistics for every column of the preprocessed subset.
print("\nüìä Statistical Summary:")
print(df_subset.describe())

# Correlation of each column with the target, strongest positive first.
print("\nüìà Correlation with Survival:")
correlation_matrix = df_subset.corr()
correlation = correlation_matrix['Survived'].sort_values(ascending=False)
print(correlation)

# --- Figure 1: per-feature histograms split by outcome ----------------------
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Feature Distributions by Survival Status',
             fontsize=16, fontweight='bold')

features_to_plot = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked_Encoded']

# Row masks are the same for every feature, so build them once.
lived_mask = df_subset['Survived'] == 1
died_mask = df_subset['Survived'] == 0

# axes.flat walks the 2x3 grid row by row; zip stops after the fifth feature,
# leaving the sixth panel unused.
for ax, feature in zip(axes.flat, features_to_plot):
    survived = df_subset[lived_mask][feature]
    not_survived = df_subset[died_mask][feature]

    ax.hist([not_survived, survived], bins=20, label=['Did Not Survive', 'Survived'],
            color=['red', 'green'], alpha=0.7, edgecolor='black')
    ax.set_xlabel(feature, fontweight='bold')
    ax.set_ylabel('Frequency', fontweight='bold')
    ax.set_title(f'{feature} Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Drop the unused sixth panel.
axes[1, 2].remove()

plt.tight_layout()
plt.show()

# --- Figure 2: mean survival rate per category ------------------------------
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Survival Rates by Key Features', fontsize=16, fontweight='bold')

pclass_survival = df_subset.groupby('Pclass')['Survived'].mean()
sex_survival = df_subset.groupby('Sex')['Survived'].mean()
embarked_survival = df_subset.groupby('Embarked_Encoded')['Survived'].mean()

# One row per panel: (x positions/labels, bar heights, colour, x label, title).
# groupby sorts its keys, so Sex comes out as 0 then 1; the hard-coded
# ['Female', 'Male'] labels assume both groups are present.
rate_panels = [
    (pclass_survival.index, pclass_survival.values,
     'steelblue', 'Passenger Class', 'Survival Rate by Class'),
    (['Female', 'Male'], sex_survival.values,
     'coral', 'Gender', 'Survival Rate by Gender'),
    (le_embarked.classes_, embarked_survival.values,
     'lightgreen', 'Port of Embarkation', 'Survival Rate by Embarkation Port'),
]

for ax, (categories, rates, bar_color, x_label, panel_title) in zip(axes, rate_panels):
    ax.bar(categories, rates, color=bar_color, edgecolor='black')
    ax.set_xlabel(x_label, fontweight='bold')
    ax.set_ylabel('Survival Rate', fontweight='bold')
    ax.set_title(panel_title)
    ax.set_ylim([0, 1])
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# ============================================================================
# STEP 6: Train-Test Split
# ============================================================================

print("\n" + "="*70)
print("STEP 6: Train-Test Split")
print("="*70)

# Hold out 20% of the rows for testing. stratify=y is passed so the split is
# stratified on the target; the fixed seed makes it reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\n‚úÖ Data split completed (80-20 with stratification)")
print(f"   Training samples: {len(X_train)}")
print(f"   Testing samples: {len(X_test)}")

# Report the class balance of the training portion, ordered by label.
print(f"\n   Train class distribution:")
train_dist = y_train.value_counts().sort_index()
n_train = len(y_train)
train_died = train_dist[0]
train_lived = train_dist[1]
print(f"      Did Not Survive: {train_died} ({train_died/n_train*100:.1f}%)")
print(f"      Survived: {train_lived} ({train_lived/n_train*100:.1f}%)")

# ============================================================================
# STEP 7: Feature Scaling
# ============================================================================

print("\n" + "="*70)
print("STEP 7: Feature Scaling")
print("="*70)

# Learn the scaling statistics from the training rows only, then apply that
# one fitted scaler to both partitions so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n‚úÖ Features scaled using StandardScaler")
print("   Note: Scaling improves model performance for distance-based algorithms")

# Persist the fitted scaler next to the other model artifacts.
scaler_path = 'model/scaler.pkl'
joblib.dump(scaler, scaler_path)
print("   ‚úì Scaler saved to 'model/scaler.pkl'")

# ============================================================================
# STEP 8: Model Training - Random Forest Classifier
# ============================================================================

print("\n" + "="*70)
print("STEP 8: Model Training - Random Forest Classifier")
print("="*70)

# Hyper-parameters gathered in one place so they are easy to tweak.
rf_settings = {
    'n_estimators': 100,       # number of trees
    'max_depth': 10,           # maximum depth of each tree
    'min_samples_split': 5,    # minimum samples required to split a node
    'random_state': 42,        # fixed seed for reproducible forests
    'n_jobs': -1,              # use all CPU cores
}
model = RandomForestClassifier(**rf_settings)

print("\n‚è≥ Training Random Forest Classifier...")
model.fit(X_train_scaled, y_train)
print("‚úÖ Model training completed!")

# Echo the key settings back from the fitted estimator.
print("\nüìã Model Parameters:")
print(f"   Algorithm: Random Forest Classifier")
print(f"   Number of trees: {model.n_estimators}")
print(f"   Max depth: {model.max_depth}")
print(f"   Random state: {model.random_state}")

# Pair every input column with its importance score, most important first.
# The model was fitted on the scaled array, whose column order matches X.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('Importance', ascending=False)

print("\nüìä Feature Importance:")
print(feature_importance.to_string(index=False))

# Horizontal bar chart of the same ranking.
bar_labels = feature_importance['Feature']
bar_lengths = feature_importance['Importance']

plt.figure(figsize=(10, 6))
plt.barh(bar_labels, bar_lengths, color='teal', edgecolor='black')
plt.xlabel('Importance', fontweight='bold', fontsize=12)
plt.ylabel('Feature', fontweight='bold', fontsize=12)
plt.title('Feature Importance in Random Forest Model', fontweight='bold', fontsize=14)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

# ============================================================================
# STEP 9: Model Evaluation
# ============================================================================

print("\n" + "="*70)
print("STEP 9: Model Evaluation")
print("="*70)

# Predict on both partitions so train and test scores can be compared.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# The same four scorers are applied to each partition, in this order:
# accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)
train_acc, train_prec, train_rec, train_f1 = [
    score(y_train, y_train_pred) for score in scorers
]
test_acc, test_prec, test_rec, test_f1 = [
    score(y_test, y_test_pred) for score in scorers
]

# Print one identically formatted table per partition.
metric_reports = (
    ("\nüìä TRAINING SET METRICS:", train_acc, train_prec, train_rec, train_f1),
    ("\nüìä TESTING SET METRICS:", test_acc, test_prec, test_rec, test_f1),
)
for heading, acc, prec, rec, f1 in metric_reports:
    print(heading)
    print("-" * 70)
    print(f"   Accuracy  : {acc:.4f} ({acc*100:.2f}%)")
    print(f"   Precision : {prec:.4f}")
    print(f"   Recall    : {rec:.4f}")
    print(f"   F1-Score  : {f1:.4f}")

# Per-class breakdown on the held-out rows.
class_labels = ['Did Not Survive', 'Survived']
print("\nüìã DETAILED CLASSIFICATION REPORT (Test Set):")
print("-" * 70)
print(classification_report(y_test, y_test_pred, target_names=class_labels))

# Raw confusion counts for the test set (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_test_pred)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
counts_ax, rates_ax = axes

# Left panel: absolute counts.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues', ax=counts_ax, values_format='d')
counts_ax.set_title('Confusion Matrix - Test Set', fontweight='bold', fontsize=14)
counts_ax.grid(False)

# Right panel: each row divided by its own total, so every cell is the
# fraction of that actual class receiving the given prediction.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype(float) / row_totals
disp_norm = ConfusionMatrixDisplay(confusion_matrix=cm_normalized,
                                   display_labels=class_labels)
disp_norm.plot(cmap='Greens', ax=rates_ax, values_format='.2f')
rates_ax.set_title('Confusion Matrix - Normalized', fontweight='bold', fontsize=14)
rates_ax.grid(False)

plt.tight_layout()
plt.show()

# Plain-language reading of the test-set scores.
interpretation = [
    f"   ‚Ä¢ The model correctly predicts survival with {test_acc*100:.1f}% accuracy",
    f"   ‚Ä¢ Precision of {test_prec:.2f} means {test_prec*100:.1f}% of predicted survivors actually survived",
    f"   ‚Ä¢ Recall of {test_rec:.2f} means {test_rec*100:.1f}% of actual survivors were identified",
    f"   ‚Ä¢ F1-Score of {test_f1:.2f} balances precision and recall",
]
print("\nüí° MODEL INTERPRETATION:")
for sentence in interpretation:
    print(sentence)

# ============================================================================
# STEP 10: Save the Model
# ============================================================================

print("\n" + "="*70)
print("STEP 10: Save the Trained Model")
print("="*70)

# Serialise the fitted estimator with joblib under the model/ directory.
model_filename = 'model/titanic_survival_model.pkl'
joblib.dump(model, model_filename)

# Size on disk, converted from bytes to kilobytes for the report.
model_size_kb = os.path.getsize(model_filename) / 1024

print(f"\n‚úÖ Model saved successfully!")
print(f"   Location: {model_filename}")
print(f"   Method: Joblib")
print(f"   File size: {model_size_kb:.2f} KB")

# ============================================================================
# STEP 11: Test Model Reload
# ============================================================================

print("\n" + "="*70)
print("STEP 11: Demonstrate Model Reload Without Retraining")
print("="*70)

print("\nüîÑ Testing model reload...")

# Reload every persisted artifact from disk.
# NOTE: joblib files are pickles -- only load artifacts this pipeline wrote.
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load('model/scaler.pkl')
loaded_encoder = joblib.load('model/embarked_encoder.pkl')

print("‚úÖ Model, scaler, and encoder reloaded successfully!")

# Make sample predictions
print("\nüìä Sample Predictions (First 5 Passengers in Test Set):")
print("-" * 70)

# Start from the raw (unscaled) test rows and push them through the RELOADED
# scaler and model. Reusing X_test_scaled would only exercise the in-memory
# scaler and leave the saved preprocessing artifacts unverified.
# Slicing (instead of indexing 0..4) also copes with fewer than five test rows.
sample_passengers = X_test.iloc[:5]
sample_actuals = y_test.iloc[:5]
sample_predictions = loaded_model.predict(
    loaded_scaler.transform(sample_passengers)
)

# Decode the integer port codes back to their original labels with the
# reloaded encoder, which proves that artifact round-trips as well.
sample_ports = loaded_encoder.inverse_transform(
    sample_passengers['Embarked_Encoded'].astype(int)
)

sample_rows = zip(sample_passengers.iterrows(), sample_actuals,
                  sample_predictions, sample_ports)
for number, ((_, passenger_data), actual, predicted, port) in enumerate(sample_rows, start=1):
    result = "‚úì CORRECT" if actual == predicted else "‚úó INCORRECT"

    print(f"\nPassenger {number}:")
    print(f"   Class: {int(passenger_data['Pclass'])}, Sex: {'Male' if passenger_data['Sex']==1 else 'Female'}, Age: {passenger_data['Age']:.1f}")
    print(f"   Fare: ${passenger_data['Fare']:.2f}, Embarked: {port}")
    print(f"   Actual: {'Survived' if actual==1 else 'Did Not Survive'}")
    print(f"   Predicted: {'Survived' if predicted==1 else 'Did Not Survive'}")
    print(f"   {result}")

# ============================================================================
# STEP 12: Save Model Configuration
# ============================================================================

print("\n" + "="*70)
print("STEP 12: Save Model Configuration")
print("="*70)

# Held-out scores recorded alongside the model for later reference.
test_metrics = {
    'test_accuracy': test_acc,
    'test_precision': test_prec,
    'test_recall': test_rec,
    'test_f1': test_f1,
}

# 'feature_names' is the column order of X (post-encoding), while
# 'selected_features' is the list of raw columns chosen before preprocessing.
model_config = {
    'feature_names': list(X.columns),
    'selected_features': selected_features,
    'model_type': 'RandomForestClassifier',
    'metrics': test_metrics,
}

config_path = 'model/model_config.pkl'
joblib.dump(model_config, config_path)
print("‚úÖ Model configuration saved to 'model/model_config.pkl'")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("\n\n" + "="*70)
print("MODEL DEVELOPMENT COMPLETED SUCCESSFULLY! ‚úÖ")
print("="*70)

# Artifacts written to the model/ directory by the earlier steps.
saved_files = (
    "   1. model/titanic_survival_model.pkl   - Trained Random Forest model",
    "   2. model/scaler.pkl                    - Feature scaler",
    "   3. model/embarked_encoder.pkl          - Embarked port encoder",
    "   4. model/model_config.pkl              - Model configuration",
)
print("\nüì¶ SAVED FILES:")
for entry in saved_files:
    print(entry)

# Headline test-set scores, formatted exactly as in the evaluation step.
performance_lines = (
    f"   Algorithm: Random Forest Classifier",
    f"   Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)",
    f"   Precision: {test_prec:.4f}",
    f"   Recall: {test_rec:.4f}",
    f"   F1-Score: {test_f1:.4f}",
)
print("\nüìä FINAL MODEL PERFORMANCE:")
for entry in performance_lines:
    print(entry)

# What remains to be done after this script.
next_steps = (
    "   1. Build web GUI (app.py + index.html)",
    "   2. Test the application locally",
    "   3. Upload to GitHub",
    "   4. Deploy to Render/PythonAnywhere/Streamlit Cloud",
)
print("\nüéØ NEXT STEPS:")
for entry in next_steps:
    print(entry)

print("\n" + "="*70)