# Titanic - Machine Learning from Disaster

## Overview
This notebook contains a comprehensive analysis of the Titanic dataset, implementing various machine learning techniques to predict passenger survival. We'll follow a structured approach to solve this problem, going through data exploration, preprocessing, feature engineering, model selection, and optimization.

## Table of Contents
1. [Data Exploration and Visualization](#1.-Data-Exploration-and-Visualization)
2. [Data Cleaning and Preprocessing](#2.-Data-Cleaning-and-Preprocessing)
3. [Feature Engineering](#3.-Feature-Engineering)
4. [Model Selection and Training](#4.-Model-Selection-and-Training)
5. [Model Optimization](#5.-Model-Optimization)
6. [Testing and Submission](#6.-Testing-and-Submission)

## Setup
First, let's import all necessary libraries and set up our environment.

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

# Settings
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')
%matplotlib inline

# Set random seed for reproducibility
np.random.seed(42)

# 1. Data Exploration and Visualization

In this section, we'll:
1. Load the dataset
2. Analyze basic statistics
3. Visualize relationships between features

In [None]:
# Read the raw CSV files for the labelled (train) and unlabelled (test) splits
train_df = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')
test_df = pd.read_csv(filepath_or_buffer='../data/raw/test.csv')

# Report the (rows, columns) shape of each split
train_shape, test_shape = train_df.shape, test_df.shape
print("Training set shape:", train_shape)
print("Test set shape:", test_shape)

# Preview the first rows; as the last expression it is rendered as cell output
train_df.head()

In [None]:
# Basic information about the dataset
# info() writes its summary straight to stdout, so it is called as a
# statement rather than printed.
print("Dataset Info:")
train_df.info()

# describe() is the last expression of the cell, so the notebook renders the
# returned statistics table as the cell output.
print("\nBasic statistics:")
train_df.describe()

In [None]:
# Count the nulls per column once, then express them as a share of all rows
null_counts = train_df.isnull().sum()
total_rows = len(train_df)
missing_values = pd.DataFrame({
    'Missing Values': null_counts,
    'Percentage': (null_counts / total_rows) * 100
})
print("Missing Values Analysis:")
missing_values

### Visualization of Key Features

In [None]:
# Mean of 'Survived' per 'Sex' category, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Sex', y='Survived', data=train_df, ax=ax)
ax.set_title('Survival Rate by Gender')
plt.show()

In [None]:
# Mean of 'Survived' per 'Pclass' value, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Pclass', y='Survived', data=train_df, ax=ax)
ax.set_title('Survival Rate by Passenger Class')
plt.show()

In [None]:
# Stacked 30-bin histogram of 'Age', coloured by the 'Survived' label
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=train_df, x='Age', hue='Survived', multiple="stack", bins=30, ax=ax)
ax.set_title('Age Distribution by Survival Status')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only
numeric_frame = train_df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
correlation_matrix = numeric_frame.corr()

# Annotated heatmap with the colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

# 2. Data Cleaning and Preprocessing

In this section, we'll:
1. Handle missing values
2. Encode categorical variables
3. Scale numerical features
4. Split the dataset

In [None]:
def preprocess_data(df, is_training=True):
    """Return a cleaned, numerically encoded copy of a Titanic passenger frame.

    Steps, in order:
      1. fill missing 'Age' and 'Fare' with that column's median and missing
         'Embarked' with its most frequent value;
      2. map 'Sex' to 0/1 and 'Embarked' to 0/1/2;
      3. drop 'Name', 'Ticket', 'Cabin' and 'PassengerId'.

    The fill values are computed from the frame passed in, so each call
    imputes with its own statistics.

    Args:
        df: Raw passenger DataFrame. It is not modified.
        is_training: Currently unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # Create a copy of the dataframe
    df_processed = df.copy()

    # Handle missing values.
    # The filled column is assigned back rather than calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # acts on a temporary object and does not update the frame under pandas
    # Copy-on-Write.
    # Age: Fill with median age
    df_processed['Age'] = df_processed['Age'].fillna(df_processed['Age'].median())

    # Embarked: Fill with most common value
    df_processed['Embarked'] = df_processed['Embarked'].fillna(
        df_processed['Embarked'].mode()[0]
    )

    # Fare: Fill with median fare
    df_processed['Fare'] = df_processed['Fare'].fillna(df_processed['Fare'].median())

    # Encode categorical variables
    # Sex: Convert to numeric
    df_processed['Sex'] = df_processed['Sex'].map({'male': 0, 'female': 1})

    # Embarked: Convert to numeric
    embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
    df_processed['Embarked'] = df_processed['Embarked'].map(embarked_mapping)

    # Drop unnecessary columns
    columns_to_drop = ['Name', 'Ticket', 'Cabin', 'PassengerId']
    df_processed = df_processed.drop(columns=columns_to_drop)

    return df_processed

# Preprocess training and test data
train_processed = preprocess_data(train_df)
test_processed = preprocess_data(test_df, is_training=False)

# Split features and target for training data
X = train_processed.drop('Survived', axis=1)
y = train_processed['Survived']

# Split into training and validation sets BEFORE fitting the scaler, so the
# validation rows do not contribute to the scaling statistics (fitting on all
# of X would leak validation data into training). X itself stays unscaled.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies so the column assignments below cannot touch X
X_train = X_train.copy()
X_val = X_val.copy()

# Scale numerical features: fit on the training split only, then apply the
# same transformation to the validation and test data
scaler = StandardScaler()
numerical_features = ['Age', 'Fare']
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
test_processed[numerical_features] = scaler.transform(test_processed[numerical_features])

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

# 3. Feature Engineering

In this section, we'll create new features and analyze their importance.

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` with four derived passenger features added.

    Added columns:
      * 'FamilySize'    - 'SibSp' + 'Parch' + 1 (the passenger themself).
      * 'IsAlone'       - 1 when 'FamilySize' is 1, otherwise 0.
      * 'FarePerPerson' - 'Fare' divided by 'FamilySize'.
      * 'AgeGroup'      - 'Age' binned into [0, 12], (12, 18], (18, 35],
                          (35, 50], (50, 100], labelled 0-4. Ages outside
                          0-100 become NaN.

    The age bins are expressed in years, so 'Age' must hold raw (unscaled)
    ages for 'AgeGroup' to be meaningful.

    Args:
        df: DataFrame with 'SibSp', 'Parch', 'Fare' and 'Age' columns. It is
            not modified.

    Returns:
        A new DataFrame containing the original columns plus the four above.
    """
    df_engineered = df.copy()

    # Create family size feature
    df_engineered['FamilySize'] = df_engineered['SibSp'] + df_engineered['Parch'] + 1

    # Create is_alone feature
    df_engineered['IsAlone'] = (df_engineered['FamilySize'] == 1).astype(int)

    # Create fare per person feature (FamilySize is at least 1 for
    # non-negative SibSp/Parch, so this does not divide by zero)
    df_engineered['FarePerPerson'] = df_engineered['Fare'] / df_engineered['FamilySize']

    # Create age groups; include_lowest keeps an age of exactly 0 in the first
    # bin instead of turning it into NaN
    df_engineered['AgeGroup'] = pd.cut(df_engineered['Age'],
                                       bins=[0, 12, 18, 35, 50, 100],
                                       labels=[0, 1, 2, 3, 4],
                                       include_lowest=True)

    return df_engineered

# Apply feature engineering to the UNSCALED values. The age bins and the
# fare-per-person ratio are only meaningful on raw ages and fares, so the
# features are built from the unscaled preprocessed frames, restricted to the
# rows of each split, and scaling is applied afterwards.
raw_features = train_processed.drop('Survived', axis=1)
X_train_engineered = engineer_features(raw_features.loc[X_train.index])
X_val_engineered = engineer_features(raw_features.loc[X_val.index])
test_engineered = engineer_features(preprocess_data(test_df, is_training=False))

# Scale the continuous features; the scaler is fit on the training split only
# and then reused for the validation and test data
new_numerical_features = ['FamilySize', 'FarePerPerson']
features_to_scale = ['Age', 'Fare'] + new_numerical_features
scaler_new = StandardScaler()
X_train_engineered[features_to_scale] = scaler_new.fit_transform(X_train_engineered[features_to_scale])
X_val_engineered[features_to_scale] = scaler_new.transform(X_val_engineered[features_to_scale])
test_engineered[features_to_scale] = scaler_new.transform(test_engineered[features_to_scale])

# Feature importance analysis using Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_engineered, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train_engineered.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance Analysis')
plt.show()

# 4. Model Selection and Training

We'll train and evaluate three different models:
1. Logistic Regression
2. Random Forest
3. Support Vector Machine

In [None]:
def evaluate_model(model, X_train, X_val, y_train, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place. ``predict_proba`` or ``decision_function`` is used for
            ROC-AUC when the estimator provides one of them.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels (binary).

    Returns:
        dict with keys 'Accuracy', 'Precision', 'Recall', 'F1-Score' and
        'ROC-AUC', each holding the corresponding validation metric.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_val)

    # ROC-AUC measures how well the model ranks samples, so it should be fed
    # continuous scores; hard 0/1 predictions collapse the curve to a single
    # threshold. Prefer the positive-class probability, then the decision
    # function, and only fall back to the labels if neither exists.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_val)
    else:
        y_score = y_pred

    # Calculate metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_score)

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    }

# Candidate classifiers, each seeded so repeated runs give the same results
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42)
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['SVM'] = SVC(random_state=42)

# Fit every candidate on the training split and collect its validation metrics
results = {}
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    scores = evaluate_model(model, X_train_engineered, X_val_engineered, y_train, y_val)
    results[name] = scores

# One column per model, one row per metric, rounded to three decimals
results_df = pd.DataFrame(results)
results_df = results_df.round(3)
print("\nModel Comparison:")
print(results_df)

# 5. Model Optimization

We'll perform hyperparameter tuning for each model using GridSearchCV.

In [None]:
# Hyperparameter search space for each candidate, keyed like `models`
logreg_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}
forest_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
svm_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto']
}
param_grids = {
    'Logistic Regression': logreg_grid,
    'Random Forest': forest_grid,
    'SVM': svm_grid
}

# Best estimator and its validation metrics, per model name
optimized_models = {}
optimized_results = {}

for name, model in models.items():
    print(f"\nOptimizing {name}...")

    # Exhaustive 5-fold cross-validated search, scored by accuracy, using all
    # available cores
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train_engineered, y_train)

    # Keep the winning configuration and score it on the validation split
    best_estimator = grid_search.best_estimator_
    optimized_models[name] = best_estimator
    optimized_results[name] = evaluate_model(
        best_estimator, X_train_engineered, X_val_engineered, y_train, y_val
    )

    print(f"Best parameters: {grid_search.best_params_}")

# One column per tuned model, one row per metric, rounded to three decimals
optimized_results_df = pd.DataFrame(optimized_results)
optimized_results_df = optimized_results_df.round(3)
print("\nOptimized Model Comparison:")
print(optimized_results_df)

# 6. Testing and Submission

We'll use the best performing model to make predictions on the test set.

In [None]:
# Pick the tuned model with the highest validation accuracy
def _validation_accuracy(model_name):
    """Return the validation accuracy recorded for ``model_name``."""
    return optimized_results[model_name]['Accuracy']

best_model_name = max(optimized_results, key=_validation_accuracy)
best_model = optimized_models[best_model_name]

print(f"Best performing model: {best_model_name}")

# Predict survival for the engineered test features
test_predictions = best_model.predict(test_engineered)

# Build the submission table: passenger ids paired with the predicted labels
submission = test_df[['PassengerId']].copy()
submission['Survived'] = test_predictions

# Write the predictions without the index column
submission.to_csv('../Prince_submission.csv', index=False)
print("\nSubmission file has been created!")