# Titanic Dataset Analysis

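This notebook analyzes the Titanic passenger dataset. It loads and cleans the raw data, engineers additional features, explores how survival rates vary with passenger class, sex, and age, examines feature correlations, and trains a Random Forest classifier to predict survival and rank feature importance.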

In [None]:
# Import necessary libraries
import sys

# Make the project-level `src` package importable. '..' is resolved against the
# kernel's working directory, so this assumes the notebook is run from a
# sub-directory of the project root. The membership guard keeps re-running this
# cell from piling up duplicate entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from src.data_processing import load_data, clean_data, feature_engineering, prepare_features
# Explicit names instead of `import *`, so a reader can see where each plotting
# helper used below comes from and nothing else leaks into the namespace.
from src.visualization import (
    plot_age_distribution,
    plot_correlation_matrix,
    plot_feature_importance,
    plot_survival_by_class_and_sex,
    plot_survival_by_feature,
    set_plotting_style,
)

# Set random seed for reproducibility (seeds NumPy's legacy global generator;
# the scikit-learn calls further down pass their own random_state explicitly).
np.random.seed(42)

## 1. Data Loading and Initial Exploration

In [None]:
# Load the dataset
# The path is relative, so it resolves against the kernel's working directory.
# load_data is a project helper from src.data_processing; the calls below use
# its result like a pandas DataFrame (.info(), .head()).
df = load_data('../data/raw/titanic.csv')

# Display basic information about the dataset
print("Dataset Info:")
# .info() writes its summary straight to stdout, so it is called as a statement
# rather than wrapped in print().
df.info()

print("\nFirst few rows:")
# Left as the last expression so the notebook renders it as the cell output.
df.head()

## 2. Data Cleaning and Preprocessing

In [None]:
# Stage 1: pass the raw frame through the project's cleaning helper.
# `df` itself is left untouched, so this cell can be re-run safely.
df_clean = clean_data(df)

# Stage 2: build the feature-augmented frame from the cleaned one.
df_featured = feature_engineering(df_clean)

# Per-column count of remaining missing values, shown as the cell output.
print("Missing values after cleaning:")
df_featured.isna().sum()

## 3. Exploratory Data Analysis

In [None]:
# Apply the project's plotting style before any figure is drawn.
set_plotting_style()

# Survival rate against a single column: one figure per (column, heading) pair,
# drawn in the order listed.
for column, heading in (
    ('Pclass', 'Survival Rate by Passenger Class'),
    ('Sex', 'Survival Rate by Gender'),
):
    plot_survival_by_feature(df_featured, column, heading)
    plt.show()

# Remaining project plotting helpers that take only the feature frame;
# each figure is shown before the next one is created.
for draw_figure in (plot_age_distribution, plot_survival_by_class_and_sex):
    draw_figure(df_featured)
    plt.show()

## 4. Feature Correlation Analysis

In [None]:
# Plot correlation matrix
# plot_correlation_matrix is a project helper that is not defined in this
# notebook; which columns it correlates is not visible from here.
# NOTE(review): confirm it handles or excludes non-numeric columns of df_featured.
plot_correlation_matrix(df_featured)
plt.show()  # render the figure produced by the helper call above

## 5. Model Training and Evaluation

In [None]:
# Tunable settings for this section, named once instead of repeated inline.
TEST_SIZE = 0.2       # fraction of rows held out for evaluation
RANDOM_STATE = 42     # shared seed for the split and the forest
N_ESTIMATORS = 100    # number of trees in the Random Forest

# Prepare features for modeling
# NOTE(review): prepare_features also returns a `scaler`. If that scaler is fit
# on the full dataset before the split below, test-set statistics leak into
# training — confirm against src.data_processing.
X, y, scaler = prepare_features(df_featured)

# Split the data. Stratifying on the target keeps the class ratio the same in
# the train and test splits, so the reported metrics are not skewed by an
# unlucky draw on an imbalanced binary label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
# Sanity check: the split must neither drop nor duplicate rows.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Print model performance
print("Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot feature importance
# X.columns supplies the labels, so X must expose column names here.
plot_feature_importance(rf_model.feature_importances_, X.columns)
plt.show()

## 6. Conclusions

Based on our analysis, we can draw the following conclusions:
1. Gender was the most important factor in survival
2. Passenger class had a significant impact on survival rates
3. Age played a role in survival, with children having higher survival rates
4. Family size and whether a passenger was traveling alone affected survival chances