# Titanic Survival Analysis
This notebook analyzes the Titanic dataset to predict passenger survival using machine learning.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Set style for better visualizations.
# The matplotlib style name 'seaborn' was deprecated in matplotlib 3.6 and
# removed in 3.8, where plt.style.use('seaborn') raises OSError. Applying the
# seaborn theme through seaborn itself works regardless of matplotlib version.
sns.set_theme()
sns.set_palette('husl')

## 1. Load and Explore Data

In [None]:
# Read the training and test CSV files from the working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Report the (rows, columns) of the training frame, then preview its first rows
print("Training Data Shape:", train_data.shape)
print("\nFirst few rows of the training data:")
train_data.head()

In [None]:
# Count the missing entries in each column of the training frame
print("Missing values in training data:")
train_data.isna().sum()

## 2. Data Visualization

In [None]:
# Overall count of passengers per value of the Survived column
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train_data, x='Survived', ax=ax)
ax.set_title('Distribution of Survival')
plt.show()

# Passenger counts per class, split by the Survived column
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train_data, x='Pclass', hue='Survived', ax=ax)
ax.set_title('Survival by Passenger Class')
plt.show()

In [None]:
# Stacked age histogram, coloured by the Survived column
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=train_data, x='Age', hue='Survived', multiple="stack", ax=ax)
ax.set_title('Age Distribution by Survival')
plt.show()

# Passenger counts per sex, split by the Survived column
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train_data, x='Sex', hue='Survived', ax=ax)
ax.set_title('Survival by Sex')
plt.show()

## 3. Data Preprocessing

In [None]:
def preprocess_data(df):
    """Turn a raw Titanic passenger frame into a numeric feature matrix.

    Works on a copy, so the caller's frame is never modified. Imputation
    statistics (median Age/Fare, most frequent Embarked) are computed from
    the frame that is passed in.

    Args:
        df: DataFrame containing at least the columns 'Pclass', 'Name',
            'Sex', 'Age', 'SibSp', 'Parch', 'Fare' and 'Embarked'.

    Returns:
        A DataFrame with the columns
        ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'],
        in that order, with the same index as ``df``.
    """
    # Create a copy to avoid modifying original data
    data = df.copy()

    # Fill missing values.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form emits a FutureWarning on
    # pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # Convert categorical features to integer codes.
    # Values outside these mappings become NaN.
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Create title feature from Name: the word that follows a space and is
    # terminated by a period (e.g. "Braund, Mr. Owen" -> "Mr").
    # Raw string so that `\.` is a regex escape, not a Python string escape.
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Several raw titles share one code; titles not listed here (or names
    # with no match) end up as 0.
    title_mapping = {
        'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4,
        'Dr': 5, 'Rev': 5, 'Col': 5, 'Major': 5, 'Mlle': 2,
        'Countess': 3, 'Ms': 2, 'Lady': 3, 'Jonkheer': 1,
        'Don': 1, 'Mme': 3, 'Capt': 5, 'Sir': 5
    }
    data['Title'] = data['Title'].map(title_mapping).fillna(0)

    # Select features for model
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']
    return data[features]

# Target vector and preprocessed feature matrix from the training frame
y = train_data['Survived']
X = preprocess_data(train_data)

# Hold out 20% of the rows for validation; the seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print("Features used:", X.columns.tolist())

## 4. Model Training and Evaluation

In [None]:
# Standardize features: the scaler is fitted on the training split only and
# then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fit a 100-tree random forest with a fixed seed
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out validation rows
val_predictions = model.predict(X_val_scaled)

# Report accuracy followed by the per-class report
print("\nModel Performance:")
print(f"Validation Accuracy: {accuracy_score(y_val, val_predictions):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, val_predictions))

## 5. Feature Importance Analysis

In [None]:
# Pair every feature name with the fitted model's importance score,
# ordered from most to least important
importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
importance = importance.sort_values(by='importance', ascending=False)

# Horizontal bar chart, one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance in Survival Prediction')
fig.tight_layout()
plt.show()