# Titanic Dataset EDA Notebook

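This notebook performs a full exploratory data analysis (EDA) of the Titanic dataset: summary statistics, missing-value inspection, survival rates by group, correlation and distribution plots, simple imputation, and a baseline logistic-regression model.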

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Option A: fetch the Titanic example dataset through seaborn, then report
# its dimensions and preview the first rows.
df = sns.load_dataset(name='titanic')

shape = df.shape
print("Shape:", shape)
df.head()

In [None]:
# Structural overview of the frame: dtypes / non-null counts, followed by
# summary statistics for the numeric columns and then for every column.
print("=== INFO ===")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" afterwards.
df.info()

print("\n=== DESCRIBE (NUMERIC) ===")
display(df.describe())

print("\n=== DESCRIBE (ALL) ===")
display(df.describe(include='all'))

In [None]:
# Per-column missing-value summary, most incomplete columns first.
# The null mask is computed once and the table is ordered with a single stable
# sort, so the count and percent columns share one row order by construction
# instead of relying on two independent sorts agreeing on how ties are ordered.
null_mask = df.isnull()
missing_table = pd.DataFrame({
    'missing_count': null_mask.sum(),
    'missing_percent': null_mask.mean() * 100,
}).sort_values('missing_count', ascending=False, kind='mergesort')

# Keep the standalone Series available under their original names.
missing = missing_table['missing_count']
missing_percent = missing_table['missing_percent']

missing_table.head(20)

In [None]:
# Frequency table (NaN counted as its own value) for each column of interest
# that is actually present in the frame.
count_cols = ['survived','pclass','sex','embarked','who','deck']
for col in [c for c in count_cols if c in df.columns]:
    print(f"\n--- {col} ---")
    counts = df[col].value_counts(dropna=False)
    display(counts)

In [None]:
# Overall survival rate, then the mean of `survived` within each group of
# several categorical columns, highest rate first.
if 'survived' in df.columns:
    print("Survival rate overall:", df['survived'].mean())
    for col in ['sex','pclass','who','embarked','deck']:
        if col in df.columns:
            print(f"\nSurvival rate by {col}:")
            # observed=True: for category-typed columns only report categories
            # that occur in the data (no empty NaN rows). Stating it explicitly
            # also avoids the pandas FutureWarning about the changing default
            # of `observed`; it has no effect on non-categorical columns.
            rates = df.groupby(col, observed=True)['survived'].mean()
            display(rates.sort_values(ascending=False))

In [None]:
# Correlation matrix of the numeric columns, shown first as a table and then
# as an annotated heatmap whose colour scale is centred on zero.
num = df.select_dtypes(include=['number'])
corr = num.corr()
display(corr)

plt.figure(figsize=(8,6))
heatmap_opts = dict(annot=True, fmt=".2f", cmap='vlag', center=0)
corr_ax = sns.heatmap(corr, **heatmap_opts)
corr_ax.set_title("Numeric correlation matrix")
plt.show()

In [None]:
# 2x2 grid of histograms, one panel per listed column; a column that is
# absent from the frame leaves its panel empty.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,8))
axes = axes.flatten()
cols = ['age','fare','sibsp','parch']
for idx, c in enumerate(cols):
    if c not in df.columns:
        continue
    ax = axes[idx]
    values = df[c].dropna()  # histogram only the observed values
    sns.histplot(values, ax=ax, kde=False)
    ax.set_title(f'Distribution of {c}')
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots: fare per passenger class (left panel) and age per
# survival outcome (right panel). Each panel is drawn only when both of the
# columns it needs exist in the frame.
plt.figure(figsize=(10,4))

if {'pclass', 'fare'}.issubset(df.columns):
    left_ax = plt.subplot(1,2,1)
    sns.boxplot(data=df, x='pclass', y='fare', ax=left_ax)
    left_ax.set_title('Fare by Pclass')

if {'survived', 'age'}.issubset(df.columns):
    right_ax = plt.subplot(1,2,2)
    sns.boxplot(data=df, x='survived', y='age', ax=right_ax)
    right_ax.set_title('Age by Survival')

plt.tight_layout()
plt.show()

In [None]:
# Mean survival rate by sex (left panel) and by passenger class (right panel).
# sns.barplot draws bootstrap confidence intervals around each mean; passing a
# fixed seed makes those error bars identical on every Restart & Run All.
BOOTSTRAP_SEED = 0

plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
sns.barplot(x='sex', y='survived', data=df, seed=BOOTSTRAP_SEED)
plt.title('Survival rate by Sex')

plt.subplot(1,2,2)
sns.barplot(x='pclass', y='survived', data=df, seed=BOOTSTRAP_SEED)
plt.title('Survival rate by Pclass')

plt.tight_layout()
plt.show()

In [None]:
# Lower-triangle grid of pairwise relationships between the selected columns,
# coloured by survival. Rows with a missing value in any of the selected
# columns are dropped before plotting.
pair_cols = ['age','fare','sibsp','parch','survived']
pair_data = df[pair_cols].dropna()
sns.pairplot(pair_data, hue='survived', corner=True)
plt.suptitle('Pairwise relationships (color = survived)', y=1.02)
plt.show()

In [None]:
# Visual map of missing cells: the boolean null mask of the frame rendered
# as a heatmap, without a colour bar.
plt.figure(figsize=(10,4))
null_grid = df.isnull()
missing_ax = sns.heatmap(null_grid, cbar=False)
missing_ax.set_title('Missing-value heatmap')
plt.show()

# Simple imputation on a copy, so the raw frame `df` stays untouched:
#   age      -> median of the observed ages
#   embarked -> most frequent value
#   deck     -> explicit 'Unknown' label
df_imputed = df.copy()
if 'age' in df_imputed.columns:
    df_imputed['age'] = df_imputed['age'].fillna(df_imputed['age'].median())
if 'embarked' in df_imputed.columns:
    embarked_modes = df_imputed['embarked'].mode()
    # mode() is empty when the column is entirely NaN; skip the fill rather
    # than fail on the [0] lookup.
    if not embarked_modes.empty:
        df_imputed['embarked'] = df_imputed['embarked'].fillna(embarked_modes[0])
if 'deck' in df_imputed.columns:
    deck = df_imputed['deck']
    # A category-typed column rejects fill values that are not among its
    # categories, so register 'Unknown' as a category before filling.
    if isinstance(deck.dtype, pd.CategoricalDtype) and 'Unknown' not in deck.cat.categories:
        deck = deck.cat.add_categories('Unknown')
    df_imputed['deck'] = deck.fillna('Unknown')

# Remaining null counts per column after imputation.
df_imputed.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier: one-hot encode the chosen predictors, drop rows with a
# missing value, hold out 20% of the rows for testing, and fit a logistic
# regression that predicts `survived`.
df_model = df_imputed.copy()

candidate_features = ['pclass','sex','age','fare','sibsp','parch','embarked']
features = [name for name in candidate_features if name in df_model.columns]

X = pd.get_dummies(df_model[features], drop_first=True)
y = df_model['survived']

# Keep only rows where every encoded feature and the target are present.
mask = X.notnull().all(axis=1) & y.notnull()
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))