# Titanic Dataset EDA Notebook

In [None]:
# 1. Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults shared by every figure created later in the notebook.
# `sns.set` is only a legacy alias of `sns.set_theme`; call the documented name.
sns.set_theme(style="whitegrid")
# Default figure size as (width, height); applied after the theme call,
# keeping the original statement order.
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# 2. Load Dataset
# Read the train and test splits from CSV files given by relative path.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# 3. Data Overview
# Summarise the structure, basic statistics, and missing-value counts of train_df.
print("\n--- Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
train_df.info()

print("\n--- Data Description ---")
print(train_df.describe())

print("\n--- Missing Values ---")
# Per-column count of null entries.
print(train_df.isnull().sum())

In [None]:
# 4. Univariate Analysis
# Inspect each variable on its own: the target, the categorical features,
# and the numerical features (histograms followed by boxplots).
print("\n--- Target Variable: Survival ---")
target_ax = sns.countplot(data=train_df, x='Survived')
target_ax.set_title('Distribution of Survival')
plt.show()

print("\n--- Categorical Features ---")
# This list is also iterated by the bivariate analysis cell.
categorical_cols = ['Pclass', 'Sex', 'Embarked']
for cat_col in categorical_cols:
    cat_ax = sns.countplot(data=train_df, x=cat_col)
    cat_ax.set_title(f'Distribution of {cat_col}')
    plt.show()

print("\n--- Numerical Features ---")
numeric_cols = ['Age', 'Fare']
# DataFrame.hist creates its own figure; suptitle then labels that figure.
train_df[numeric_cols].hist(bins=30, figsize=(12, 6))
plt.suptitle('Histograms of Age and Fare')
plt.show()

print("\n--- Boxplots to Detect Outliers ---")
for num_col in numeric_cols:
    box_ax = sns.boxplot(data=train_df, x=num_col)
    box_ax.set_title(f'Boxplot of {num_col}')
    plt.show()

In [None]:
# 5. Bivariate Analysis
# Relate each feature to the 'Survived' target.
print("\n--- Survival vs Categorical Features ---")
# NOTE(review): countplot draws raw counts per category, although the title
# says "Rate".
for feature in categorical_cols:
    grouped_ax = sns.countplot(data=train_df, x=feature, hue='Survived')
    grouped_ax.set_title(f'Survival Rate by {feature}')
    plt.show()

print("\n--- Survival vs Numerical Features ---")
# One stacked histogram per numerical feature, split by survival outcome.
for feature in ('Age', 'Fare'):
    stacked_ax = sns.histplot(
        data=train_df, x=feature, hue='Survived', multiple='stack'
    )
    stacked_ax.set_title(f'{feature} Distribution by Survival')
    plt.show()

In [None]:
# 6. Multivariate Analysis
print("\n--- Correlation Heatmap ---")
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets string columns, so restrict the frame to numeric dtypes first.
# (train_df presumably holds text columns such as 'Sex' / 'Embarked' --
# TODO confirm dtypes.)
numeric_df = train_df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
# annot=True writes each coefficient inside its cell.
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

print("\n--- Pairplot of Selected Features ---")
# Grid of pairwise plots for the selected columns, coloured by 'Survived'.
pairplot_features = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
sns.pairplot(data=train_df, vars=pairplot_features, hue='Survived')
plt.show()

# 7. Insights and Findings
- Females survived at a much higher rate than males.
- 1st Class passengers had a higher survival rate.
- Higher fares were associated with higher survival rates.
- Young children had better survival odds.

# 8. Conclusion
- Key factors associated with survival: Sex, Pclass, Fare, and Age.
- Recommended next steps: feature engineering and modeling.