# 🧠 Titanic Dataset Exploratory Data Analysis

This notebook performs an in-depth EDA on the Titanic dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, ttest_ind

# Apply the white-grid seaborn theme to every figure in this notebook.
# set_theme() is the preferred spelling; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid")


In [None]:
# The CSV must sit next to this notebook (upload it first on hosted runtimes).
df = pd.read_csv(filepath_or_buffer="train.csv")

# Preview the first five rows.
df.head(5)

### ❓ Questions to Explore

1. What factors influenced passenger survival?

2. Does age or gender affect survival rate?

3. Are people in higher classes more likely to survive?

4. Are there missing values?

5. Are there outliers in age or fare?


In [None]:
# Column dtypes and non-null counts (printed to stdout).
df.info()

# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# How many passengers fall in each 'Survived' category.
sns.countplot(data=df, x='Survived').set_title('Survival Count')
plt.show()

# Histogram of the known ages with a KDE overlay (missing ages are dropped first).
known_ages = df['Age'].dropna()
sns.histplot(known_ages, kde=True).set_title('Age Distribution')
plt.show()


In [None]:
# Each entry: (seaborn plotter, x column, y column, figure title).
#   1. Gender vs Survival  - mean of 'Survived' per sex
#   2. Class vs Survival   - mean of 'Survived' per passenger class
#   3. Age vs Survived     - age spread within each survival outcome
for draw, x_col, y_col, heading in (
    (sns.barplot, 'Sex', 'Survived', 'Survival Rate by Gender'),
    (sns.barplot, 'Pclass', 'Survived', 'Survival Rate by Class'),
    (sns.boxplot, 'Survived', 'Age', 'Age Distribution by Survival'),
):
    draw(x=x_col, y=y_col, data=df)
    plt.title(heading)
    plt.show()


In [None]:
# Gender survival hypothesis: is the mean of 'Survived' different for
# female and male passengers?
male_survival = df.loc[df['Sex'] == 'male', 'Survived']
female_survival = df.loc[df['Sex'] == 'female', 'Survived']

# Use Welch's t-test (equal_var=False). The default pooled-variance Student
# test assumes both groups share one variance, and nothing guarantees that
# the two groups here match in size or spread.
t_stat, p_val = ttest_ind(female_survival, male_survival, equal_var=False)
print("T-statistic:", t_stat, " | P-value:", p_val)

# Conventional 5% significance threshold.
if p_val < 0.05:
    print("✅ Statistically significant difference between male and female survival.")
else:
    print("❌ No significant difference.")


In [None]:
# Chi-square test of independence between passenger class and survival.
# Rows of the table are 'Pclass' values, columns are 'Survived' values.
contingency = pd.crosstab(index=df['Pclass'], columns=df['Survived'])
chi2, p, _, _ = chi2_contingency(contingency)
print("Chi-square:", chi2, "| P-value:", p)

# Report the verdict at the 5% significance level.
print(
    "✅ Passenger class has a significant effect on survival."
    if p < 0.05
    else "❌ No significant relationship."
)


In [None]:
# Age Group survival: bucket 'Age' into four labelled bands and plot the
# mean of 'Survived' per band.
# Band edges are 0-12 (Child), 12-18 (Teen), 18-60 (Adult), 60-80 (Senior);
# each band includes its upper edge. include_lowest=True also keeps an age of
# exactly 0 in 'Child' rather than letting it fall outside the first bin.
# Missing ages and ages above 80 get no group (NaN).
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[0, 12, 18, 60, 80],
    labels=['Child', 'Teen', 'Adult', 'Senior'],
    include_lowest=True,
)
sns.barplot(x='AgeGroup', y='Survived', data=df)
plt.title('Survival Rate by Age Group')
plt.show()


In [None]:
# Box plot of ticket fares; points beyond the whiskers are candidate outliers.
sns.boxplot(data=df, x='Fare').set_title('Fare Outliers')
plt.show()


### ⚠️ Potential Data Issues

- Missing values in 'Age', 'Cabin', 'Embarked'

- Cabin has too many missing values – consider dropping it

- Encode categorical columns (e.g., 'Sex', 'Embarked') as numeric values before modeling
