In [None]:
# Titanic Dataset Exploratory Data Analysis (EDA)

# 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a wide DataFrame is printed (no column truncation).
pd.set_option('display.max_columns', None)
# Apply the "whitegrid" theme to all subsequent plots. set_theme is the
# preferred entry point; sns.set is only a legacy alias for it.
sns.set_theme(style="whitegrid")

In [None]:
# 2. Load dataset
# Location of the training CSV; change this path if the file lives elsewhere.
TRAIN_CSV_PATH = '/kaggle/input/titanic/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# 3. Initial inspection
# Gather the summaries first, then report them in a fixed order.
frame_shape = df.shape
head_rows = df.head()
# include='all' makes describe() cover non-numeric columns as well.
full_description = df.describe(include='all')

print("Shape:", frame_shape)
print("\nFirst 5 rows:\n", head_rows)
print("\nInfo:\n")
# info() writes its report directly to stdout, so it is called rather than printed.
df.info()
print("\nDescription:\n", full_description)

In [None]:
# 4. Missing values
# Count nulls per column, then keep only the columns that have at least one.
missing = df.isnull().sum()
missing_present = missing[missing > 0]
print("\nMissing Values:\n", missing_present)

# Visualize missing values. Skipped when nothing is missing, because an empty
# bar chart carries no information.
if not missing_present.empty:
    # Long-form table so the bars can be addressed by column name.
    missing_table = missing_present.rename_axis('Feature').reset_index(name='Count')

    plt.figure(figsize=(8, 5))
    # hue is set to the same variable as y so the palette colors one bar per
    # feature; passing palette without hue is deprecated in recent seaborn.
    # dodge=False keeps the bars full-width and centered on older versions.
    ax = sns.barplot(
        data=missing_table,
        x='Count',
        y='Feature',
        hue='Feature',
        palette='viridis',
        dodge=False,
    )
    # The hue legend would only repeat the y-axis labels; drop it if one was drawn.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.remove()
    plt.title("Missing Values Count")
    plt.xlabel("Count")
    plt.ylabel("Feature")
    plt.show()

In [None]:
# 5. Univariate Analysis
def plot_category_counts(data, column, palette, title):
    """Show a count plot of one column, with one palette color per category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose value counts are plotted on the x axis.
    palette : str
        Seaborn palette name used to color the bars.
    title : str
        Title placed above the plot.

    Notes
    -----
    ``hue`` is set to the same column as ``x`` because passing ``palette``
    without ``hue`` is deprecated in recent seaborn. ``dodge=False`` keeps the
    bars full-width and centered on older seaborn versions.
    """
    ax = sns.countplot(x=column, hue=column, data=data, palette=palette, dodge=False)
    # The hue legend would only repeat the x-axis labels; drop it if one was drawn.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.remove()
    plt.title(title)
    plt.show()


# Survived
plot_category_counts(df, 'Survived', 'Set2', "Survival Count")

# Sex
plot_category_counts(df, 'Sex', 'Set1', "Gender Distribution")

# Pclass
plot_category_counts(df, 'Pclass', 'Set3', "Passenger Class Distribution")

# Age distribution (rows with a missing Age are dropped before plotting)
plt.figure(figsize=(10, 5))
sns.histplot(df['Age'].dropna(), kde=True, bins=30)
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

In [None]:
# 6. Outlier Detection with Boxplots
# Each entry is (grouping column, measured column, figure title).
boxplot_specs = [
    ('Pclass', 'Age', "Boxplot of Age by Pclass"),
    ('Survived', 'Fare', "Boxplot of Fare by Survival"),
]

# One figure per spec, drawn and shown in the order listed above.
for group_col, value_col, heading in boxplot_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=group_col, y=value_col, data=df, ax=ax)
    ax.set_title(heading)
    plt.show()

In [None]:
# 7. Bivariate Analysis

# Survival counts split by gender (bars are raw counts, not rates)
sns.countplot(x='Survived', hue='Sex', data=df, palette='coolwarm')
plt.title("Survival by Gender")
plt.show()

# Survival counts split by passenger class (bars are raw counts, not rates)
sns.countplot(x='Survived', hue='Pclass', data=df, palette='rocket')
plt.title("Survival by Passenger Class")
plt.show()

# Age vs Survival: overlaid density curves for the two outcome groups
plt.figure(figsize=(10, 6))
# fill=True replaces shade=True, which seaborn has deprecated.
sns.kdeplot(df.loc[df['Survived'] == 1, 'Age'], label='Survived', fill=True)
sns.kdeplot(df.loc[df['Survived'] == 0, 'Age'], label='Did Not Survive', fill=True)
plt.title("Age Distribution by Survival")
plt.legend()
plt.show()

In [None]:
# 8. Heatmap of Correlations
# Pairwise correlations, restricted to the numeric columns.
corr = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
# annot/fmt write each coefficient into its cell, rounded to two decimals.
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# 9. Cleaning Insight (Optional: fill missing values)
# The filled columns are assigned back to df rather than calling
# fillna(inplace=True) on df['col']. That chained in-place form operates on an
# intermediate Series, so pandas warns about it and, with Copy-on-Write
# enabled, leaves df unchanged.
# Age: fill with the median of the column.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Embarked: fill with the most frequent value (first entry of mode()).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Check again: per-column null counts after filling
print("\nMissing After Filling:\n", df.isnull().sum())

# Done!
print("\n✅ EDA Completed.")