# Titanic Dataset – Exploratory Data Analysis
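This notebook explores the Titanic dataset using summary statistics, visualizations, and correlation analysis, and then preprocesses it by handling missing values, encoding categorical features, and scaling the `Age` and `Fare` columns.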

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' theme to every figure created below.
sns.set_theme(style='whitegrid')

In [None]:
# Read the passenger data from a CSV expected next to this notebook,
# then preview the first rows to confirm it parsed as expected.
csv_path = 'Titanic-Dataset.csv'
df = pd.read_csv(csv_path)
df.head()

## Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() reports on the numeric columns only.
summary_stats = df.describe()
summary_stats

## Histograms and Boxplots

In [None]:
# One figure row per numeric column: histogram with KDE overlay on the left,
# boxplot on the right.
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']

# Derive the grid size from the column list instead of hard-coding 4 rows, so
# adding a column no longer raises IndexError when indexing `axes`.
# squeeze=False keeps `axes` two-dimensional even if only one column is listed,
# so axes[i, 0] / axes[i, 1] is always valid.
n_rows = len(numeric_cols)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=2,
    figsize=(14, 4 * n_rows),  # 4 units of height per row (16 for the four columns above)
    squeeze=False,
)

for i, col in enumerate(numeric_cols):
    # Missing values are dropped explicitly before plotting the distribution.
    sns.histplot(df[col].dropna(), kde=True, ax=axes[i, 0], color='skyblue')
    axes[i, 0].set_title(f'Histogram of {col}')

    sns.boxplot(x=df[col], ax=axes[i, 1], color='lightgreen')
    axes[i, 1].set_title(f'Boxplot of {col}')

# Prevent titles and tick labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

## Pairplot

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the numeric features,
# coloured by survival. Rows with any missing value in these columns are
# removed before plotting.
pair_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
pair_data = df[pair_cols].dropna()

sns.pairplot(pair_data, hue='Survived', diag_kind='kde')
# y > 1 places the title just above the top edge of the plot grid.
plt.suptitle('Pairplot of Numeric Features by Survival', y=1.02)
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlations between the numeric features (DataFrame.corr() with
# its default method), rendered as an annotated heatmap.
corr_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
corr = df[corr_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr,
    annot=True,       # print each coefficient inside its cell
    cmap='coolwarm',
    fmt='.2f',        # two decimal places
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

## Key Observations
- Most passengers were in class 3 and had low fare values.
- Survival rates were higher among 1st-class passengers and among those who paid higher fares.
- There are outliers in Fare and Age.
- 'Fare' and 'Pclass' show a strong negative correlation.
- Survival is positively correlated with Fare, negatively with Pclass.

## Handling Missing Values

In [None]:
# Number of missing (NaN) entries in each column
missing_counts = df.isnull().sum()
missing_counts

In [None]:
# Fill Age with median, Embarked with mode.
# The filled Series is assigned back to its column rather than calling
# fillna(inplace=True) on df['Age']: that form mutates an intermediate Series
# (chained assignment), which triggers a FutureWarning in recent pandas 2.x
# releases and leaves df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())
# mode() returns a Series of the most frequent value(s); [0] takes the first.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop Cabin (too many missing values).
# errors='ignore' makes this step a no-op if Cabin has already been removed,
# so re-running the cell does not raise KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

# Check again
df.isnull().sum()

## Encoding Categorical Features

In [None]:
# One-hot encode the categorical 'Sex' and 'Embarked' columns.
# drop_first=True omits the first category of each feature, since it is
# implied when all of that feature's remaining indicator columns are 0/False.
categorical_cols = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head()

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous features (StandardScaler removes the mean and
# scales to unit variance), overwriting the original columns in df.
scaled_cols = ['Age', 'Fare']
scaler = StandardScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

# Preview the scaled values
df[scaled_cols].head()

## Final Dataset Preview

In [None]:
# First five rows of the DataFrame after all of the steps above
df.head(5)