# 🧼 Titanic Dataset - EDA & Data Cleaning

This notebook contains data cleaning and exploratory data analysis (EDA) of the Titanic dataset from Kaggle.


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Titanic training data into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first five rows
df.head(n=5)


In [None]:
# Summary statistics for every column (include="all" covers non-numeric columns too)
summary_stats = df.describe(include="all")
summary_stats


In [None]:
# Report how many rows are exact duplicates of an earlier row,
# then keep only the first occurrence of each row.
duplicate_count = df.duplicated().sum()
print(f"Total Duplicate Rows: {duplicate_count}")
df = df.drop_duplicates(keep="first")


In [None]:
# Count the missing entries in each column
df.isna().sum()


In [None]:
# Drop 'Cabin' due to its high number of null values.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'Cabin' has already been removed from df.
df = df.drop(columns=['Cabin'], errors='ignore')

# Fill missing 'Age' values with the column median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing 'Embarked' values with the most frequent value.
# mode() returns an empty Series when the column has no non-null values, so
# only take its first entry when one exists instead of failing on the lookup.
embarked_mode = df['Embarked'].mode()
if not embarked_mode.empty:
    df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

# Re-check the per-column null counts after cleaning
df.isnull().sum()


In [None]:
# Plot survival counts by gender
sns.set(style="darkgrid")

# Use an explicit figure/axes pair so every call below targets this plot
# rather than whatever pyplot currently considers the "active" axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Sex', hue='Survived', data=df, ax=ax)
ax.set_title("Survival by Gender")
ax.set_xlabel("Sex")
ax.set_ylabel("Count")

# Relabel the legend using the handles seaborn registered for each hue level.
# Passing only labels=... makes matplotlib pair the labels with the first
# artists found on the axes, which may be two bars of the same hue level and
# therefore show the wrong colours.
# NOTE(review): assumes 'Survived' holds 0/1 so the hue levels are ordered
# 0 -> 'No', 1 -> 'Yes' — confirm against the data.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['No', 'Yes'], title='Survived')

fig.tight_layout()
plt.show()
