# Assignment 1 — Data Acquisition & Exploration (Titanic Dataset)

This notebook helps you practice:
- Data loading
- Cleaning missing values
- Encoding categorical data
- Summary statistics
- Exploratory visualizations


## 📌 Step 1 — Import Libraries

In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's white-grid theme to every plot drawn below.
sns.set_theme(style='whitegrid')

## 📂 Step 2 — Load Dataset

In [ ]:
# Path of the CSV to analyse — replace the filename if needed.
csv_path = 'titanic_data.csv'

# Read the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(csv_path)
df.head()

## 🔎 Step 3 — Inspect Data Structure

In [ ]:
# Report the table's dimensions, its column names, and each column's dtype.
n_rows_cols = df.shape
column_names = df.columns.tolist()

print('Shape:', n_rows_cols)
print('\nColumns:', column_names)
# sep='\n' places the dtype listing on the line after its heading.
print('\nData Types:', df.dtypes, sep='\n')

## ❓ Step 4 — Missing Values

In [ ]:
# Count the null cells in each column once and reuse the result below.
missing_counts = df.isna().sum()

print('Missing values per column:')
print(missing_counts)

# Express the same counts as a percentage of the number of rows.
percent_missing = missing_counts / len(df) * 100
print('\nPercentage missing:')
percent_missing

## 🧹 Step 5 — Clean Missing Values

In [ ]:
# Fill or drop missing values so the later steps work on a complete table.

# Age — median.
# Assign the result back instead of calling fillna(inplace=True) on df['Age']:
# the in-place call acts on an intermediate selection, which pandas 2.x warns
# about and which leaves df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Embarked — mode (mode() returns a Series; [0] takes its first entry).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop columns that are mostly empty BEFORE dropping rows; otherwise a single
# sparse column makes dropna() discard the majority of the rows.
# NOTE(review): in the usual Titanic CSV this is expected to remove 'Cabin' —
# confirm against the percentages printed in Step 4.
missing_fraction = df.isnull().mean()
mostly_missing = missing_fraction[missing_fraction > 0.5].index
df.drop(columns=mostly_missing, inplace=True)

# Drop rows still missing
df.dropna(inplace=True)

# Every remaining column should now report zero missing values.
df.isnull().sum()

## 🔁 Step 6 — Encode Categorical Features

In [ ]:
# Sex — explicit numeric codes; labels absent from this mapping become NaN.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

# Embarked — convert to a categorical and keep only its integer codes.
embarked_as_category = df['Embarked'].astype('category')
df['Embarked'] = embarked_as_category.cat.codes

df.head()

## 📊 Step 7 — Summary Statistics

In [ ]:
# Descriptive statistics for the cleaned, encoded table.
summary_stats = df.describe()
summary_stats

## 👥 Step 8 — Survival Counts

In [ ]:
# Number of rows for each distinct value of the Survived column.
survival_counts = df['Survived'].value_counts()
survival_counts

## 🧮 Step 9 — Grouped Survival Analysis

In [ ]:
# Each entry pairs the heading to print with the column to group by.
# The mean of Survived within a group is reported as that group's survival rate.
survival_breakdowns = (
    ('Survival by Gender:', 'Sex'),
    ('\nSurvival by Class:', 'Pclass'),
    ('\nSurvival by Embarked:', 'Embarked'),
)

for heading, group_column in survival_breakdowns:
    print(heading)
    print(df.groupby(group_column)['Survived'].mean())

## 📈 Step 10 — Visualizations

In [ ]:
# One count plot per entry: (x-axis column, hue column or None, plot title).
# hue=None draws plain counts; otherwise bars are split by the hue column.
count_plot_specs = (
    ('Survived', None, 'Survival Count'),                # Survival count
    ('Sex', 'Survived', 'Survival by Gender'),           # Gender vs Survival
    ('Pclass', 'Survived', 'Survival by Passenger Class'),  # Class vs Survival
)

for x_column, hue_column, plot_title in count_plot_specs:
    sns.countplot(x=x_column, hue=hue_column, data=df)
    plt.title(plot_title)
    # Show each figure before drawing the next so the plots do not overlap.
    plt.show()

## 🎂 Step 11 — Age Distribution

In [ ]:
# Histogram of passenger ages in 20 equal-width bins, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(df['Age'], bins=20)
ax.set_title('Age Distribution of Passengers')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

## ✅ Assignment Complete!

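Interpret the results above and discuss the patterns you observed — for example, how survival rates differ by sex, passenger class, and port of embarkation.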