## Import Libraries and Load Dataset

In [None]:
import pandas as pd
import numpy as np

# Read the passenger records from the CSV file in the working directory
titanic = pd.read_csv(filepath_or_buffer='titanic.csv')

## Check Dataset Information

In [None]:
# Summarize the DataFrame: column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
print("Dataset Information:")
titanic.info()
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table
titanic.head()

# 3.1 Warm Up Exercises
## Problem 1 - Subsetting Columns

### Task 1: Create DataFrame with only Fare column

In [None]:
# Keep every row but only the Fare column; the list selector yields a
# one-column DataFrame rather than a Series
fare = titanic.loc[:, ['Fare']]
print("DataFrame 'fare' (Fare column only):")
fare.head(5)

### Task 2: Create DataFrame with Pclass and Age columns

In [None]:
# Two-column view of the data: passenger class and age
class_age = titanic.loc[:, ['Pclass', 'Age']]
print("DataFrame 'class_age' (Pclass and Age columns):")
class_age.head(5)

### Task 3: Create DataFrame with Survived and Sex columns

In [None]:
# Two-column view of the data: survival flag and sex
survived_gender = titanic.loc[:, ['Survived', 'Sex']]
print("DataFrame 'survived_gender' (Survived and Sex columns):")
survived_gender.head(5)

## Problem 2 - Subsetting
### Subsetting Rows

### Task 1: Filter for Fare > 100

In [None]:
# Rows whose Fare is strictly greater than 100
fare_gt_100 = titanic.loc[titanic['Fare'].gt(100)]
print("Passengers with Fare > 100:")
fare_gt_100

### Task 2: Filter for First Class (Pclass == 1)

In [None]:
# Rows whose Pclass equals 1
first_class = titanic.loc[titanic['Pclass'].eq(1)]
print("First Class Passengers:")
first_class

### Task 3: Filter for Female passengers under 18

In [None]:
# Rows that satisfy both conditions: Age below 18 and Sex equal to "female".
# Rows with a missing Age fail the comparison and are therefore excluded.
female_under_18 = titanic.loc[titanic['Age'].lt(18) & titanic['Sex'].eq('female')]
print("Female passengers under 18:")
female_under_18

### Subsetting Rows by Categorical Variables

### Task 1: Filter for Embarked == 'C' or 'S'

In [None]:
# Rows whose Embarked value is one of the listed codes ("C" or "S")
embarked_c_or_s = titanic.loc[titanic['Embarked'].isin(['C', 'S'])]
print("Passengers who embarked at C or S:")
embarked_c_or_s

### Task 2: Filter for Pclass in [1, 2]

In [None]:
# Rows whose Pclass is either 1 or 2
first_second_class = titanic.loc[titanic['Pclass'].isin([1, 2])]
print("First or Second Class Passengers:")
first_second_class

# 3.2 Exploratory Data Analysis Practice Exercise - 1

## Handle Missing Values in Age Column

In [None]:
# Impute missing ages: compute the median of the Age column, then keep each
# observed age and substitute the median wherever the value is missing
median_age = titanic['Age'].median()
titanic['Age'] = titanic['Age'].where(titanic['Age'].notna(), median_age)
print(f"Missing Age values filled with median: {median_age}")

## Question 1: Which passenger had the highest fare paid relative to their age?

### Step 1: Add fare_per_year column (Fare / Age)

In [None]:
# Derived column: fare divided element-wise by age
titanic['fare_per_year'] = titanic['Fare'].div(titanic['Age'])
titanic.loc[:, ['Name', 'Fare', 'Age', 'fare_per_year']].head(5)

### Step 2: Subset rows where fare_per_year > 5

In [None]:
# Keep only the rows whose fare_per_year is strictly above 5
high_fare_age = titanic.loc[titanic['fare_per_year'].gt(5)]
print(f"Number of passengers with fare_per_year > 5: {len(high_fare_age)}")

### Step 3: Sort by descending fare_per_year

In [None]:
# Order the subset so the largest fare_per_year comes first
high_fare_age_srt = high_fare_age.sort_values(by='fare_per_year', ascending=False)

### Step 4 & 5: Select Name and fare_per_year columns and view result

In [None]:
# Narrow the sorted subset down to the two columns of interest
result = high_fare_age_srt.loc[:, ['Name', 'fare_per_year']]

# Show the full ranking, then a banner followed by the top row
print("Passengers with fare_per_year > 5 (sorted by highest):")
print(result)
print(
    "\n" + "="*80,
    "Passenger with highest fare relative to age:",
    "="*80,
    sep="\n",
)
# The frame is sorted in descending order, so position 0 holds the maximum
print(result.iloc[0])

## Question 2: Which adult male passenger paid the highest fare relative to their class?

### Step 1: Add fare_per_class column (Fare / Pclass)

In [None]:
# Derived column: fare divided element-wise by the passenger class number
titanic['fare_per_class'] = titanic['Fare'].div(titanic['Pclass'])
titanic.loc[:, ['Name', 'Fare', 'Pclass', 'fare_per_class']].head(5)

### Step 2: Subset for adult males (Age >= 18 and Sex == 'male')

In [None]:
# Rows that are both male and aged 18 or older
adult_males = titanic.loc[titanic['Sex'].eq('male') & titanic['Age'].ge(18)]
print(f"Number of adult male passengers: {len(adult_males)}")

### Step 3: Sort by descending fare_per_class

In [None]:
# Order the adult-male subset so the largest fare_per_class comes first
adult_males_srt = adult_males.sort_values(by='fare_per_class', ascending=False)

### Step 4 & 5: Select Name, Age, and fare_per_class columns and view result

In [None]:
# Narrow the sorted subset down to the three columns of interest
result = adult_males_srt.loc[:, ['Name', 'Age', 'fare_per_class']]

# Show the ten highest rows, then a banner followed by the top row
print("Adult male passengers (sorted by fare_per_class):")
print(result.head(10))
print(
    "\n" + "="*80,
    "Adult male with highest fare relative to class:",
    "="*80,
    sep="\n",
)
# The frame is sorted in descending order, so position 0 holds the maximum
print(result.iloc[0])

# 3.3 Exploratory Data Analysis with Group-by Method Practice Exercise

## Question 1: What percent of total fare revenue came from each passenger class?

### Step 1: Calculate total Fare paid across all passengers

In [None]:
# Sum of the Fare column over every row of the dataset
total_fare = titanic.loc[:, 'Fare'].sum()
print(f"Total Fare Revenue: ${total_fare:.2f}")

### Step 2-3: Calculate fare totals for each class

In [None]:
# For each passenger class, select that class's Fare values in one step
# (row mask + column label) and add them up
first_class_fare = titanic.loc[titanic['Pclass'].eq(1), 'Fare'].sum()
second_class_fare = titanic.loc[titanic['Pclass'].eq(2), 'Fare'].sum()
third_class_fare = titanic.loc[titanic['Pclass'].eq(3), 'Fare'].sum()

print(f"First Class Total Fare:  ${first_class_fare:.2f}")
print(f"Second Class Total Fare: ${second_class_fare:.2f}")
print(f"Third Class Total Fare:  ${third_class_fare:.2f}")

### Step 4-5: Calculate proportions and percentages

In [None]:
# Per-class totals in class order: first, second, third
class_fares = [first_class_fare, second_class_fare, third_class_fare]

# Share of the overall revenue per class, then the same shares scaled to percent
fare_proportions = [class_total / total_fare for class_total in class_fares]
fare_percentages = [100 * share for share in fare_proportions]

print("\nFare Revenue by Class:")
print(f"First Class:  ${first_class_fare:.2f} ({fare_percentages[0]:.2f}%)")
print(f"Second Class: ${second_class_fare:.2f} ({fare_percentages[1]:.2f}%)")
print(f"Third Class:  ${third_class_fare:.2f} ({fare_percentages[2]:.2f}%)")

### Alternative: Using GroupBy Method

In [None]:
# Same calculation in one pass: total Fare per Pclass group, then each
# group's total as a percentage of the overall revenue
fare_by_class = titanic.groupby('Pclass')['Fare'].sum()
fare_percentage_by_class = fare_by_class.div(total_fare).mul(100)

print("Fare Revenue Percentage by Class (using GroupBy):")
print(fare_percentage_by_class)

## Question 2: What percent of passengers belonged to each age group?

### Step 1: Create age_group column

In [None]:
# Create age_group column
def categorize_age(age):
    """Map a numeric age to an age-group label.

    Parameters
    ----------
    age : int, float or None
        Age in years. May be missing (None / NaN).

    Returns
    -------
    str or None
        'child' when age < 18, 'adult' when 18 <= age <= 64, 'senior' when
        age > 64, and None when the age is missing.
    """
    # NaN compares False against every number, so without this guard a
    # missing age would fail both tests below and be mislabelled 'senior'.
    if pd.isna(age):
        return None
    if age < 18:
        return 'child'
    if age <= 64:
        return 'adult'
    return 'senior'

# Label every passenger by passing each Age value through categorize_age
titanic['age_group'] = titanic['Age'].map(categorize_age)
print("Age groups created:")
titanic.loc[:, ['Name', 'Age', 'age_group']].head(10)

### Step 2: Calculate total number of passengers

In [None]:
# Number of rows in the dataset (one row per passenger record)
total_passengers = titanic.shape[0]
print(f"Total Passengers: {total_passengers}")

### Step 3: Count passengers in each age group

In [None]:
# Frequency of each age_group label, most common first
age_group_counts = titanic.loc[:, 'age_group'].value_counts()
print("Passenger Count by Age Group:")
print(age_group_counts)

### Step 4-5: Calculate proportions and display as percentages

In [None]:
# Fraction of all passengers that falls in each age group
age_group_proportions = age_group_counts.div(total_passengers)

# The same fractions expressed as percentages
age_group_percentages = age_group_proportions.mul(100)
print("Percentage of Passengers by Age Group:")
print(age_group_percentages)

### Detailed Breakdown

In [None]:
# One formatted line per age group, in a fixed child/adult/senior order;
# a group that is absent from the counts is reported as 0
print("Detailed Breakdown:", "="*50, sep="\n")
for group in ('child', 'adult', 'senior'):
    count = age_group_counts.get(group, 0)
    percentage = age_group_percentages.get(group, 0)
    print(f"{group.capitalize():8s}: {count:4d} passengers ({percentage:.2f}%)")

### Alternative: Using GroupBy Method

In [None]:
# Same distribution via groupby: rows per age_group, then each count as a
# percentage of all passengers
age_group_summary = titanic.groupby('age_group').size()
age_group_pct = age_group_summary.div(total_passengers).mul(100)

print("Age Group Distribution (using GroupBy):")
# Put counts and percentages side by side, indexed by age_group
result_df = pd.DataFrame(dict(Count=age_group_summary, Percentage=age_group_pct))
print(result_df)