**Author:** Pierina Lopez **rnumber:** r0913865
# Cleaning Titanic Data
This notebook prepares the Titanic dataset for analysis by:
1. Renaming columns for consistency.
2. Removing irrelevant columns.
3. Handling missing values.
4. Encoding categorical variables for machine learning.

In [3]:
# Import necessary libraries
import pandas as pd

# Load the training split of the Titanic data (read from the scrape step's folder).
df = pd.read_csv('../01_Scrape/titanic_train.csv')

# Normalise column names: lowercase, with spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Show the normalised column names so the reader can see what is available.
print(df.columns)

# Columns treated as irrelevant for the analysis. Only the ones actually
# present are dropped, so this works whether or not the file contains them.
columns_to_drop = ['cabin', 'boat', 'body', 'home.dest']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Impute missing values: median for the numeric 'age' column, most frequent
# value (mode) for the categorical 'embarked' column.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# df[col]: the in-place form operates on a temporary Series under pandas
# Copy-on-Write (and warns in pandas 2.x), which would leave df unchanged.
df['age'] = df['age'].fillna(df['age'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# One-hot encode the categorical columns. drop_first=True omits the first
# category of each column, so k categories become k-1 indicator columns.
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)

# Save the cleaned dataset for further analysis (without the index column).
df.to_csv('titanic_cleaned.csv', index=False)
print("Cleaned dataset saved as 'titanic_cleaned.csv'.")


Index(['passengerid', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked'],
      dtype='object')
Cleaned dataset saved as 'titanic_cleaned.csv'.
