**Authors:** Pierina Lopez **rnumber:** r0913865
# Final Import
This notebook consolidates all cleaning steps into one streamlined process to prepare the Titanic dataset for machine learning.

In [2]:
# Import necessary libraries
import pandas as pd

# Columns removed before modelling; any that are absent from the input are
# ignored.
columns_to_drop = ['cabin', 'boat', 'body', 'home.dest']


def prepare_titanic(raw):
    """Return a cleaned, model-ready copy of a raw Titanic DataFrame.

    Steps, in order:
      1. Normalise column names to lower case with underscores for spaces.
      2. Drop every column listed in ``columns_to_drop`` that is present.
      3. Fill missing ``age`` with the column median and missing
         ``embarked`` with the column mode.
      4. One-hot encode ``sex`` and ``embarked`` (first level dropped).

    Args:
        raw: DataFrame whose columns, after normalisation, include
            ``age``, ``sex`` and ``embarked``.

    Returns:
        A new DataFrame; ``raw`` itself is not modified.

    Raises:
        KeyError: if ``age``, ``sex`` or ``embarked`` is missing.
    """
    df = raw.copy()

    # Rename columns for consistency and easier handling.
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

    # errors='ignore' skips names that are not in this particular dataset.
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Assign the filled column back rather than calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form acts on a
    # temporary object under pandas Copy-on-Write and leaves ``df`` unchanged.
    df['age'] = df['age'].fillna(df['age'].median())

    # mode() is empty when the column has no non-null values; in that case
    # there is nothing sensible to fill with, so leave the column untouched.
    embarked_mode = df['embarked'].mode()
    if not embarked_mode.empty:
        df['embarked'] = df['embarked'].fillna(embarked_mode.iloc[0])

    # Encode categorical variables into numeric/indicator columns.
    return pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)


# In a notebook kernel (and when run as a script) __name__ is "__main__", so
# this section executes exactly as the original cell did; the guard only keeps
# the file I/O from running if the code is imported.
if __name__ == "__main__":
    # Load raw data: the uncleaned dataset directly from the source.
    df = pd.read_csv('../01_Scrape/titanic_train.csv')

    df = prepare_titanic(df)

    # Save the prepared dataset for machine learning.
    df.to_csv('titanic_prepared.csv', index=False)
    print("Prepared dataset saved as 'titanic_prepared.csv'.")


Prepared dataset saved as 'titanic_prepared.csv'.
