# Titanic - Machine Learning from Disaster

In [None]:
import pandas as pd
import os
import kaggle
import zipfile

# Directory holding the raw Kaggle files, and directory the preprocessed CSV
# is exported to.
DATASET_DIR = "./dataset/titanic"
EXPORT_DIR = "./dataset/preprocessed/titanic/"

# Files the following cells read from DATASET_DIR.
_REQUIRED_FILES = ("train.csv", "test.csv", "gender_submission.csv")

# Download when any required CSV is missing. Checking the files rather than
# the directory means a previous run that created the directory but failed
# during download/extraction is retried instead of being skipped forever.
if not all(
    os.path.exists(os.path.join(DATASET_DIR, name)) for name in _REQUIRED_FILES
):
    os.makedirs(DATASET_DIR, exist_ok=True)
    kaggle.api.authenticate()
    kaggle.api.competition_download_files("titanic", path=DATASET_DIR)
    archive_path = os.path.join(DATASET_DIR, "titanic.zip")
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(DATASET_DIR)
    # The archive is no longer needed once its contents are extracted.
    os.remove(archive_path)

In [13]:
# Read the three CSV files from the dataset directory, in a fixed order.
train_df, test_df, gender_submission_df = (
    pd.read_csv(os.path.join(DATASET_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

# Report (rows, columns) for each loaded frame.
print("Training Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)
print("Gender Submission Shape:", gender_submission_df.shape)

# Preview the first rows of the training data.
print("\nTraining Data Head:")
print(train_df.head(5))

# Count missing values per column and show only columns that have any.
nan_sum_per_col = train_df.isnull().sum()
print("\nNaN Values in Training Data:")
print(nan_sum_per_col.loc[nan_sum_per_col.gt(0)])

# Number of distinct non-missing values in the Ticket and Cabin columns.
ticket_variety = train_df["Ticket"].nunique(dropna=True)
print("\nNumber of Unique Tickets in Training Data:", ticket_variety)

cabin_variety = train_df["Cabin"].nunique(dropna=True)
print("Number of Unique Cabins in Training Data:", cabin_variety)

Training Data Shape: (891, 12)
Test Data Shape: (418, 11)
Gender Submission Shape: (418, 2)

Training Data Head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN       

In [14]:
def remove_double_quotes(df):
    """Remove double quotes from string values in object-dtype columns.

    Only values that are actual ``str`` instances are rewritten. Any other
    value stored in an object column (numbers, ``None``, NaN, ...) is kept
    unchanged instead of being turned into NaN.

    Args:
        df: DataFrame to clean. Its object-dtype columns are replaced in
            place.

    Returns:
        The same DataFrame object that was passed in.
    """

    def _strip_quotes(value):
        # Leave non-string cells untouched so mixed columns lose no data.
        return value.replace('"', "") if isinstance(value, str) else value

    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].map(_strip_quotes)
    return df


def combine_dataframes(a, b):
    """Attach the columns of ``b`` to ``a`` unless ``a`` already has labels.

    When ``a`` already contains a 'Survived' column it is returned as is.
    Otherwise ``a`` and ``b`` are outer-merged on 'PassengerId'.
    """
    already_labelled = "Survived" in a.columns
    return a if already_labelled else a.merge(b, on="PassengerId", how="outer")


def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the PassengerId, Name and Ticket columns.

    Columns from that list that are not present in ``df`` are skipped.
    """
    present = [
        name for name in ("PassengerId", "Name", "Ticket") if name in df.columns
    ]
    return df.drop(columns=present)


def cabin_to_alphabet(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the 'Cabin' column to its first character.

    Missing cabins are temporarily filled with "_" so they survive the
    character extraction, then relabelled as the string "NaN". The column
    is replaced in place and the same DataFrame is returned.
    """
    first_char = df["Cabin"].fillna("_").str.get(0)
    df["Cabin"] = first_char.replace({"_": "NaN"})
    return df


def one_hot_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the Sex, Embarked and Cabin columns.

    The first level of each encoded column is dropped (``drop_first=True``).
    Returns a new DataFrame; all other columns are carried over unchanged.
    """
    return pd.get_dummies(
        df,
        columns=["Sex", "Embarked", "Cabin"],
        drop_first=True,
    )


def connect_dataframes(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
    """Stack ``b`` below ``a`` and renumber the rows from zero."""
    frames = [a, b]
    stacked = pd.concat(frames, ignore_index=True)
    return stacked


def move_survived_column(df: pd.DataFrame) -> pd.DataFrame:
    """Reorder columns so that 'Survived' comes last.

    A DataFrame without a 'Survived' column is returned unchanged.
    """
    if "Survived" not in df.columns:
        return df
    leading = [name for name in df.columns if name != "Survived"]
    return df[leading + ["Survived"]]


def Survived_to_Category(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the 'Survived' column to a labelled categorical column.

    The column is cast to ``category`` dtype, then the value 0 is relabelled
    "Died" and the value 1 is relabelled "Survived".

    Args:
        df: DataFrame containing a 'Survived' column. The column is
            replaced in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    labels = {0: "Died", 1: "Survived"}
    survived = df["Survived"].astype("category")
    # Relabel by value rather than by position: a positional list must match
    # the number of categories exactly, so it raises when only one class is
    # present. With a mapping, categories not listed are passed through.
    df["Survived"] = survived.cat.rename_categories(labels)
    return df


# Give the test rows a 'Survived' column by merging in gender_submission_df
# (skipped when test_df already has one, e.g. when this cell is re-run).
# NOTE(review): this makes the Survived values from gender_submission.csv
# part of the exported data — confirm they are intended to be used as labels.
test_df = combine_dataframes(test_df, gender_submission_df)

# Stack train and test rows, then run the preprocessing steps in order:
# drop unused columns, strip quotes, reduce Cabin to one character, one-hot
# encode, move 'Survived' last, and convert it to a labelled category.
train_df = (
    connect_dataframes(train_df, test_df)
    .pipe(drop_columns)
    .pipe(remove_double_quotes)
    .pipe(cabin_to_alphabet)
    .pipe(one_hot_encoding)
    .pipe(move_survived_column)
    .pipe(Survived_to_Category)
)

# Sanity check that the merge above produced a 'Survived' column.
assert "Survived" in test_df.columns

In [15]:
# Make sure the export directory exists, then write the preprocessed frame
# as CSV without the index column.
os.makedirs(EXPORT_DIR, exist_ok=True)

export_path = os.path.join(EXPORT_DIR, 'train.csv')
train_df.to_csv(export_path, index=False)