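In the following we fill in the gaps of the data with placeholder values: missing entries are encoded as -1 (for Age and Fare the -1 is set before grouping, so it ends up in group 0).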

Import packages and load the raw training and test data:

In [1]:
import math
import numpy as np
import pandas as pd
import os
import re

# Project root that holds the `data/` folder read below; the relative output
# paths used when saving at the end of the notebook are resolved against it too.
# Defaults to the original author's checkout. Set the KAGGLE_TITANIC_ROOT
# environment variable to run the notebook from another location without
# editing this cell.
project_root = os.environ.get("KAGGLE_TITANIC_ROOT", r"c:\Users\JosephVovrosh\personal_git\Kaggle-Titanic")
os.chdir(project_root)
df_train_raw = pd.read_csv('data/train.csv')
df_test_raw = pd.read_csv('data/test.csv')

In the following treatment of the data we consider each column type in a modular approach, to allow for easy implementation of future improvements.

Treatment:
- Names: Get titles
- Survived: unchanged
- Pclass: unchanged
- Sex: 0 for man, 1 for woman
- Age: Group
- SibSp: Use to create family sizes
- Parch: Use to create family sizes
- Ticket: Ignore for now
- Fare: Group into amounts
- Cabin: Split into Cabin and Cabin number
- Embarked: 0 for C, 1 for S, 2 for Q (-1 if missing)

In [37]:
def get_missing_coordinates(df):
    """Return the (row label, column label) of every missing cell in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    list of tuple
        One `(row_idx, col)` pair per cell that `DataFrame.isna` reports as
        missing, ordered row by row and, within a row, in column order.
        Empty when nothing is missing.
    """
    # Build the boolean mask in one vectorised pass instead of materialising a
    # Series per row with `iterrows` and calling `pd.isna` on each cell.
    missing_mask = df.isna().to_numpy()
    return [
        (row_idx, col)
        for row_idx, row_mask in zip(df.index, missing_mask)
        for col, is_missing in zip(df.columns, row_mask)
        if is_missing
    ]

def get_cabin_letter(cabin):
    """Extract the leading character of a cabin entry.

    Returns `np.nan` when `cabin` is missing (as judged by `pd.isnull`),
    otherwise the first character of `cabin`.
    """
    return np.nan if pd.isnull(cabin) else cabin[0]

def get_cabin_number(cabin):
    """Return the number of the first cabin listed in a cabin entry.

    Parameters
    ----------
    cabin : str or missing value
        Raw cabin entry, e.g. "C85" or "C23 C25 C27".

    Returns
    -------
    int
        The first run of digits in `cabin` as an integer, or -1 when `cabin`
        is missing (per `pd.isnull`) or contains no digits.
    """
    if pd.isnull(cabin):
        return -1
    # Use only the first run of digits. Joining every digit in the string
    # turned a multi-cabin entry such as "C23 C25 C27" into 232527, which is
    # not a cabin number.
    first_number = re.search(r'\d+', cabin)
    return int(first_number.group()) if first_number else -1

In [38]:
# Build the model-ready training frame, one feature at a time.
df_train = pd.DataFrame()

# Sex: 0 for male, 1 for female.
df_train['Sex'] = df_train_raw['Sex'].map({'male': 0, 'female': 1})

df_train['Pclass'] = df_train_raw['Pclass']

# Title: first recognised honorific followed by a dot in the name, else 'Unknown'.
known_titles = ('Master', 'Mr', 'Mrs', 'Ms', 'Dr', 'Miss', 'Sir', 'Rev', 'Capt', 'Lady', 'Col',
                'Countess', 'Don', 'Jonkheer', 'Major', 'Mlle', 'Mme')
pattern = r"\b(" + "|".join(known_titles) + r")\."
title_regex = re.compile(pattern, flags=re.IGNORECASE)
titles_found = [title_regex.findall(name) for name in df_train_raw['Name']]
titles_only = [titles[0].capitalize() if titles else 'Unknown' for titles in titles_found]
# The integer codes come from the fixed vocabulary above (plus 'Unknown'), not
# from the titles that happen to occur in this file, so a given title always
# receives the same code no matter which data set is being encoded.
title_to_int = {title: i for i, title in enumerate(sorted(set(known_titles) | {'Unknown'}))}
title_encoded = [title_to_int[title] for title in titles_only]
df_train['Title'] = title_encoded

# Age: missing values become -1 (note: this overwrites the raw frame's column),
# then ages are grouped into 5-wide, left-closed bins starting at -5, so the
# -1 placeholder falls into group 0.
df_train_raw['Age'] = df_train_raw['Age'].fillna(-1)
bins = np.arange(-5, df_train_raw['Age'].max() + 10, 5)
labels = list(range(len(bins) - 1))
df_train['Age'] = pd.cut(df_train_raw['Age'], bins=bins, labels=labels, right=False, include_lowest=True)

# FamilySize: siblings/spouses plus parents/children (the passenger is not counted).
df_train['FamilySize'] = df_train_raw['SibSp'] + df_train_raw['Parch']

# Fare: same scheme as Age with 20-wide bins starting at -20; the -1
# placeholder for a missing fare falls into group 0.
df_train_raw['Fare'] = df_train_raw['Fare'].fillna(-1)
bins = np.arange(-20, df_train_raw['Fare'].max() + 20, 20)
labels = list(range(len(bins) - 1))
df_train['Fare'] = pd.cut(df_train_raw['Fare'], bins=bins, labels=labels, right=False, include_lowest=True)

# Cabin letter as a category code. `.cat.codes` already encodes a missing
# value as -1, so no separate fill step is needed afterwards.
df_train['Cabin_Letter'] = df_train_raw['Cabin'].apply(get_cabin_letter).astype('category').cat.codes

df_train['Cabin_Number'] = df_train_raw['Cabin'].apply(get_cabin_number)

# Embarked: 0 for C, 1 for S, 2 for Q, -1 for a missing port.
df_train_raw['Embarked'] = df_train_raw['Embarked'].fillna(-1)
df_train['Embarked'] = df_train_raw['Embarked'].map({-1: -1, 'C': 0, 'S': 1, 'Q': 2})

df_train['Survived'] = df_train_raw['Survived']

print(f"# of training data missing after further clean up: {len(get_missing_coordinates(df_train))}")

# of training data missing after further clean up: 0


In [40]:
# Build the model-ready test frame, one feature at a time.
df_test = pd.DataFrame()

# Sex: 0 for male, 1 for female.
df_test['Sex'] = df_test_raw['Sex'].map({'male': 0, 'female': 1})

df_test['Pclass'] = df_test_raw['Pclass']

# Title: first recognised honorific followed by a dot in the name, else 'Unknown'.
known_titles = ('Master', 'Mr', 'Mrs', 'Ms', 'Dr', 'Miss', 'Sir', 'Rev', 'Capt', 'Lady', 'Col',
                'Countess', 'Don', 'Jonkheer', 'Major', 'Mlle', 'Mme')
pattern = r"\b(" + "|".join(known_titles) + r")\."
title_regex = re.compile(pattern, flags=re.IGNORECASE)
titles_found = [title_regex.findall(name) for name in df_test_raw['Name']]
titles_only = [titles[0].capitalize() if titles else 'Unknown' for titles in titles_found]
# The integer codes come from the fixed vocabulary above (plus 'Unknown'), not
# from the titles that happen to occur in the test file. Deriving the table
# from each file separately gives the same title (e.g. 'Mr') a different code
# whenever the two files contain different sets of titles, which makes the
# Title column meaningless to a model trained on the other file.
title_to_int = {title: i for i, title in enumerate(sorted(set(known_titles) | {'Unknown'}))}
title_encoded = [title_to_int[title] for title in titles_only]
df_test['Title'] = title_encoded

# Age: missing values become -1 (note: this overwrites the raw frame's column),
# then ages are grouped into 5-wide, left-closed bins starting at -5, so the
# -1 placeholder falls into group 0.
df_test_raw['Age'] = df_test_raw['Age'].fillna(-1)
bins = np.arange(-5, df_test_raw['Age'].max() + 10, 5)
labels = list(range(len(bins) - 1))
df_test['Age'] = pd.cut(df_test_raw['Age'], bins=bins, labels=labels, right=False, include_lowest=True)

# FamilySize: siblings/spouses plus parents/children (the passenger is not counted).
df_test['FamilySize'] = df_test_raw['SibSp'] + df_test_raw['Parch']

# Fare: same scheme as Age with 20-wide bins starting at -20; the -1
# placeholder for a missing fare falls into group 0.
df_test_raw['Fare'] = df_test_raw['Fare'].fillna(-1)
bins = np.arange(-20, df_test_raw['Fare'].max() + 20, 20)
labels = list(range(len(bins) - 1))
df_test['Fare'] = pd.cut(df_test_raw['Fare'], bins=bins, labels=labels, right=False, include_lowest=True)

# Cabin letter as a category code. `.cat.codes` already encodes a missing
# value as -1, so no separate fill step is needed afterwards.
df_test['Cabin_Letter'] = df_test_raw['Cabin'].apply(get_cabin_letter).astype('category').cat.codes

df_test['Cabin_Number'] = df_test_raw['Cabin'].apply(get_cabin_number)

# Embarked: 0 for C, 1 for S, 2 for Q, -1 for a missing port.
df_test_raw['Embarked'] = df_test_raw['Embarked'].fillna(-1)
df_test['Embarked'] = df_test_raw['Embarked'].map({-1: -1, 'C': 0, 'S': 1, 'Q': 2})

# This frame holds the test data, so the message says so.
print(f"# of test data missing after further clean up: {len(get_missing_coordinates(df_test))}")

# of training data missing after further clean up: 0


Save clean data:

In [41]:
# Write both cleaned frames to CSV, leaving the DataFrame index out of the files.
outputs = {
    'attempt_2/training_data.csv': df_train,
    'attempt_2/test_data.csv': df_test,
}
for csv_path, frame in outputs.items():
    frame.to_csv(csv_path, index=False)