In the following we implement the simplest approach - removing all rows with missing data - to set a benchmark for more complicated data handling.

Import packages and the raw training and test data:

In [1]:
import pandas as pd
import os

# Switch to the project checkout so the relative CSV paths below resolve.
# NOTE(review): this is an absolute path on one machine - adjust it when
# running the notebook anywhere else.
project_root = r"c:\Users\JosephVovrosh\personal_git\Kaggle-Titanic"
os.chdir(project_root)

# Read the raw train/test splits; later cells work on copies of these frames.
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In the following treatment of the data, each column is handled separately (a modular approach) so that future improvements are easy to implement.

Treatment:
- Name: ignored
- Survived: unchanged
- Pclass: unchanged
- Sex: 0 for man, 1 for woman
- Age: unchanged
- SibSp: unchanged
- Parch: unchanged
- Ticket: Ignored
- Fare: unchanged
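- Cabin: average over the listed cabins of 1000 * (letter index in the alphabet, A = 0) + cabin number; rows whose cabin entry contains a letter with no number are dropped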
- Embarked: 1 for S, 0 for any other port (e.g. C)

In [2]:
def get_missing_coordinates(df):
    """Return the (row label, column label) pair of every missing cell in *df*.

    Cells are reported row by row, and within a row in column order.
    Missing values are detected with one vectorized ``df.isna()`` call
    rather than a per-cell ``pd.isna`` on rows produced by ``iterrows()``.

    Args:
        df: the DataFrame to scan.

    Returns:
        A list of ``(row_label, column_label)`` tuples; empty when nothing
        is missing.
    """
    # Boolean matrix with the same shape as df: True where the cell is NA.
    na_mask = df.isna().to_numpy()
    column_labels = list(df.columns)
    return [
        (row_label, column_label)
        for row_label, row_flags in zip(df.index, na_mask)
        for column_label, is_missing in zip(column_labels, row_flags)
        if is_missing
    ]

def average_cabin_value(code_string):
    """Return the mean numeric score of the cabin codes in *code_string*.

    The string is split on whitespace into codes such as ``"C85"``. Each
    code scores ``1000 * (letter - 'A') + number``, so ``"A5"`` scores 5
    and ``"C85"`` scores 2085. The result is the average over all codes.

    Args:
        code_string: whitespace-separated cabin codes.

    Returns:
        The average score as a float, or ``None`` when the string holds no
        codes at all or a code that is a single character (no number).

    Raises:
        ValueError: if the text after a code's first character is not an
            integer.
    """
    parts = code_string.split()
    if not parts:
        # Empty or whitespace-only input: nothing to average. Without this
        # guard the division at the end raises ZeroDivisionError.
        return None

    total = 0
    for part in parts:
        if len(part) == 1:
            # A lone character has no cabin number, so the entry cannot be scored.
            return None
        letter_value = ord(part[0]) - ord('A')
        total += 1000 * letter_value + int(part[1:])

    return total / len(parts)

In [3]:
# Columns that this treatment ignores entirely.
unused_columns = ['PassengerId', 'Name', 'Ticket']

# Work on copies so the raw frames stay untouched.
df_train = df_train_raw.copy().drop(columns=unused_columns)

df_test = df_test_raw.copy().drop(columns=unused_columns)

In [4]:
# Report the number of missing cells in the raw training data, drop every
# incomplete row from the working frame, then confirm nothing is left.
train_missing_before = len(get_missing_coordinates(df_train_raw))
print(f"# of training data missing: {train_missing_before}")
df_train = df_train.dropna()
train_missing_after = len(get_missing_coordinates(df_train))
print(f"# of training data missing after clean up: {train_missing_after}")


# Same procedure for the test data.
test_missing_before = len(get_missing_coordinates(df_test_raw))
print(f"# of test data missing: {test_missing_before}")
df_test = df_test.dropna()
test_missing_after = len(get_missing_coordinates(df_test))
print(f"# of test data missing after clean up: {test_missing_after}")

# of training data missing: 866
# of training data missing after clean up: 0
# of test data missing: 414
# of test data missing after clean up: 0


In [5]:
# Encode the categorical columns of the training frame as numbers.
# Whole-column operations replace the earlier row-by-row rewrite through
# iterrows()/.loc, so each encoded column gets a numeric dtype directly.
df_train_clean = df_train.copy()
df_train_clean['Sex'] = (df_train['Sex'] == 'female').astype(int)  # 1 = female, 0 = anything else
# average_cabin_value returns None for entries it cannot score; the float
# cast turns those into NaN so the dropna() below removes the rows.
df_train_clean['Cabin'] = df_train['Cabin'].map(average_cabin_value).astype(float)
df_train_clean['Embarked'] = (df_train['Embarked'] == 'S').astype(int)  # 1 = S, 0 = any other port

# Remove rows whose cabin could not be scored and renumber the index from 0.
df_train_clean = df_train_clean.dropna().reset_index(drop=True)
print(f"# of training data missing after further clean up: {len(get_missing_coordinates(df_train_clean))}")

# of training data missing after further clean up: 0


In [6]:
# Encode the categorical columns of the test frame as numbers.
# Whole-column operations replace the earlier row-by-row rewrite through
# iterrows()/.loc, so each encoded column gets a numeric dtype directly.
df_test_clean = df_test.copy()
df_test_clean['Sex'] = (df_test['Sex'] == 'female').astype(int)  # 1 = female, 0 = anything else
# average_cabin_value returns None for entries it cannot score; the float
# cast turns those into NaN so the dropna() below removes the rows.
df_test_clean['Cabin'] = df_test['Cabin'].map(average_cabin_value).astype(float)
df_test_clean['Embarked'] = (df_test['Embarked'] == 'S').astype(int)  # 1 = S, 0 = any other port

# Remove rows whose cabin could not be scored and renumber the index from 0.
df_test_clean = df_test_clean.dropna().reset_index(drop=True)
print(f"# of test data missing after further clean up: {len(get_missing_coordinates(df_test_clean))}")

# of test data missing after further clean up: 0


Save clean data:

In [7]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout.
os.makedirs('attempt_1', exist_ok=True)

# Write the cleaned frames without the index column.
df_train_clean.to_csv('attempt_1/training_data.csv', index=False)
df_test_clean.to_csv('attempt_1/test_data.csv', index=False)