In [1]:
import numpy as np
import pandas as pd

# Load the raw Kaggle train/test splits; both frames are only read from below,
# every derived dataset is built from copies.
train, test = (
    pd.read_csv('../data/kaggle/train.csv'),
    pd.read_csv('../data/kaggle/test.csv'),
)

In [2]:
# Preview the first rows of the raw training frame to check the columns loaded.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
def process(data):
    """Build the baseline feature frame from a raw passenger DataFrame.

    The input is not modified; a processed copy is returned.

    Steps (column order of the result follows this order of operations):
      * ``Miss``: 1 where ``Age`` is missing in the input, else 0.
      * ``Sex``: mapped to 0 (female) / 1 (male); other values become NaN.
      * ``Embarked``: missing values filled with the most frequent value of
        this frame, then one-hot encoded into ``Embarked_*`` integer columns.
      * ``Name``, ``Ticket`` and ``Cabin`` are dropped.
      * ``Age`` and ``Fare``: missing values filled with the median of the
        same column in this frame.
      * ``Relative``: ``Parch + SibSp``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Age, Sex, Embarked, Name, Ticket, Cabin,
        Fare, Parch and SibSp. Any other columns are passed through.

    Returns
    -------
    pandas.DataFrame
        A new frame with the features described above.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    data_processed = data.copy()

    # Record missingness before Age is imputed further down.
    data_processed['Miss'] = data_processed['Age'].isnull().astype(int)
    data_processed['Sex'] = data_processed['Sex'].map({'female': 0, 'male': 1})

    embarked = data_processed['Embarked']
    embarked_mode = embarked.mode()
    # mode() is empty when there is no non-null value (e.g. an empty frame);
    # indexing [0] would raise, so only impute when a mode exists.
    if not embarked_mode.empty:
        embarked = embarked.fillna(embarked_mode[0])

    # Always emit the C/Q/S indicator columns, even when a port does not occur
    # in this particular frame, so separately processed train and test frames
    # share the same feature columns. Any other observed value is kept too.
    ports = sorted({'C', 'Q', 'S'} | set(embarked.dropna().unique()))
    data_processed['Embarked'] = pd.Categorical(embarked, categories=ports)
    data_processed = pd.get_dummies(
        data_processed, columns=['Embarked'], prefix='Embarked', dtype=int
    )

    data_processed = data_processed.drop(columns=['Name', 'Ticket', 'Cabin'])

    # Median imputation uses the statistics of the frame being processed.
    for column in ('Age', 'Fare'):
        data_processed[column] = data_processed[column].fillna(
            data_processed[column].median()
        )

    data_processed['Relative'] = data_processed['Parch'] + data_processed['SibSp']
    return data_processed

# Baseline variant: SibSp, Parch and Relative are all kept.
# PassengerId is removed from the training frame only; the test frame keeps it
# (presumably for building the submission file - confirm with downstream use).
train_processed = process(train).drop(columns=['PassengerId'])
test_processed = process(test)

In [7]:
# Preview the baseline processed training frame.
train_processed.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Miss,Embarked_C,Embarked_Q,Embarked_S,Relative
0,0,3,1,22.0,1,0,7.25,0,0,0,1,1
1,1,1,0,38.0,1,0,71.2833,0,1,0,0,1
2,1,3,0,26.0,0,0,7.925,0,0,0,1,0
3,1,1,0,35.0,1,0,53.1,0,0,0,1,1
4,0,3,1,35.0,0,0,8.05,0,0,0,1,0


In [8]:
# Persist the baseline variant (SibSp, Parch and Relative all present).
# index=False keeps the DataFrame row index out of the CSV files.
train_processed.to_csv('../data/relative/train.csv', index=False)
test_processed.to_csv('../data/relative/test.csv', index=False)

In [4]:
def drop_sibsp(data):
    """Return the baseline features from ``process`` without the ``SibSp`` column.

    The preprocessing itself is delegated to ``process`` (defined earlier in
    this notebook) instead of repeating it, so the variants cannot drift
    apart. ``Relative`` (Parch + SibSp) is computed before ``SibSp`` is
    removed and stays in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger frame accepted by ``process``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` is not modified.
    """
    return process(data).drop(columns=['SibSp'])

# SibSp-free variant. PassengerId is removed from the training frame only;
# the test frame keeps it.
train_dropsibsp = drop_sibsp(train).drop(columns=['PassengerId'])
test_dropsibsp = drop_sibsp(test)

In [5]:
# Preview the SibSp-free training frame.
train_dropsibsp.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Miss,Embarked_C,Embarked_Q,Embarked_S,Relative
0,0,3,1,22.0,0,7.25,0,0,0,1,1
1,1,1,0,38.0,0,71.2833,0,1,0,0,1
2,1,3,0,26.0,0,7.925,0,0,0,1,0
3,1,1,0,35.0,0,53.1,0,0,0,1,1
4,0,3,1,35.0,0,8.05,0,0,0,1,0


In [6]:
# Persist the SibSp-free variant.
# Fix: this cell previously wrote train_processed/test_processed (which still
# contain SibSp) into the dropsibsp directory, so the saved files did not
# match the variant their path names.
train_dropsibsp.to_csv('../data/dropsibsp/train.csv', index=False)
test_dropsibsp.to_csv('../data/dropsibsp/test.csv', index=False)

In [9]:
def drop_parch(data):
    """Return the baseline features from ``process`` without the ``Parch`` column.

    The preprocessing itself is delegated to ``process`` (defined earlier in
    this notebook) instead of repeating it, so the variants cannot drift
    apart. ``Relative`` (Parch + SibSp) is computed before ``Parch`` is
    removed and stays in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger frame accepted by ``process``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` is not modified.
    """
    return process(data).drop(columns=['Parch'])

# Parch-free variant. PassengerId is removed from the training frame only;
# the test frame keeps it.
train_dropparch = drop_parch(train).drop(columns=['PassengerId'])
test_dropparch = drop_parch(test)

# Write both frames without the row index.
train_dropparch.to_csv('../data/dropparch/train.csv', index=False)
test_dropparch.to_csv('../data/dropparch/test.csv', index=False)

In [10]:
# Preview the Parch-free training frame.
train_dropparch.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Miss,Embarked_C,Embarked_Q,Embarked_S,Relative
0,0,3,1,22.0,1,7.25,0,0,0,1,1
1,1,1,0,38.0,1,71.2833,0,1,0,0,1
2,1,3,0,26.0,0,7.925,0,0,0,1,0
3,1,1,0,35.0,1,53.1,0,0,0,1,1
4,0,3,1,35.0,0,8.05,0,0,0,1,0


In [14]:
def drop_both(data):
    """Return the baseline features from ``process`` without ``Parch`` and ``SibSp``.

    The preprocessing itself is delegated to ``process`` (defined earlier in
    this notebook) instead of repeating it, so the variants cannot drift
    apart. ``Relative`` (Parch + SibSp) is computed before the two source
    columns are removed and is the only family-size feature left.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger frame accepted by ``process``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` is not modified.
    """
    return process(data).drop(columns=['Parch', 'SibSp'])

# Variant with neither Parch nor SibSp (only Relative remains).
# PassengerId is removed from the training frame only; the test frame keeps it.
train_dropboth = drop_both(train).drop(columns=['PassengerId'])
test_dropboth = drop_both(test)

# Write both frames without the row index.
train_dropboth.to_csv('../data/dropboth/train.csv', index=False)
test_dropboth.to_csv('../data/dropboth/test.csv', index=False)

In [15]:
# Preview the training frame with both Parch and SibSp removed.
train_dropboth.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Miss,Embarked_C,Embarked_Q,Embarked_S,Relative
0,0,3,1,22.0,7.25,0,0,0,1,1
1,1,1,0,38.0,71.2833,0,1,0,0,1
2,1,3,0,26.0,7.925,0,0,0,1,0
3,1,1,0,35.0,53.1,0,0,0,1,1
4,0,3,1,35.0,8.05,0,0,0,1,0
