## Settings

In [2]:
import sys
sys.path.append('..')

In [35]:
import os
import random
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from inputs.funcs import load_data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [4]:
os.listdir('../inputs/')

['test.csv',
 '__init__.py',
 '__pycache__',
 'funcs.py',
 'train.csv',
 'gender_submission.csv']

In [5]:
train_x, train_y, test_x = load_data()

In [6]:
print('train_x: ', train_x.columns.values)
print('train_y: ', test_x.columns.values)

train_x:  ['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']
train_y:  ['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']


In [7]:
train_x.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
print('Col to drop:')
print(['PassengeId'])

Col to drop:
['PassengeId']


In [33]:
print('Cols to normalize:')
print(['Age', 'Fare'])

Cols to normalize:
['Age', 'Fare']


In [34]:
print('Cols to one hot encode:')
print(['PClass'] + train_x.dtypes[~train_x.dtypes.isin([int, float])].index.to_list())

Cols to one hot encode:
['PClass', 'Name', 'Sex', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## Cols to nomalize

In [9]:
print('Cols to normalize:')
print(['Age', 'Fare'])

Cols to normalize:
['Age', 'Fare']


### Age 

In [11]:
train_x, train_y, test_x = load_data()

In [12]:
print('Col has na in train_x: ', train_x['Age'].isna().any())
print('Col has na in test_x: ', test_x['Age'].isna().any())

Col has na in train_x:  True
Col has na in test_x:  True


#### Fill na by target encoding

In [13]:
data_tmp = pd.DataFrame({'Age': train_x['Age'], 'Survived': train_y})
target_mean = data_tmp.groupby('Survived').mean().to_dict()
index_nan_0 = train_x['Age'].isna() & train_y == 0
index_nan_1 = train_x['Age'].isna() & train_y == 1
train_x.loc[index_nan_0, 'Age'] = target_mean['Age'][0]
train_x.loc[index_nan_1, 'Age'] = target_mean['Age'][1]

In [14]:
random.seed(0)
index_nan = test_x['Age'].isna()
test_x['Age'] = test_x['Age'].apply(lambda x:
                                         random.choice([target_mean['Age'][0], target_mean['Age'][1]]) if np.isnan(x) else x
                                    )

In [15]:
scaler = StandardScaler()
train_x['Age'] = scaler.fit_transform(train_x['Age'].values.reshape(len(train_x), -1)).ravel()
test_x['Age'] = scaler.transform(test_x['Age'].values.reshape(len(test_x), -1)).ravel()

In [16]:
train_x[['PassengerId', 'Age']].to_feather('../features/nn/age_train_std_te.feather')
test_x[['PassengerId', 'Age']].to_feather('../features/nn/age_test_std_te.feather')

#### Fill na with average

In [17]:
train_x, train_y, test_x = load_data()

In [18]:
mean = train_x['Age'].mean()
index_nan = train_x['Age'].isna()
train_x.loc[index_nan, 'Age'] = mean

In [19]:
index_nan = test_x['Age'].isna()
test_x.loc[index_nan, 'Age'] = mean

In [20]:
scaler = StandardScaler()
train_x['Age'] = scaler.fit_transform(train_x['Age'].values.reshape(len(train_x), -1)).ravel()
test_x['Age'] = scaler.transform(test_x['Age'].values.reshape(len(test_x), -1)).ravel()

In [21]:
train_x[['PassengerId', 'Age']].to_feather('../features/nn/age_train_std_mean.feather')
test_x[['PassengerId', 'Age']].to_feather('../features/nn/age_test_std_mean.feather')

### Fare

In [22]:
train_x, train_y, test_x = load_data()

In [23]:
print('Col has na in train_x: ', train_x['Age'].isna().any())
print('Col has na in test_x: ', test_x['Age'].isna().any())

Col has na in train_x:  True
Col has na in test_x:  True


#### Fill na by target encoding

In [24]:
data_tmp = pd.DataFrame({'Fare': train_x['Fare'], 'Survived': train_y})
target_mean = data_tmp.groupby('Survived').mean().to_dict()
index_nan_0 = train_x['Fare'].isna() & train_y == 0
index_nan_1 = train_x['Fare'].isna() & train_y == 1
train_x.loc[index_nan_0, 'Fare'] = target_mean['Fare'][0]
train_x.loc[index_nan_1, 'Fare'] = target_mean['Fare'][1]

In [25]:
random.seed(0)
index_nan = test_x['Fare'].isna()
test_x['Fare'] = test_x['Fare'].apply(lambda x:
                                         random.choice([target_mean['Fare'][0], target_mean['Fare'][1]]) if np.isnan(x) else x
                                     )

In [26]:
scaler = StandardScaler()
train_x['Fare'] = scaler.fit_transform(train_x['Fare'].values.reshape(len(train_x), -1)).ravel()
test_x['Fare'] = scaler.transform(test_x['Fare'].values.reshape(len(test_x), -1)).ravel()

In [27]:
train_x[['PassengerId', 'Fare']].to_feather('../features/nn/fare_train_std_te.feather')
test_x[['PassengerId', 'Fare']].to_feather('../features/nn/fare_test_std_te.feather')

#### Fill na with average

In [28]:
train_x, train_y, test_x = load_data()

In [29]:
mean = train_x['Fare'].mean()
index_nan = train_x['Fare'].isna()
train_x.loc[index_nan, 'Fare'] = mean

In [30]:
index_nan = test_x['Fare'].isna()
test_x.loc[index_nan, 'Fare'] = mean

In [31]:
scaler = StandardScaler()
train_x['Fare'] = scaler.fit_transform(train_x['Fare'].values.reshape(len(train_x), -1)).ravel()
test_x['Fare'] = scaler.transform(test_x['Fare'].values.reshape(len(test_x), -1)).ravel()

In [32]:
train_x[['PassengerId', 'Fare']].to_feather('../features/nn/fare_train_std_mean.feather')
test_x[['PassengerId', 'Fare']].to_feather('../features/nn/fare_test_std_mean.feather')

## Cols to one hot encode

In [119]:
train_x, train_y, test_x = load_data()
cat_cols = ['Pclass'] + train_x.dtypes[
                ~train_x.dtypes.isin([np.dtype('int64'), np.dtype('float64')])
            ].index.to_list()
print('Cols to one hot encode:')
print(cat_cols)

Cols to one hot encode:
['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [120]:
train_x[cat_cols].isna().any()

Pclass      False
Name        False
Sex         False
Ticket      False
Cabin        True
Embarked     True
dtype: bool

### Pclass, Sex, Ticket

In [140]:
train_x, train_y, test_x = load_data()

In [141]:
cols_to_ohe = ['Pclass', 'Sex', 'Ticket']
all_x = pd.concat([train_x, test_x])
all_x = all_x[['PassengerId'] + cols_to_ohe]
all_x = pd.get_dummies(all_x, columns=cols_to_ohe)

In [142]:
is_in_train = all_x['PassengerId'].isin(train_x['PassengerId'])
is_in_test = all_x['PassengerId'].isin(test_x['PassengerId'])

In [143]:
all_x[is_in_train].to_feather('../features/nn/train_ohe.feather')
all_x[is_in_test].to_feather('../features/nn/test_ohe.feather')

### Cabin, Embarked

In [165]:
train_x, train_y, test_x = load_data()

In [166]:
cols_to_ohe = ['Cabin', 'Embarked']
all_x = pd.concat([train_x, test_x])
all_x = all_x[['PassengerId'] + cols_to_ohe]

In [167]:
all_x['Cabin'] = all_x['Cabin'].apply(
    lambda x: 'Z' if type(x) == 'float' and np.isnan(x) else x
)
all_x['Embarked'] = all_x['Embarked'].apply(
    lambda x: 'Z' if type(x) == 'float' and np.isnan(x) else x
)

In [168]:
all_x = pd.get_dummies(all_x, columns=cols_to_ohe)

In [169]:
is_in_train = all_x['PassengerId'].isin(train_x['PassengerId'])
is_in_test = all_x['PassengerId'].isin(test_x['PassengerId'])

In [170]:
all_x[is_in_train].to_feather('../features/nn/train_ohe_2_z.feather')
all_x[is_in_test].to_feather('../features/nn/test_ohe_2_z.feather')