## Settings

In [1]:
import sys
sys.path.append('..')

In [2]:
import os
import random
from scipy import stats
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from inputs.funcs import load_data

In [3]:
os.listdir('../inputs/')

['test.csv',
 '__init__.py',
 '__pycache__',
 'funcs.py',
 'train.csv',
 'gender_submission.csv']

In [4]:
train_x, train_y, test_x = load_data()

In [5]:
print('train_x: ', train_x.columns.values)
print('train_y: ', test_x.columns.values)

train_x:  ['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']
train_y:  ['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']


In [6]:
train_x.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [7]:
print('Col to drop:')
print(['PassengeId'])

Col to drop:
['PassengeId']


In [8]:
print('Cols to normalize:')
print(['Age', 'Fare'])

Cols to normalize:
['Age', 'Fare']


In [9]:
print('Cols to one hot encode:')
print(['PClass'] + train_x.dtypes[~train_x.dtypes.isin([int, float])].index.to_list())

Cols to one hot encode:
['PClass', 'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## Cols to nomalize

### Age 

In [10]:
train_x, train_y, test_x = load_data()

In [11]:
print('Col has na in train_x: ', train_x['Age'].isna().any())
print('Col has na in test_x: ', test_x['Age'].isna().any())

Col has na in train_x:  True
Col has na in test_x:  True


#### Fill na by target encoding

In [12]:
data_tmp = pd.DataFrame({'Age': train_x['Age'], 'Survived': train_y})
target_mean = data_tmp.groupby('Survived').mean().to_dict()
index_nan_0 = train_x['Age'].isna() & train_y == 0
index_nan_1 = train_x['Age'].isna() & train_y == 1
train_x.loc[index_nan_0, 'Age'] = target_mean['Age'][0]
train_x.loc[index_nan_1, 'Age'] = target_mean['Age'][1]

In [13]:
random.seed(0)
index_nan = test_x['Age'].isna()
test_x['Age'] = test_x['Age'].apply(lambda x:
                                         random.choice([target_mean['Age'][0], target_mean['Age'][1]]) if np.isnan(x) else x
                                    )

In [14]:
scaler = StandardScaler()
train_x['Age'] = scaler.fit_transform(train_x['Age'].values.reshape(len(train_x), -1)).ravel()
test_x['Age'] = scaler.transform(test_x['Age'].values.reshape(len(test_x), -1)).ravel()

In [15]:
train_x[['PassengerId', 'Age']].to_feather('../features/nn/age_train_std_te.feather')
test_x[['PassengerId', 'Age']].to_feather('../features/nn/age_test_std_te.feather')

#### Fill na with average

In [16]:
train_x, train_y, test_x = load_data()

In [17]:
mean = train_x['Age'].mean()
index_nan = train_x['Age'].isna()
train_x.loc[index_nan, 'Age'] = mean

In [18]:
index_nan = test_x['Age'].isna()
test_x.loc[index_nan, 'Age'] = mean

In [19]:
scaler = StandardScaler()
train_x['Age'] = scaler.fit_transform(train_x['Age'].values.reshape(len(train_x), -1)).ravel()
test_x['Age'] = scaler.transform(test_x['Age'].values.reshape(len(test_x), -1)).ravel()

In [20]:
train_x[['PassengerId', 'Age']].to_feather('../features/nn/age_train_std_mean.feather')
test_x[['PassengerId', 'Age']].to_feather('../features/nn/age_test_std_mean.feather')

### Fare