In [82]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### 1. Loading the data

In [22]:
# Locations of the raw train / test CSV files, relative to the notebook.
train_path, test_path = 'data/train.csv', 'data/test.csv'

In [23]:
# Load both CSV files into DataFrames; these raw frames are kept untouched
# and later cells work on copies of them.
train_raw_data, test_raw_data = pd.read_csv(train_path), pd.read_csv(test_path)

### 2. Exploring the data

In [24]:
# Preview the first five rows of the raw training data.
train_raw_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [25]:
# Number of missing values per column of the raw training data.
train_raw_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### Summary

* Remove 'Cabin' column, because there are a lot of missing values
* Encode categorical variables such as 'Sex' as dummy variables
* Remove the 2 rows with a missing 'Embarked' value, because we cannot fill them
* Fill missing values in 'Age' column by the mean of ages
* Remove 'Name', 'PassengerId', 'Ticket' columns, since they don't make any sense as features
* Normalize all columns except 'Sex' (which becomes 'Gender')

### 3. Removing unnecessary columns

In [26]:
# Work on copies so the raw frames stay available for inspection.
train_data, test_data = train_raw_data.copy(), test_raw_data.copy()

In [27]:
# Every preprocessing step has to be applied twice (once to the train data,
# once to the test data), so each step is wrapped in a function.

def remove_unnec_columns(data):
    """Return a new DataFrame without the identifier-like / sparse columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the 'PassengerId', 'Name', 'Ticket' and 'Cabin'
        columns (a missing column raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        A new frame with those four columns removed; `data` is not modified.
    """
    unneeded = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    return data.drop(columns=unneeded)

In [28]:
# Drop the unneeded columns from both data sets.
train_data, test_data = remove_unnec_columns(train_data), remove_unnec_columns(test_data)

### 4. Dealing with the dummy variables 

In [29]:
def data_with_gender_column(data):
    """Replace the textual 'Sex' column with a binary 'Gender' column.

    'Gender' is 1 where 'Sex' equals 'male' and 0 otherwise (including
    missing values). The new column is appended as the last column and
    'Sex' is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Sex' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    # A direct comparison (instead of get_dummies(..., drop_first=True)['male'])
    # also works when the frame contains only one of the two categories.
    is_male = data['Sex'].eq('male').astype('int64')
    return data.drop(columns=['Sex']).assign(Gender=is_male)

# Encode 'Sex' as the binary 'Gender' column in both data sets.
train_data, test_data = (
    data_with_gender_column(train_data),
    data_with_gender_column(test_data),
)

### 5. Filling the ages

In [32]:
def filling_age(data, fill_value=None):
    """Fill missing values of the 'Age' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'Age' column.
    fill_value : float, optional
        Value used for the missing ages. When omitted, the mean of
        ``data['Age']`` is used. Passing the training mean here lets the
        test set be imputed with the same value as the training set.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Age' filled; the frame passed in is left unchanged.
    """
    if fill_value is None:
        fill_value = data['Age'].mean()
    # assign() returns a new frame and keeps 'Age' at its original position.
    return data.assign(Age=data['Age'].fillna(fill_value))

In [35]:
# Impute the missing ages; each data set is filled with its own mean age.
train_data, test_data = filling_age(train_data), filling_age(test_data)

### 6. Dropping Nas

In [38]:
# Remove every row that still contains at least one missing value.
# NOTE(review): this also removes rows from the test set - confirm that is intended.
train_data, test_data = train_data.dropna(axis=0, how='any'), test_data.dropna(axis=0, how='any')

### 7. Checkpoint

In [41]:
# Checkpoint: snapshot of the cleaned frames before the remaining encoding steps.
train_data_ch1, test_data_ch1 = train_data.copy(), test_data.copy()

### 8. Dummy Embarked

In [52]:
def dummy_embarked(data):
    """Replace the 'Embarked' column with binary 'S' and 'Q' indicator columns.

    'S' is 1 where 'Embarked' equals 'S', 'Q' is 1 where it equals 'Q';
    every other value (e.g. 'C') gets 0 in both columns. The indicators are
    appended as the last columns, in the order 'S', 'Q', and 'Embarked' is
    removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'Embarked' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in (e.g. a checkpoint copy) is left
        unchanged.
    """
    port = data['Embarked']
    # Direct comparisons (instead of indexing get_dummies(..., drop_first=True))
    # do not fail when a port is absent from the frame and do not depend on
    # which category happens to be dropped first.
    return data.drop(columns=['Embarked']).assign(
        S=port.eq('S').astype('int64'),
        Q=port.eq('Q').astype('int64'),
    )

In [55]:
# Encode 'Embarked' for both checkpointed data sets.
unscaled_train_data, unscaled_test_data = (
    dummy_embarked(train_data_ch1),
    dummy_embarked(test_data_ch1),
)

### 9. Scaling

In [66]:
# One StandardScaler instance per data set.
train_scaler, test_scaler = StandardScaler(), StandardScaler()

# Select the five columns that get standardised from each data set.
train_haveTo_scale = unscaled_train_data.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
test_haveTo_scale = unscaled_test_data.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [71]:
# Learn the standardisation parameters from the training features only.
train_scaler.fit(train_haveTo_scale)
scaled_train = train_scaler.transform(train_haveTo_scale)

# Transform the test features with the scaler fitted on the TRAINING data.
# Fitting a second scaler on the test set would put train and test features
# on different scales, so a model trained on one would see inconsistent
# inputs on the other. `test_scaler` is therefore intentionally left unfitted.
scaled_test = train_scaler.transform(test_haveTo_scale)

In [79]:
def add_scalers(data, np_scaled):
    """Return a copy of `data` whose numeric feature columns hold scaled values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns 'Pclass', 'Age', 'SibSp', 'Parch' and 'Fare'.
    np_scaled : 2-D array
        Scaled values; column ``i`` is written to the i-th name of the list
        above, so it needs one row per row of `data` and at least five columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order as `data`; the
        frame passed in keeps its unscaled values.
    """
    scaled_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    # Work on a copy so the caller's "unscaled" frame is not overwritten.
    # (The original `data.drop(...)` call discarded its result and was a no-op.)
    result = data.copy()
    for position, column in enumerate(scaled_columns):
        result[column] = np_scaled[:, position]
    return result

# Put the scaled feature values back into DataFrames.
df_scaled_train, df_scaled_test = (
    add_scalers(unscaled_train_data, scaled_train),
    add_scalers(unscaled_test_data, scaled_test),
)

### 10. Balancing the data  

In [109]:
# Split the training rows by target class.
survived = df_scaled_train.loc[df_scaled_train['Survived'].eq(1)]
not_survived = df_scaled_train.loc[df_scaled_train['Survived'].eq(0)]

In [110]:
# Undersample the 'not survived' class: shuffle it reproducibly, then keep as
# many rows as there are survivors. Using len(survived) instead of a
# hard-coded 340 (presumably the survivor count at the time of writing) keeps
# the classes balanced if the upstream cleaning changes.
shuffled_not_survived = not_survived.sample(frac=1, random_state=42)
not_survived_balanced = shuffled_not_survived.iloc[:len(survived)]

In [113]:
# Stack both classes into one frame with a fresh 0..n-1 index.
balanced_train_data = pd.concat(
    (survived, not_survived_balanced),
    axis=0,
    ignore_index=True,
)

In [115]:
# Shuffle the rows reproducibly so the two classes are interleaved.
balanced_train_data = balanced_train_data.sample(frac=1.0, random_state=42)

### 11. Saving

In [117]:
# Write the preprocessed frames to disk; the DataFrame index is written too.
balanced_train_data.to_csv(path_or_buf='data/preprocessed_train.csv', index=True)
df_scaled_test.to_csv(path_or_buf='data/preprocessed_test.csv', index=True)