In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import multiprocessing

np.random.seed(42)

In [32]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
y_train = train['Survived']

train.drop(['Survived', 'Cabin', 'Ticket', 'Name'], axis=1, inplace=True)
test.drop(['Cabin', 'Ticket', 'Name'], axis=1, inplace=True)

train.set_index('PassengerId', inplace=True)
test.set_index('PassengerId', inplace=True)

In [33]:
genders = train['Sex'].unique()
pclasses = train['Pclass'].unique()

genders.sort()
pclasses.sort()

print(genders)
print(pclasses)

['female' 'male']
[1 2 3]


In [34]:
#Function to separate passengers by 'Sex' and 'Pclass'
def separate_passengers_by_gender_and_class(df, genders, pclasses):
    frames = []
    for gender in genders:
        for pclass in pclasses:
            frame = df[(df['Sex'] == gender) & (df['Pclass'] == pclass)].copy()
            frames.append(frame)
    return frames

In [35]:
first_class_women, second_class_women, third_class_women, first_class_men, second_class_men, third_class_men = separate_passengers_by_gender_and_class(train, genders, pclasses)
frames_list = [first_class_men, second_class_men, third_class_men, first_class_women, second_class_women, third_class_women]

In [36]:
#List comprehension to calculate median age for each dataframe in list of dataframes
frames_list = [first_class_men, second_class_men, third_class_men, first_class_women, second_class_women, third_class_women]
median_ages = [df['Age'].median() for df in frames_list]
median_ages

[40.0, 30.0, 25.0, 35.0, 28.0, 21.5]

In [37]:
#Replace impute missing ages with median ages for each sex and class pair
for i in range (0, len(frames_list)):
    frames_list[i]['Age'].fillna(median_ages[i], inplace=True)

#Stitch together the dataframes
train_data = pd.concat(frames_list)

#Sort the dataframe by PassengerId
train_data.sort_values(by='PassengerId', inplace=True)  

In [38]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 55.7+ KB


In [39]:
#Create list of attributes for preprocessing pipeline
num_attribs = ['SibSp', 'Parch', 'Fare', 'Age']
cat_attribs = ['Pclass', 'Embarked', 'Sex']

In [40]:
#Create pipeline for preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),   
])

cat_pipeline = Pipeline([
    ("one_hot_encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="median")),
])

In [41]:
#Create preprocessing pipeline
from sklearn.compose import ColumnTransformer

preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

X_train = preprocess_pipeline.fit_transform(train_data[num_attribs + cat_attribs])


X_train

array([[ 0.43279337, -0.47367361, -0.50244517, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.43279337, -0.47367361,  0.78684529, ...,  0.        ,
         1.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.48885426, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.43279337,  2.00893337, -0.17626324, ...,  0.        ,
         1.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.04438104, ...,  0.        ,
         0.        ,  1.        ],
       [-0.4745452 , -0.47367361, -0.49237783, ...,  0.        ,
         0.        ,  1.        ]])

In [42]:
import pandas as pd

pd.to_pickle(X_train, '../data/X_train_v2.pkl')
pd.to_pickle(y_train, '../data/y_train_v2.pkl')

In [43]:
X_test = preprocess_pipeline.transform(test[num_attribs + cat_attribs])

In [44]:
pd.to_pickle(X_test, '../data/X_test_v2.pkl')