# Let's do some feature engineering with the existing data to make it cooler....

## Setup

In [2]:
# importing that stuff
import pandas as pd
from category_encoders.one_hot import OneHotEncoder

In [3]:
# Load the raw training set and, repeating the cleansing step from data exploration,
# strip the columns that were judged unnecessary there
df_train = pd.read_csv('../data/raw/train.csv').drop(
    columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
)

# Split into the target (y, kept as a one-column DataFrame) and the input features (X)
y = df_train[['Survived']]
X = df_train.drop(columns = ['Survived'])

In [4]:
# previewing the first five rows of the feature frame
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [5]:
# listing dtypes and non-null counts to spot which features have missing values
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


## Boink! Let's encode the categorical features...
### Sort out the gender/sex into their own binary columns

In [6]:
# instantiating the 'One Hot Encoder' object
# handle_unknown = 'value' is the documented category_encoders setting that encodes a
# category unseen during fit as an all-zero row; 'ignore' is scikit-learn's spelling
# and is not one of the options category_encoders documents
sex_ohe = OneHotEncoder(use_cat_names = True, handle_unknown = 'value')

In [7]:
# encoding the 'Sex' column using OHE
# fit_transform learns the categories and returns the indicator columns in one call
sex_dummies = sex_ohe.fit_transform(X['Sex'])

In [8]:
# previewing the generated indicator columns
sex_dummies.head()

Unnamed: 0,Sex_male,Sex_female
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0


In [9]:
# appending the new indicator columns to the dataset and, in the same step,
# removing the original 'Sex' column they replace
X = pd.concat([X, sex_dummies], axis = 1).drop(columns = ['Sex'])

In [10]:
# displaying the frame to confirm 'Sex' was replaced by the two indicator columns
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male,Sex_female
0,3,22.0,1,0,7.2500,S,1,0
1,1,38.0,1,0,71.2833,C,0,1
2,3,26.0,0,0,7.9250,S,0,1
3,1,35.0,1,0,53.1000,S,0,1
4,3,35.0,0,0,8.0500,S,1,0
...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,S,1,0
887,1,19.0,0,0,30.0000,S,0,1
888,3,,1,2,23.4500,S,0,1
889,1,26.0,0,0,30.0000,C,1,0


### One-hot encode Pclass so it isn't treated as an ordered number ;)

In [11]:
# working on a copy so X itself is left untouched while the Pclass encoding is built;
# the copy is only assigned back to X once the result has been inspected

df_get_dummies = X.copy()

In [12]:
# encoding Pclass to remove numerical importance
# dtype = int pins the indicators to 0/1 integers so they match the encoder-generated
# indicator columns; pandas 2.x would otherwise emit True/False booleans here
pclass_encodings = pd.get_dummies(df_get_dummies['Pclass'], prefix = 'class', dtype = int)

## Should I have used a fit transform???
# NOTE(review): get_dummies keeps no fitted state, so it only creates columns for the
# classes present in the frame it is given; a fitted encoder (as used for 'Sex') would
# reproduce the same columns on other data - worth revisiting before encoding test data

# swapping the original, now unneeded, 'Pclass' column for its indicator columns
df_get_dummies = pd.concat(
    [df_get_dummies.drop(columns = ['Pclass']), pclass_encodings],
    axis = 1,
)

## Now we shall play with the point of embarkation...

Don't forget the blanks ;) — two passengers have no `Embarked` value.

In [13]:
# inspecting the working copy: 'Pclass' should be gone and class_1..class_3 present
df_get_dummies

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked,Sex_male,Sex_female,class_1,class_2,class_3
0,22.0,1,0,7.2500,S,1,0,0,0,1
1,38.0,1,0,71.2833,C,0,1,1,0,0
2,26.0,0,0,7.9250,S,0,1,0,0,1
3,35.0,1,0,53.1000,S,0,1,1,0,0
4,35.0,0,0,8.0500,S,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,S,1,0,0,1,0
887,19.0,0,0,30.0000,S,0,1,1,0,0
888,,1,2,23.4500,S,0,1,0,0,1
889,26.0,0,0,30.0000,C,1,0,1,0,0


In [14]:
# promoting the inspected copy to be the working feature frame
# (X and df_get_dummies now refer to the same DataFrame object)
X = df_get_dummies

In [15]:
# re-displaying X to confirm it now carries the class_* columns
X

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked,Sex_male,Sex_female,class_1,class_2,class_3
0,22.0,1,0,7.2500,S,1,0,0,0,1
1,38.0,1,0,71.2833,C,0,1,1,0,0
2,26.0,0,0,7.9250,S,0,1,0,0,1
3,35.0,1,0,53.1000,S,0,1,1,0,0
4,35.0,0,0,8.0500,S,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,S,1,0,0,1,0
887,19.0,0,0,30.0000,S,0,1,1,0,0
888,,1,2,23.4500,S,0,1,0,0,1
889,26.0,0,0,30.0000,C,1,0,1,0,0


In [16]:
# listing the rows that have no 'Embarked' value
X.loc[X['Embarked'].isna()]

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked,Sex_male,Sex_female,class_1,class_2,class_3
61,38.0,0,0,80.0,,0,1,1,0,0
829,62.0,0,0,80.0,,0,1,1,0,0


In [17]:
# instantiating a new One Hot Encoder object for embarked port
# handle_unknown = 'value' is the documented category_encoders setting that encodes a
# port unseen during fit as an all-zero row; 'ignore' is scikit-learn's spelling and
# is not one of the options category_encoders documents
embarked_ohe_encoder = OneHotEncoder(use_cat_names = True, handle_unknown = 'value')

# one-hot-encoding the 'Embarked' column; rows with a missing port end up flagged in
# their own 'Embarked_nan' indicator column rather than being dropped
embarked_dummies = embarked_ohe_encoder.fit_transform(X['Embarked'])

# merging the 'Embarked' dummies back into X and removing the original column
# in a single, non-in-place expression
X = pd.concat([X, embarked_dummies], axis = 1).drop(columns = ['Embarked'])

In [18]:
# checking the last five rows to confirm the Embarked_* indicator columns were added
X.tail()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Sex_female,class_1,class_2,class_3,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan
886,27.0,0,0,13.0,1,0,0,1,0,1,0,0,0
887,19.0,0,0,30.0,0,1,1,0,0,1,0,0,0
888,,1,2,23.45,0,1,0,0,1,1,0,0,0
889,26.0,0,0,30.0,1,0,1,0,0,0,1,0,0
890,32.0,0,0,7.75,1,0,0,0,1,0,0,1,0


## Dealing with classifying the age

I want finer-grained age bands than the reference notebook (https://github.com/dkhundley/titanic-byoc/blob/main/notebooks/feature-engineering.ipynb) uses, especially to separate baby/toddler and young adult/adult.

In [21]:
# filling null values with the median age (28)
# _ = 28

# X['Age'].fillna(_, inplace = True)

##### BONK!  Not doing that... 'Age' is deliberately left un-imputed here.

## What's missing?

In [77]:
# loading in the data (test and training sets)
test_data = pd.read_csv('../data/raw/test.csv')
train_data = pd.read_csv('../data/raw/train.csv')

# combining the datasets, this maybe stupid, maybe genius....
# (DataFrame.append is deprecated, hence pd.concat)
# ignore_index = True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of the two files overlap and a lookup such as combined.loc[0] returns two rows
combined = pd.concat([train_data, test_data], ignore_index = True)

## analysing the data
# share of missing values per column, in percent, largest first
# NOTE(review): the 'Survived' figure presumably just reflects test.csv having no
# 'Survived' column - confirm before treating it as genuinely missing data
missing_counts = combined.isnull().sum().sort_values(ascending = False)
missing_pct = round(missing_counts / len(combined) * 100, 1)
print('Missing values Percentage: \n\n', missing_pct)

Missing values Percentage: 

 Cabin          77.5
Survived       31.9
Age            20.1
Embarked        0.2
Fare            0.1
PassengerId     0.0
Pclass          0.0
Name            0.0
Sex             0.0
SibSp           0.0
Parch           0.0
Ticket          0.0
dtype: float64
