### Feature Engineering

Idag så ska vi gå igenom feature engineering. Vi kommer att undersöka ett dataset, och leka runt med lite olika features. 

In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Import our data**

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
# Inspect the data
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Inspect the shape
df_train.shape

(891, 12)

In [6]:
# Store the target variable 
y_train = df_train['Survived']

In [7]:
# Concatenate train and test sets

df = pd.concat([df_train.drop('Survived', axis=1), df_test], axis=0)

In [8]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Observation

Name-kolumnen är fett jobbig, den behöver vi hantera. Det får ni göra själva! 

In [9]:
df["Name"].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [10]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Vi noterar att det finns NaNs i Cabinkolumnen

Detta kan påverka vår survival rate

In [11]:
# Vi kan se att Age, Cabin och Embarked har null-värden.
# Cabin har supermånga! De vill vi hantera!

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB


In [12]:
df["Has_Cabin"] = ~df["Cabin"].isnull() # ~ betyder "not"

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_Cabin
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,True
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,False


- Droppa kolumner som inte har användbar information / som vi inte vet vad vi ska göra med

In [13]:
# Drop columns

df.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,3,male,22.0,1,0,7.25,S,False
1,1,female,38.0,1,0,71.2833,C,True
2,3,female,26.0,0,0,7.925,S,False
3,1,female,35.0,1,0,53.1,S,True
4,3,male,35.0,0,0,8.05,S,False


**Handle missing values**

In [14]:
# Impute the missing values for Age, Fare, Embarked
# Embarked is Categorical, so we impute the most common value
# We inspect that in the next cell with value_counts()

df["Age"].fillna(df["Age"].median(), inplace=True)
df["Fare"].fillna(df["Fare"].median(), inplace=True)
df["Embarked"] = df["Embarked"].fillna("S")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     1309 non-null   int64  
 1   Sex        1309 non-null   object 
 2   Age        1309 non-null   float64
 3   SibSp      1309 non-null   int64  
 4   Parch      1309 non-null   int64  
 5   Fare       1309 non-null   float64
 6   Embarked   1309 non-null   object 
 7   Has_Cabin  1309 non-null   bool   
dtypes: bool(1), float64(2), int64(3), object(2)
memory usage: 83.1+ KB


In [15]:
# Inspect which catergory in Embarked has the most passengers

df["Embarked"].value_counts()

S    916
C    270
Q    123
Name: Embarked, dtype: int64

## Success! 

We have successfully filled in null values! Very good!

## Bin numerical data

We have some numerical data that potentially could be more useful if it was binned. For instance:
- The fare prices seem needlesly varying (7.925 vs 8.0)
- Ages are maybe also suitable for bins

In [16]:
# Binning numerical columns
# We can bin Age and Fare to make them more informative

df["CatAge"] = pd.qcut(df["Age"], q=4, labels=False)
df["CatFare"] = pd.qcut(df["Fare"], q=4, labels=False)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin,CatAge,CatFare
0,3,male,22.0,1,0,7.25,S,False,0,0
1,1,female,38.0,1,0,71.2833,C,True,3,3
2,3,female,26.0,0,0,7.925,S,False,1,1
3,1,female,35.0,1,0,53.1,S,True,2,3
4,3,male,35.0,0,0,8.05,S,False,2,1


In [17]:
    # Now we can drop age and fare

df.drop(["Age", "Fare"], axis=1, inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Has_Cabin,CatAge,CatFare
0,3,male,1,0,S,False,0,0
1,1,female,1,0,C,True,3,3
2,3,female,0,0,S,False,1,1
3,1,female,1,0,S,True,2,3
4,3,male,0,0,S,False,2,1


### Family members aboard

We can also create a new feature with the amount of family members aboard

In [19]:
# Create a new column Fam_Size as a combination of SibSp and Parch

df["Fam_Size"] = df["SibSp"] + df["Parch"]
df.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Has_Cabin,CatAge,CatFare,FamilySize
0,3,male,1,0,S,False,0,0,2
1,1,female,1,0,C,True,3,3,2
2,3,female,0,0,S,False,1,1,1
3,1,female,1,0,S,True,2,3,2
4,3,male,0,0,S,False,2,1,1
