## Feature Engineering and Handling missing values
- Use the titanic dataset
- Fill the missing data
    - Age based on gender, salutation
- How do we use the categorical columns?

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
# Load seaborn's bundled Titanic passenger dataset as a DataFrame and show it
df = sns.load_dataset(name='titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## Fill missing age

In [3]:
# Number of passengers whose age is missing
df['age'].isna().sum()

177

In [4]:
# Median age per sex; used later as the fill value for missing ages.
# Missing ages are ignored when the medians are computed.
age_by_sex = df.groupby('sex')['age']
median_age_by_gender = age_by_sex.median()
median_age_by_gender

sex
female    27.0
male      29.0
Name: age, dtype: float64

In [5]:
def fill_missing_age(row):
    """Return the age to use for one passenger row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['age']`` and ``row['sex']``.

    Returns
    -------
    The row's own age when it is present; otherwise the median age looked up
    in the notebook-level ``median_age_by_gender`` under the row's sex.
    """
    age = row['age']
    # Guard clause: a known age is passed through unchanged.
    if not pd.isnull(age):
        return age
    # Age is missing: fall back to the median for this passenger's sex.
    return median_age_by_gender[row['sex']]

In [6]:
# Fill each missing age with the median age of the passenger's sex.
# Vectorized form of a row-by-row fill: map every row's sex to its group
# median, then use those values only where age is null (known ages are kept).
df['age'] = df['age'].fillna(df['sex'].map(median_age_by_gender))

In [7]:
# Sanity check: count of ages that are still missing after the fill
df['age'].isna().sum()

0

## Categorical columns

In [8]:
# Names of the columns stored with the generic object dtype
df.select_dtypes(include='object').columns

Index(['sex', 'embarked', 'who', 'embark_town', 'alive'], dtype='object')

1. Label Encoding

In [9]:
# Distinct values of 'sex', checked before choosing the label mapping
df.loc[:, 'sex'].unique()

array(['male', 'female'], dtype=object)

In [10]:
# Label-encode 'sex' as an integer code; values absent from the mapping become NaN
sex_codes = {'male': 0, 'female': 1}
df['sex'] = df['sex'].map(sex_codes)

In [11]:
# Distinct values of 'alive', checked before choosing the label mapping
df.loc[:, 'alive'].unique()

array(['no', 'yes'], dtype=object)

In [12]:
# Label-encode 'alive' as an integer code; values absent from the mapping become NaN
alive_codes = {'yes': 1, 'no': 0}
df['alive'] = df['alive'].map(alive_codes)

2. One-hot encoding

In [13]:
# Drop rows with no embarkation info in either column.
# Reassign rather than mutate with inplace=True: the step stays chainable and
# any other reference to the previous frame is left untouched.
df = df.dropna(subset=['embarked', 'embark_town'])

In [14]:
# Print the distinct categories of each column that is about to be encoded
for column in ('embarked', 'embark_town', 'who'):
    print(df[column].unique())

['S' 'C' 'Q']
['Southampton' 'Cherbourg' 'Queenstown']
['man' 'woman' 'child']


In [15]:
# One-hot encode the remaining text columns as 0/1 integers.
# drop_first=True omits the first category of each column, so k categories
# produce k-1 indicator columns.
# NOTE(review): in the rows displayed above, 'embarked' (S/C/Q) and
# 'embark_town' (Southampton/Cherbourg/Queenstown) look like the same
# information, so their indicator columns are probably duplicates - confirm
# and consider encoding only one of them.
onehot_cols = ['embarked', 'embark_town', 'who']
df_encoded = (
    pd.get_dummies(df[onehot_cols], drop_first=True)
    .astype(int)
)
df_encoded

Unnamed: 0,embarked_Q,embarked_S,embark_town_Queenstown,embark_town_Southampton,who_man,who_woman
0,0,1,0,1,1,0
1,0,0,0,0,0,1
2,0,1,0,1,0,1
3,0,1,0,1,0,1
4,0,1,0,1,1,0
...,...,...,...,...,...,...
886,0,1,0,1,1,0
887,0,1,0,1,0,1
888,0,1,0,1,0,1
889,0,0,0,0,1,0


In [16]:
# Attach the indicator columns side by side (aligned on the row index), then
# remove the original text columns they replace.
df = (
    pd.concat([df, df_encoded], axis='columns')
    .drop(columns=['embarked', 'embark_town', 'who'])
)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,adult_male,deck,alive,alone,embarked_Q,embarked_S,embark_town_Queenstown,embark_town_Southampton,who_man,who_woman
0,0,3,0,22.0,1,0,7.25,Third,True,,0,False,0,1,0,1,1,0
1,1,1,1,38.0,1,0,71.2833,First,False,C,1,False,0,0,0,0,0,1
2,1,3,1,26.0,0,0,7.925,Third,False,,1,True,0,1,0,1,0,1
3,1,1,1,35.0,1,0,53.1,First,False,C,1,False,0,1,0,1,0,1
4,0,3,0,35.0,0,0,8.05,Third,True,,0,True,0,1,0,1,1,0
