## Feature Engineering and Handling missing values
- Use the titanic dataset
- Fill the missing data
    - Age based on gender, salutation
- How do we use the categorical columns

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import seaborn as sns
import numpy as np
import pandas as pd

In [3]:
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## Fill missing age

In [4]:
df['age'].isnull().sum()

177

In [5]:
# Calculate median age for each gender
median_age_by_gender = df.groupby('sex')['age'].median()
median_age_by_gender

sex
female    27.0
male      29.0
Name: age, dtype: float64

In [6]:
def fill_missing_age(row):
    if pd.isnull(row['age']):
        return median_age_by_gender[row['sex']]
    else:
        return row['age']

In [7]:
df['age'] = df.apply(fill_missing_age, axis=1)

In [8]:
df['age'].isnull().sum()

0

In [9]:
df.drop(['deck', 'class', 'alive', 'who', 'embark_town'], axis=1, inplace=True)
df.shape

(891, 10)

## Categorigal columns

In [10]:
df.select_dtypes(include=['object']).columns

Index(['sex', 'embarked'], dtype='object')

1. Label Encoding

In [11]:
df['sex'].unique()

array(['male', 'female'], dtype=object)

In [12]:
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

2. One hot encoding

In [13]:
df.dropna(subset=['embarked'], inplace=True)

In [14]:
print(df['embarked'].unique())

['S' 'C' 'Q']


In [15]:
df_encoded = pd.get_dummies(df['embarked'], prefix='embarked', drop_first=True).astype(int)
df_encoded

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [16]:
df = pd.concat([df, df_encoded], axis=1)
df = df.drop('embarked', axis=1)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alone,embarked_Q,embarked_S
0,0,3,0,22.0,1,0,7.25,True,False,0,1
1,1,1,1,38.0,1,0,71.2833,False,False,0,0
2,1,3,1,26.0,0,0,7.925,False,True,0,1
3,1,1,1,35.0,1,0,53.1,False,False,0,1
4,0,3,0,35.0,0,0,8.05,True,True,0,1


### Classification

In [17]:
df['parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [18]:
df = pd.get_dummies(df['parch'], prefix='parch', drop_first=True).join(df.drop('pclass', axis=1))
df.head()

Unnamed: 0,parch_1,parch_2,parch_3,parch_4,parch_5,parch_6,survived,sex,age,sibsp,parch,fare,adult_male,alone,embarked_Q,embarked_S
0,False,False,False,False,False,False,0,0,22.0,1,0,7.25,True,False,0,1
1,False,False,False,False,False,False,1,1,38.0,1,0,71.2833,False,False,0,0
2,False,False,False,False,False,False,1,1,26.0,0,0,7.925,False,True,0,1
3,False,False,False,False,False,False,1,1,35.0,1,0,53.1,False,False,0,1
4,False,False,False,False,False,False,0,0,35.0,0,0,8.05,True,True,0,1


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(['survived'], axis=1), df.survived, test_size=0.2, random_state=10)

In [21]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [22]:
lr.score(X_test, y_test)

0.848314606741573