# Titanic - Data Cleaning and Exploration

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Apply seaborn's default theme to the matplotlib plots drawn in this notebook
sns.set()

## Import Data

In [None]:
# Read the training and test sets from CSV files in the working directory
raw_train = pd.read_csv(filepath_or_buffer='train.csv')
raw_test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Work on independent copies: plain assignment would only bind a second name
# to the same object, so every cleaning step below (filled values, added
# columns) would also modify the raw frames.
train = raw_train.copy()
test = raw_test.copy()

# Handle Missing Values

In [None]:
# Inspect the rows (two in this dataset) whose Embarked value is missing
missing_embarked = train['Embarked'].isna()
train[missing_embarked]

In [None]:
# Of all the fields, only Pclass is plausibly related to the port a passenger
# embarked from, if the three ports serve different economic demographics.
# Tabulate the number of passengers per (Pclass, Embarked) pair to see whether
# Pclass 1 passengers come predominantly from one port. A crosstab gives one
# count per pair instead of repeating the same count for every other column.

pd.crosstab(train['Pclass'], train['Embarked'])

In [None]:
#Given that the two passengers are Pclass 1, they are most likely to have embarked from S.

In [None]:
# Formally: the most frequent Embarked value within each Pclass.
# Only Embarked is aggregated; taking the mode of every column would also
# process columns that are irrelevant here and can have many tied values.
# mode() returns the tied values in sorted order, so .iloc[0] picks the first.
train.groupby('Pclass')['Embarked'].agg(lambda ports: ports.mode().iloc[0])

In [None]:
# However, the mode for Pclass 1 is the same as the mode for the whole data.
# Assign that modal value to the missing Embarked entries (mode() returns a
# pandas Series, hence the [0]).
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is a chained in-place update that
# pandas warns about and that leaves the frame unchanged under copy-on-write.
mode = train['Embarked'].mode()
train['Embarked'] = train['Embarked'].fillna(mode[0])

In [None]:
# Display the passengers whose Embarked value was just filled in
filled_ids = [62, 830]
train.loc[train['PassengerId'].isin(filled_ids)]

In [None]:
# New feature: the first character of each Cabin string
# (rows with a missing Cabin stay missing)
train['Cabin_Class'] = train['Cabin'].str.get(0)

In [None]:
# Replace the missing values in Cabin_Class with "U" for Unknown.
# The result is assigned back instead of using fillna(inplace=True) on the
# selected column, which pandas warns about and which does not update the
# frame under copy-on-write.
train['Cabin_Class'] = train['Cabin_Class'].fillna('U')

In [None]:
# Display the frame after filling Embarked and adding Cabin_Class
train

# One Hot Encoding

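Using pandas (`pd.get_dummies`) for One Hot Encoding is not good practice when building a model: it does not remember the categories seen in the training data, so the training and test sets can end up with different columns.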

## Using Sk-Learn - Encoding More than One Feature

Using sklearn for One Hot Encoding has the disadvantage that the result is a plain array without column labels, which makes it difficult to interpret. (The labels can be recovered from the fitted encoder with `get_feature_names_out()`.)

### Creating and Concatenating in a Single Step

In [None]:
# One-hot encode Cabin_Class and Embarked; all other columns are passed through.
# The columns are selected by name (same order as the former positions
# [-1, -2]) so the cell fails loudly if `train` does not contain them.
# sparse_threshold=0 makes the transformer always return a dense array, so
# np.array() below never receives a SciPy sparse matrix.
# ColumnTransformer and OneHotEncoder come from the notebook's import cell.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Cabin_Class', 'Embarked'])],
    remainder='passthrough',
    sparse_threshold=0,
)
train = np.array(ct.fit_transform(train))

In [None]:
# Wrap the encoded result in a DataFrame again and display it
train=pd.DataFrame(train)
train

In [None]:
#By default OneHotEncoder from sklearn keeps one column per category; pass drop='first' to drop one column per feature and avoid multicollinearity
#ColumnTransformer from sklearn replaces the encoded columns, so the original Cabin_Class and Embarked columns no longer appear in the output

### Creating and Concatenating in Separate Steps

In [None]:
# Example: one-hot encoding both Cabin_Class and Embarked with a standalone
# encoder, which gives the same encoded columns as the ColumnTransformer above.
# Dense output is requested with `sparse_output` (scikit-learn >= 1.2); older
# releases only accept the same option under the name `sparse`, which newer
# releases have removed, hence the fallback.
# OneHotEncoder comes from the notebook's import cell.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(train[['Cabin_Class','Embarked']])

In [None]:
# Categories the encoder learned for each fitted column
ohe.categories_

In [None]:
# Apply the fitted encoder to the same two columns.
# (fit and transform could also be combined into one call with fit_transform.)
categorical_part = train[['Cabin_Class','Embarked']]
temp = ohe.transform(categorical_part)

In [None]:
# Display the encoded array returned by ohe.transform
temp

In [None]:
# Append the encoded columns to the right of the original dataset.
# The combined frame gets default integer column labels.
combined = np.hstack((np.asarray(train), temp))
train = pd.DataFrame(combined)
train

In [None]:
# By default OneHotEncoder keeps one column per category (pass drop='first'
# to drop one per feature and avoid multicollinearity).
# The source columns we just encoded, Embarked and Cabin_Class, have to be
# dropped manually. They are the last two original columns, i.e. the two
# positions just before the encoded block that was appended from `temp`.
# Deriving the positions from the frame widths replaces the hard-coded
# [10, 11], which (assuming the standard 12-column Titanic train.csv) pointed
# at Cabin and Embarked instead.
# The result is only displayed; `train` itself is not modified.

n_original = train.shape[1] - temp.shape[1]
train.drop(columns=[n_original - 2, n_original - 1])

## Using Sk-Learn - Encoding Only One Feature

### Creating and Concatenating in a Single Step

In [None]:
# One-hot encode Cabin_Class only; all other columns are passed through.
# The column is selected by name rather than by the position [-1] so the cell
# fails loudly if `train` does not contain it.
# sparse_threshold=0 makes the transformer always return a dense array, so
# np.array() below never receives a SciPy sparse matrix.
# ColumnTransformer and OneHotEncoder come from the notebook's import cell.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Cabin_Class'])],
    remainder='passthrough',
    sparse_threshold=0,
)
train = np.array(ct.fit_transform(train))

In [None]:
# Wrap the encoded result in a DataFrame again and display it
train=pd.DataFrame(train)
train

In [None]:
#By default OneHotEncoder from sklearn keeps one column per category; pass drop='first' to drop one column per feature and avoid multicollinearity
#ColumnTransformer from sklearn replaces the encoded column, so the original Cabin_Class column no longer appears in the output

### Creating and Concatenating in Separate Steps

In [None]:
# One-hot encoder for Cabin_Class.
# Dense output is requested with `sparse_output` (scikit-learn >= 1.2); older
# releases only accept the same option under the name `sparse`, which newer
# releases have removed, hence the fallback.
# OneHotEncoder comes from the notebook's import cell.

try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# scikit-learn does not accept a 1-D input, so the Cabin_Class Series is first
# turned into a NumPy array with .values (a Series has no reshape method) and
# then reshaped into a single column.

cabin_class_column = train['Cabin_Class'].values.reshape(-1, 1)
ohe.fit(cabin_class_column)

In [None]:
# Display the frame the encoder was fitted on
train

In [None]:
# Categories the encoder learned for Cabin_Class
ohe.categories_

In [None]:
# Encode Cabin_Class with the fitted encoder
# (fit and transform could also be done in one call with fit_transform)
cabin_class_column = train['Cabin_Class'].values.reshape(-1, 1)
temp = ohe.transform(cabin_class_column)

In [None]:
# Display the encoded Cabin_Class array
temp

In [None]:
#Further Steps same as with more than one variable

## Using Feature Engine

### Refer to the Udemy Feature Engineering course content on One Hot Encoding

In [2]:
# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder

ModuleNotFoundError: No module named 'feature_engine'

In [None]:
# Configure feature-engine's one-hot encoder for two categorical columns.
# The names use the capitalisation of this dataset's columns ('Embarked' is
# indexed that way throughout the notebook; 'Sex' is assumed to follow the
# same convention - confirm against train.csv).
ohe_enc = fe_OneHotEncoder(
    top_categories=None,
    variables=['Sex', 'Embarked'],  # we can select which variables to encode
    drop_last=True)  # to return k-1, false to return k


# Fit on the loaded training frame: no X_train is defined in the visible
# notebook, while raw_train always keeps its column labels. Missing values
# are replaced by the string 'Missing' before fitting.
ohe_enc.fit(raw_train.fillna('Missing'))