In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [78]:
# Load the training CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv('./database/train.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Handling NaN Values

There are two types of features with NaN values:
* Numerical Features
* Categorical Features

In [79]:
# Collect every feature (column) that has at least one missing value.
# The comparison is `> 0`: a `> 1` test would silently skip a column holding exactly one NaN.
nan_features = [f for f in dataset.columns if dataset[f].isnull().sum() > 0]
print('Total number of features having NaN Vals :', len(nan_features), '\n')

# Missing rows per affected feature, as a percentage of all rows in the dataset.
nan_percentage = [dataset[f].isnull().sum()/len(dataset) * 100 for f in nan_features]

# nan_features and nan_percentage are parallel lists, so walk them together.
for f, pct in zip(nan_features, nan_percentage):
    print(f'NaN Percentage in Feature > {f} :', pct, '%')

Total number of features having NaN Vals : 3 

NaN Percentage in Feature > Age : 19.865319865319865 %
NaN Percentage in Feature > Cabin : 77.10437710437711 %
NaN Percentage in Feature > Embarked : 0.22446689113355783 %


### 1. Numerical NaN

In [80]:
# Of the NaN-bearing features, keep those whose dtype is not object ('O').
num_nan = list(filter(lambda name: dataset[name].dtype != 'O', nan_features))

print('Numerical NaN Features :', num_nan)

Numerical NaN Features : ['Age']


As we saw in EDA, Age contains a lot of outliers.

Additionally, Age and Pclass are related, as seen in the box plot in EDA. Thus, we'll replace the NaN values in Age with a value chosen according to the passenger's Pclass.

In [81]:
def replace_nan(cols):
    """Return the age for one passenger row, imputing a class-based value when it is missing.

    Parameters
    ----------
    cols : sequence
        Holds the age first and the passenger class second, e.g. one row of
        ``dataset[['Age', 'Pclass']]``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read the two values by position via iteration instead of ``cols[0]`` /
    # ``cols[1]``: on a label-indexed Series, integer keys rely on a deprecated
    # positional fallback of ``Series.__getitem__``.
    values = list(cols)
    age, pclass = values[0], values[1]

    # A known age is passed through untouched.
    if not pd.isnull(age):
        return age

    # Missing age: fall back to the fixed value chosen for the passenger class.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
    
    
# Row-wise imputation: every (Age, Pclass) pair is handed to replace_nan.
dataset['Age'] = dataset.loc[:, ['Age', 'Pclass']].apply(replace_nan, axis=1)

# Sanity check: count the missing ages that remain after imputation.
print('NaN values in Age feature :', dataset['Age'].isna().sum())

NaN values in Age feature : 0


### 2. Categorical NaN

In [82]:
cat_nan = ['Cabin'] # The only categorical NaN feature left to fill.
# NOTE: Embarked also contains NaN values, but very few (less than 1% of the samples), so we'll drop those samples instead.

We'll replace the NaN Values with 'Misc'

In [83]:
def replace_cat(data, f, label):
    """Overwrite the missing entries of column ``f`` in ``data`` with ``label``.

    The column is reassigned on ``data`` itself; nothing is returned.
    """
    column = data[f]
    is_missing = column.isnull()
    data[f] = np.where(is_missing, label, column)

# Fill the missing values of each listed categorical feature with the 'Misc' label.
for f in cat_nan:
    replace_cat(dataset, f, 'Misc')

# Dropping the samples that have NaN Embarked.
# The drop is restricted to 'Embarked' so that a NaN in any other column
# can never remove rows here as an unintended side effect.
dataset.dropna(subset=['Embarked'], inplace=True)

In [84]:
# Total count of missing cells across every column of the dataset.
print('NaN values in our dataset :', dataset.isna().sum().sum())

NaN values in our dataset : 0


# Rare Categorical Features

In [85]:
# Every object-dtype column except 'Name' is treated as a categorical feature.
cat_features = [f for f in dataset.columns if dataset[f].dtype == 'O' and f != 'Name']

for f in cat_features:
    # Percentage of rows held by each category of this feature.
    # Only column f is read and written per iteration, so no copy of the
    # whole DataFrame is needed.
    share = dataset.groupby(f)['Survived'].count()*100 / len(dataset)
    frequent = share[share > 1].index

    # Categories that do not exceed 1% of the rows are merged into one 'others' bucket.
    dataset[f] = np.where(dataset[f].isin(frequent), dataset[f], 'others')

# Converting Classes into Numerical Classes

In [86]:
# f = 'Embarked'

# data = dataset.copy()

# for f in cat_features:
#     total_classes = data.groupby(f)['Survived'].count().sort_values().index
#     new_class_map = {old:new for new, old in enumerate(total_classes)}
#     dataset[f] = data[f].map(new_class_map)

In [87]:
# Preview the first rows of the dataset in its current state.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,others,7.25,Misc,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,others,71.2833,others,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,others,7.925,Misc,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,others,53.1,others,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,others,8.05,Misc,S


# Playing with the Features

We see that Sex and Embarked have only a few classes; thus, we'll one-hot encode them.

In [88]:
# One-hot encode the two categorical columns.
# drop_first=True removes the first level of each feature, since the remaining
# indicator column(s) already imply it (for Sex it behaves like a binary code).
# dtype is pinned so the indicators are 0/1 integers on every pandas version;
# pandas >= 2.0 would otherwise produce boolean columns.
sex = pd.get_dummies(dataset['Sex'], drop_first=True, dtype='uint8')
embarked = pd.get_dummies(dataset['Embarked'], drop_first=True, dtype='uint8')

# Now, we'll add these features to our dataset.
dataset = pd.concat([dataset, sex, embarked], axis=1)

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,others,7.25,Misc,S,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,others,71.2833,others,C,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,others,7.925,Misc,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,others,53.1,others,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,others,8.05,Misc,S,1,0,1


Now that we've got the one-hot features, we'll drop the parent features from the dataset.

Additionally, we'll drop the useless features that cannot have any relationship with our target feature.

In [89]:
# Columns removed before modelling, dropped from the dataset in place.
features_to_be_dropped = [
    'PassengerId', 'Name', 'Ticket', 'Cabin',
    'Sex', 'Embarked',  # parents of the one-hot columns added earlier
]

dataset.drop(features_to_be_dropped, axis=1, inplace=True)

dataset.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


# Feature Scaling

In [12]:
# features_to_be_scaled = [f for f in dataset.columns if f not in ['Name', 'PassengerId', 'Survived']]

# scale = MinMaxScaler()
# scale.fit(dataset[features_to_be_scaled])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [13]:
# dataset = pd.concat([dataset[['PassengerId', 'Name', 'Survived']],
#                      pd.DataFrame(scale.transform(dataset[features_to_be_scaled]), columns=dataset[features_to_be_scaled].columns)], axis=1)

# dataset.head()

Unnamed: 0,PassengerId,Name,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,"Braund, Mr. Owen Harris",0,1.0,1.0,0.271174,0.125,0.0,0.0,0.014151,1.0,1.0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.0,0.0,0.472229,0.125,0.0,0.0,0.139136,0.0,0.666667
2,3,"Heikkinen, Miss. Laina",1,1.0,0.0,0.321438,0.0,0.0,0.0,0.015469,1.0,1.0
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.0,0.0,0.434531,0.125,0.0,0.0,0.103644,0.0,1.0
4,5,"Allen, Mr. William Henry",0,1.0,1.0,0.434531,0.0,0.0,0.0,0.015713,1.0,1.0


# Exporting the Dataset

In [90]:
# Write the transformed dataset to disk without the index column.
# The path is a plain string: it has no placeholders, so no f-string prefix is needed.
dataset.to_csv('./database/train_transformed.csv', index=False)