# Preprocessing data in Titanic data

In [154]:

# imports libary

import pandas as pd
import numpy as np

import sklearn 

In [155]:

#import data

data = pd.read_csv('./data/train.csv')

In [156]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [157]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [158]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Preprocessing

### Missing values

In [159]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [160]:

from sklearn.impute import MissingIndicator

missing_indicator = MissingIndicator(missing_values=np.NaN)
boolMissingValues = missing_indicator.fit_transform(data)

dFboolMissingValues = pd.DataFrame(boolMissingValues, columns=['Age_NaN','Cabin_NaN','Embarked_NaN'])

dFboolMissingValues

Unnamed: 0,Age_NaN,Cabin_NaN,Embarked_NaN
0,False,True,False
1,False,False,False
2,False,True,False
3,False,False,False
4,False,True,False
...,...,...,...
886,False,True,False
887,False,False,False
888,True,True,False
889,False,False,False


In [161]:
data = pd.concat([data, dFboolMissingValues], axis=1)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_NaN,Cabin_NaN,Embarked_NaN
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,False,True,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,False,False,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,False,True,False
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,False,False,False
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,True,True,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,False,False,False


In [162]:

data.drop(['Cabin','PassengerId'], axis=1, inplace=True)

In [163]:
data['Age'].fillna(data['Age'].median(), inplace=True)

In [164]:
data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [165]:
data['Embarked'].fillna('S', inplace=True)

In [166]:
data.isna().sum()

Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
Age_NaN         0
Cabin_NaN       0
Embarked_NaN    0
dtype: int64

In [167]:
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_NaN,Cabin_NaN,Embarked_NaN
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S,False,True,False
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,False,False,False
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,False,True,False
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,False,False,False
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,False,True,False
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,False,False,False
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S,True,True,False
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,False,False,False


### Categorical features

In [168]:
from sklearn.preprocessing import OrdinalEncoder # encoder ordinal categorical feature
from sklearn.preprocessing import OneHotEncoder # encoder no ordinal categorical feature

In [169]:
data['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [170]:
# ordinal categorical features

ordEnco = OrdinalEncoder(categories=[1,2,3])
catPclass = pd.Categorical(data['Pclass'], categories=[1,2,3], ordered=True)
labels, unique = pd.factorize(catPclass, sort=True)

data['Pclass'] = labels
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_NaN,Cabin_NaN,Embarked_NaN
0,0,2,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S,False,True,False
1,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,False,False,False
2,1,2,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,False,True,False
3,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,False,False,False
4,0,2,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,False,True,False
887,1,0,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,False,False,False
888,0,2,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S,True,True,False
889,1,0,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,False,False,False


In [171]:

oneHot = OneHotEncoder(dtype=np.int, sparse=False)
featureNoOrdinalEncoder = oneHot.fit_transform(data[['Sex', 'Embarked']])

print(oneHot.categories_)
dFfeatureNoOrdinalEncoder = pd.DataFrame(featureNoOrdinalEncoder, columns=['female','male','city_C','city_Q','city_S'])
data = pd.concat([data, dFfeatureNoOrdinalEncoder], axis=1)
data

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_NaN,Cabin_NaN,Embarked_NaN,female,male,city_C,city_Q,city_S
0,0,2,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S,False,True,False,0,1,0,0,1
1,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,False,False,False,1,0,1,0,0
2,1,2,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,False,True,False,1,0,0,0,1
3,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,False,False,False,1,0,0,0,1
4,0,2,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,False,True,False,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,False,True,False,0,1,0,0,1
887,1,0,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,False,False,False,1,0,0,0,1
888,0,2,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S,True,True,False,1,0,0,0,1
889,1,0,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,False,False,False,0,1,1,0,0


In [172]:
data.drop(['Sex','Embarked','Name','Ticket'], axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Age_NaN,Cabin_NaN,Embarked_NaN,female,male,city_C,city_Q,city_S
0,0,2,22.0,1,0,7.2500,False,True,False,0,1,0,0,1
1,1,0,38.0,1,0,71.2833,False,False,False,1,0,1,0,0
2,1,2,26.0,0,0,7.9250,False,True,False,1,0,0,0,1
3,1,0,35.0,1,0,53.1000,False,False,False,1,0,0,0,1
4,0,2,35.0,0,0,8.0500,False,True,False,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,27.0,0,0,13.0000,False,True,False,0,1,0,0,1
887,1,0,19.0,0,0,30.0000,False,False,False,1,0,0,0,1
888,0,2,28.0,1,2,23.4500,True,True,False,1,0,0,0,1
889,1,0,26.0,0,0,30.0000,False,False,False,0,1,1,0,0


### Numerical feature

In [179]:
# Discretization

from sklearn.preprocessing import KBinsDiscretizer

disc = KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')
catNumerical = disc.fit_transform(data[['Age','Fare']])

print(disc.bin_edges_)
dFcatNumerical = pd.DataFrame(catNumerical, columns=['age_cat','fare_cat'])
data = pd.concat([data, dFcatNumerical], axis=1)

[array([ 0.42, 19.  , 25.  , 28.  , 31.  , 40.5 , 80.  ])
 array([  0.        ,   7.775     ,   8.6625    ,  14.4542    ,
        26.        ,  52.36946667, 512.3292    ])]


In [181]:
data.drop(['Age','Fare'], axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age_NaN,Cabin_NaN,Embarked_NaN,female,male,city_C,city_Q,city_S,age_cat,fare_cat
0,0,2,1,0,False,True,False,0,1,0,0,1,1.0,0.0
1,1,0,1,0,False,False,False,1,0,1,0,0,4.0,5.0
2,1,2,0,0,False,True,False,1,0,0,0,1,2.0,1.0
3,1,0,1,0,False,False,False,1,0,0,0,1,4.0,5.0
4,0,2,0,0,False,True,False,0,1,0,0,1,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,0,0,False,True,False,0,1,0,0,1,2.0,2.0
887,1,0,0,0,False,False,False,1,0,0,0,1,1.0,4.0
888,0,2,1,2,True,True,False,1,0,0,0,1,3.0,3.0
889,1,0,0,0,False,False,False,0,1,1,0,0,2.0,4.0


In [183]:

def catBool(x):
    if(x == True):
        x = 1
    else:
        x = 0
    return 0


data['Age_NaN'] = data['Age_NaN'].apply(catBool)
data['Cabin_NaN'] = data['Cabin_NaN'].apply(catBool)
data['Embarked_NaN'] = data['Embarked_NaN'].apply(catBool)

data

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age_NaN,Cabin_NaN,Embarked_NaN,female,male,city_C,city_Q,city_S,age_cat,fare_cat
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,1.308642,0.523008,0.381594,0.0,0.0,0.0,0.352413,0.647587,0.188552,0.08642,0.725028,2.662177,2.551066
std,0.486592,0.836071,1.102743,0.806057,0.0,0.0,0.0,0.47799,0.47799,0.391372,0.281141,0.446751,1.681798,1.715241
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,3.0
75%,1.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,4.0,4.0
max,1.0,2.0,8.0,6.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0


## Applying ML

In [184]:

features = ['Pclass', 'SibSp','Parch','Age_NaN','Cabin_NaN','Embarked_NaN', 'female','male','city_C','city_Q','city_S']
label = 'Survived'


In [186]:

from sklearn.preprocessing import LabelEncoder


labEncod = LabelEncoder()

featTransTrain = []

for i in features:
    featTransTrain.append(labEncod.fit_transform(list(data[i])))
    
z = zip(*featTransTrain)   
# mesmo efeito que: z = zip(feat[0],feat[1],feat[2],feat[3],feat[4],feat[5],feat[6], ...), ou seja,
# zipar uma lista de listas.
    

x = list(z) #features encoded
y = list(labEncod.fit_transform(list(data[label]))) #label encoded

In [187]:
# Aplicando SVC

from sklearn.svm import SVC

bAccuracySVC = 0
bModelSVC = 0

for i in range(1000):

    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2)

    modelSVC = SVC(probability=True)
    modelSVC.fit(x_train, y_train)

    ac = modelSVC.score(x_test,y_test)
    
    if(ac > bAccuracySVC):
        bAccuracySVC = ac
        bModelSVC = modelSVC

print('Accuracy SVC: ', bAccuracySVC)


Accuracy SVC:  0.8770949720670391


In [191]:

from sklearn.neighbors import KNeighborsClassifier

# Aplicando modelo ML, KNeighborsClassifier.
# 

n = 1000 # número de vezes para rodar a aplicação do modelo para se obter a melhor accuracia dentre esse numero de vezes.
bAccuracy = 0 # armazena melhor acuracia obtida.
bModel = 0 # armazena melhor modelo obtido.

for i in range(n):
    
    # separa os dados em train e test, pegando 10% dos dados para test
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.1)

    # Cria instancia do modelo KNeighborsClassifier com 9 vizinhos
    model = KNeighborsClassifier(n_neighbors=5)

    # aplica o modelo KNeighborsClassifier aos dados de treino.
    model.fit(x_train,y_train)
    
    # teste o modelo obtido na linha acima aos dados de teste para se obter a acurâcia.
    accuracy = model.score(x_test,y_test)
    
    # verifica e armazena o melhor modelo obtido dentro do número de vezes que se rodou o modelo.
    if(accuracy > bAccuracy):
        bAccuracy = accuracy
        bModel = model

        
# Print out melhor Accuracy obtida        
print('Accuracy KNeighborsClassifier: ', bAccuracy)

Accuracy KNeighborsClassifier:  0.9111111111111111
