# Desafio Titanic

## Análise dos dados

In [1]:
# Importando as bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importando os DataFrames
train = pd.read_csv(r'C:\Users\LENOVO\Downloads\titanic\train.csv')
test = pd.read_csv(r'C:\Users\LENOVO\Downloads\titanic\test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [176]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [177]:
# Verificando informações das colunas
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [178]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [179]:
# Verificando o número de variáveis nulas para o treino
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [180]:
# Verificando o número de variáveis nulas para o treino
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [181]:
print(train.Embarked.unique())
print(train.Sex.unique())
print(train.Pclass.unique())
print(train.SibSp.unique())
print(train.Parch.unique())

['S' 'C' 'Q' nan]
['male' 'female']
[3 1 2]
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6]


In [182]:
# Verificando o número de pessoas que embarcaram em cada porto
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## Pré processamento dos dados
### Cuidando dos valores nulos e classificando a variávels 'Embarked'

In [183]:
# Inserindo 'S' nos dois valores nulos da coluna 'Embarked'
train['Embarked'] = train['Embarked'].fillna('S')

In [184]:
# Transformando string em números para 'Embarked'
embarked_mapping = {'S':0, 'C':1, 'Q':2}
train['Embarked'] = [embarked_mapping[item] for item in train['Embarked']]
test['Embarked'] = [embarked_mapping[item] for item in test['Embarked']]

# Transformando string em números para 'Sex'
sex_mapping = {'male':0, 'female':1}
train['Sex'] = [sex_mapping[item] for item in train['Sex']]
test['Sex'] = [sex_mapping[item] for item in test['Sex']]

In [185]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


### Categorizando os nomes das pessoas por meio dos Títulos

In [186]:
# Criando uma coluna com os títulos dos passageiros
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [187]:
train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Jonkheer      1
Mme           1
Countess      1
Lady          1
Ms            1
Sir           1
Don           1
Capt          1
Name: Title, dtype: int64

In [188]:
test['Title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Rev         2
Col         2
Dona        1
Ms          1
Dr          1
Name: Title, dtype: int64

In [189]:
# Classificando os títulos
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, 
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }
train['Title'] = [title_mapping[item] for item in train['Title']]
test['Title'] = [title_mapping[item] for item in test['Title']]

In [190]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0


### Cuidando dos valores nulos e classificando a variável 'Age'

In [151]:
# Inserindo valores médios para a idade agrupando por sexo e o porto embarcado
train['Age'] = train.groupby(["Sex", "Title"])["Age"].transform(lambda grp: grp.fillna(np.mean(grp))
)
test['Age'] = test.groupby(["Sex", "Title"])["Age"].transform(lambda grp: grp.fillna(np.mean(grp))
)

In [152]:
# Verificando se todos os missing values foram preenchidos no train dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null int64
Title          891 non-null int64
dtypes: float64(2), int64(8), object(3)
memory usage: 90.6+ KB


In [153]:
# Verificando se todos os missing values foram preenchidos no test dataset
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null int64
Title          418 non-null int64
dtypes: float64(2), int64(7), object(3)
memory usage: 39.3+ KB


In [154]:
# Categorizando as idades em números 
# Child Age < 13: 0
# Teen 13 <= Age < 18: 1
# Young Adult 18 <= Age < 25: 2
# Adult 25 <= Age < 35: 3
# Mid Age 35 <= Age < 58: 4
# Senior Age >= 58: 5
bins = [0,13,18,25,35,58,110]
labels = [0,1,2,3,4,5]
train['AgeGroup'] = pd.cut(train['Age'], bins=bins, labels=labels, right=False)
test['AgeGroup'] = pd.cut(train['Age'], bins=bins, labels=labels, right=False)
print(train.AgeGroup.unique())
print(test.AgeGroup.unique())

[2, 4, 3, 0, 1, 5]
Categories (6, int64): [0 < 1 < 2 < 3 < 4 < 5]
[2, 4, 3, 0, 1, 5]
Categories (6, int64): [0 < 1 < 2 < 3 < 4 < 5]


### Cuidando dos valores nulos e classificando a variável 'Fare'

In [155]:
# Inserindo um valor de média pela classe no dataset de teste da coluna Fare com um valor faltando
test['Fare'] = test.groupby(["Pclass","Title"])["Fare"].transform(lambda grp: grp.fillna(np.mean(grp))
)
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Title            0
AgeGroup         0
dtype: int64

In [156]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2,4
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2,4
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0,4


In [157]:
# Categorizando Fare em números 
# Fare < 20: 0
# 20 <= Fare < 40: 1
# 40 <= Fare < 100: 2
# Fare >= 100: 3
bins = [0,20,40,100, train.Fare.max() + 1]
labels = [0,1,2,3]
train['FareGroup'] = pd.cut(train['Fare'], bins=bins, labels=labels, right=False)
test['FareGroup'] = pd.cut(train['Fare'], bins=bins, labels=labels, right=False)
print(train.FareGroup.unique())
print(test.FareGroup.unique())

[0, 2, 1, 3]
Categories (4, int64): [0 < 1 < 2 < 3]
[0, 2, 1, 3]
Categories (4, int64): [0 < 1 < 2 < 3]


### Separando as variáveis que serão utilizadas

In [158]:
# Separando as variáveis em X_train, X_test, y_train e y_test
feature_cols = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Embarked',
    'AgeGroup',
    'FareGroup',
    'Title'
]

X_train = train[feature_cols]
X_test = test[feature_cols]
y_train = train['Survived']

In [159]:
X_train

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,AgeGroup,FareGroup,Title
0,3,0,1,0,0,2,0,0
1,1,1,1,0,1,4,2,2
2,3,1,0,0,0,3,0,1
3,1,1,1,0,0,4,2,2
4,3,0,0,0,0,4,0,0
...,...,...,...,...,...,...,...,...
886,2,0,0,0,0,3,0,3
887,1,1,0,0,0,2,1,1
888,3,1,1,2,0,2,1,1
889,1,0,0,0,1,3,1,0


In [160]:
X_test

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,AgeGroup,FareGroup,Title
0,3,0,0,0,2,2,0,0
1,3,1,1,0,0,4,2,2
2,2,0,0,0,2,3,0,0
3,3,0,0,0,0,4,2,0
4,3,1,1,1,0,4,0,2
...,...,...,...,...,...,...,...,...
413,3,0,0,0,0,3,0,0
414,1,1,0,0,1,4,0,3
415,3,0,0,0,0,4,0,0
416,3,0,0,0,0,3,1,0


In [161]:
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

### Standard Scale

In [162]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [163]:
X_train

array([[ 0.82737724, -0.73769513,  0.43279337, ..., -0.62678715,
        -0.73608055, -0.73374551],
       [-1.56610693,  1.35557354,  0.43279337, ...,  1.02793093,
         1.43201124,  1.33186981],
       [ 0.82737724,  1.35557354, -0.4745452 , ...,  0.20057189,
        -0.73608055,  0.29906215],
       ...,
       [ 0.82737724,  1.35557354,  0.43279337, ..., -0.62678715,
         0.34796535,  0.29906215],
       [-1.56610693, -0.73769513, -0.4745452 , ...,  0.20057189,
         0.34796535, -0.73374551],
       [ 0.82737724, -0.73769513, -0.4745452 , ...,  0.20057189,
        -0.73608055, -0.73374551]])

In [164]:
X_test

array([[ 0.82737724, -0.73769513, -0.4745452 , ..., -0.62678715,
        -0.73608055, -0.73374551],
       [ 0.82737724,  1.35557354,  0.43279337, ...,  1.02793093,
         1.43201124,  1.33186981],
       [-0.36936484, -0.73769513, -0.4745452 , ...,  0.20057189,
        -0.73608055, -0.73374551],
       ...,
       [ 0.82737724, -0.73769513, -0.4745452 , ...,  1.02793093,
        -0.73608055, -0.73374551],
       [ 0.82737724, -0.73769513, -0.4745452 , ...,  0.20057189,
         0.34796535, -0.73374551],
       [ 0.82737724, -0.73769513,  0.43279337, ..., -0.62678715,
        -0.73608055,  2.36467747]])

## Treino e classificação dos diversos modelos de Machine Learning

In [165]:
# Importando as bibliotecas para cada modelo 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# criando uma lista com todos os modelos
classifiers = [
    KNeighborsClassifier(3),
    GaussianNB(),
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0),
    GradientBoostingClassifier()]



In [166]:
# Importando as bibliotecas para Validação cruzada
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [167]:
# Definindo a validação cruzada
kf = StratifiedKFold(n_splits=10)

In [168]:
# Rodando todos os modelos classificadores na validação cruzada
for clf in classifiers:
    score = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print(np.mean(score).round(3))

KNeighborsClassifier
0.806
GaussianNB
0.791
LogisticRegression
0.817
SVC
0.826
DecisionTreeClassifier
0.788
RandomForestClassifier
0.806
GradientBoostingClassifier
0.82


In [169]:
# Treinando o melhor modelo 
model = SVC()
model.fit(X_train, y_train)

SVC()

In [170]:
# Predizindo os resultados 
y_pred = model.predict(X_test)

In [171]:
# Salvando um arquivo com as predições
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})
submission.to_csv('submission.csv', index = False)

In [172]:
submission = pd.read_csv('submission.csv')
submission.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [76]:
test['Survived'] = y_pred
test.head(20)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0,0
5,897,3,"Svensson, Mr. Johan Cervin",0,14.0,0,0,7538,9.225,,0,0
6,898,3,"Connolly, Miss. Kate",1,30.0,0,0,330972,7.6292,,2,0
7,899,2,"Caldwell, Mr. Albert Francis",0,26.0,1,1,248738,29.0,,0,0
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",1,18.0,0,0,2657,7.2292,,1,1
9,901,3,"Davies, Mr. John Samuel",0,21.0,2,0,A/4 48871,24.15,,0,0
