|Variable|Variável     |Definição	                              |Chave                                         |
|--------|-------------|------------------------------------------|----------------------------------------------|
|survival|sobrevivência|Sobrevivência                             |0 = Não, 1 = Sim                              |
|pclass  |pclass	   |Classe de ingresso                        |1 = 1º, 2 = 2º, 3 = 3º                        |
|ex      |sexo         |Sexo                                      |                                              |
|Age     |Idade em anos| -                                        |                                              |
|sibsp   |sibsp        |Nº de irmãos / cônjuges a bordo do Titanic|	                                             |
|parch   |pergaminho   |Nº de pais / filhos a bordo do Titanic    |                                              |
|ticket  |bilhete      |Número do bilhete	                      |                                              |
|fare    |tarifa       |Tarifa de passageiro                      |	                                             |
|cabin   |cabine       |Número da cabine	                      |                                              |
|embarked|embarcou     |Porto de embarcação                       |C = Cherbourg, Q = Queenstown, S = Southampton|

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

In [3]:
combine = [train, test]

In [4]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.', expand=False)

In [5]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 
                                                'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()    

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [6]:
title = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title)
    dataset['Title'] = dataset['Title'].fillna(0)

In [7]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [8]:
for dataset in combine:
    idx = dataset['Age'].loc[dataset['Age']<1].index
    age = dataset['Age'].iloc[idx]*100
    dataset['Age'].iloc[idx] = age

    m = dataset['Age'].median()
    dataset['Age'] = dataset['Age'].fillna(m)

In [9]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 20, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 20) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
    dataset['Age'] = dataset['Age'].astype(int)

In [10]:
for dataset in combine:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

In [11]:
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

In [12]:
# for dataset in combine:    
#     dataset['FamilySize'] = dataset ['SibSp'] + dataset['Parch'] + 1

#     dataset['IsAlone'] = 0 
#     dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 1

In [13]:
# for dataset in combine:
#     dataset['Tick_num'] = dataset.Ticket.str.extract('\s([0-9]+)')
#     dataset.loc[dataset['Ticket'] == 'LINE', 'Tick_num'] = -1
#     idx = dataset[dataset['Tick_num'].isna()].index
#     dataset['Tick_num'].loc[idx] = dataset['Ticket'].loc[idx]
#     dataset['Tick_num'] = dataset['Tick_num'].astype(int)

In [14]:
for dataset in combine:
    dataset.set_index('PassengerId', inplace=True)

In [15]:
train = train.drop(['Name', 'Ticket', 'Cabin', 'Fare'], axis=1)
test = test.drop(['Name', 'Ticket', 'Cabin', 'Fare'], axis=1)

In [16]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [17]:
x, y = train.drop('Survived', axis=1), train.Survived.copy()

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [19]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
dectree_clf = DecisionTreeClassifier(max_depth = 10, random_state = 0)
grad_clf = GradientBoostingClassifier()
navies_bayes = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=3)

In [20]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                         ('svc', svm_clf), ('dt', dectree_clf),
                                         ('grad', grad_clf), ('navie', navies_bayes), 
                                          ('knn', knn)], voting='hard')

In [21]:
voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',...
                                        

In [39]:
for clf in (log_clf,rnd_clf,svm_clf, dectree_clf,grad_clf, navies_bayes, knn, voting_clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8071748878923767
RandomForestClassifier 0.8161434977578476
SVC 0.8430493273542601
DecisionTreeClassifier 0.8251121076233184
GradientBoostingClassifier 0.8161434977578476
GaussianNB 0.8026905829596412
KNeighborsClassifier 0.8430493273542601
VotingClassifier 0.8340807174887892


In [40]:
ptest = voting_clf.predict(test)
prev = pd.Series(ptest, index= test.index, name='Survived')
prev.to_csv("modelo_esemble_8.csv", header=True)    

In [35]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn import svm

In [24]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=5)

In [25]:
clf = NearestCentroid()

In [36]:
clf = svm.SVC(decision_function_shape='ovo')

In [37]:
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.__class__.__name__, accuracy_score(y_test, y_pred))    

SVC 0.8430493273542601


In [38]:
ptest = clf.predict(test)
prev = pd.Series(ptest, index= test.index, name='Survived')
prev.to_csv("modelo_svc_4.csv", header=True)    