# Data Science Brasil - Desafio Kaggle - Titanic

#### Equipe:
    * Ricardo Galiardi 
    * Wanderson Henrique dos Santos

##### Preparação dos Dados

In [None]:
# Importando os módulos

import numpy as np
import pandas as pd                 
import matplotlib.pyplot as plt    
import seaborn as sns; sns.set(style="ticks", color_codes=True)
%matplotlib inline       

In [None]:
# Show which files are available in the current working directory

import os

directory_entries = os.listdir(".")
print(directory_entries)

## Datasets

### Treino
    Columns
        * PassengerId: type should be integers
        * Survived: Survived or Not
        * Pclass: Class of Travel
        * Name: Name of Passenger
        * Sex: Gender
        * Age
        * SibSp: Number of Sibling/Spouse aboard
        * Parch: Number of Parent/Child aboard
        * Ticket
        * Fare
        * Cabin
        * Embarked: The port at which a passenger embarked. C = Cherbourg, S = Southampton, Q = Queenstown
            
### Teste
    Columns
        * PassengerId
        * Pclass
        * Name
        * Sex
        * Age
        * SibSp
        * Parch
        * Ticket
        * Fare
        * Cabin
        * Embarked
        
### Envio
    Columns
        * PassengerId: integer
        * Survived: binary

## Variable Notes
    * survival
        * 0 = No
        * 1 = Yes
    * pclass: A proxy for socio-economic status (SES)
        * 1: 1st = Upper
        * 2: 2nd = Middle
        * 3: 3rd = Lower
    * age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
    * sibsp: The dataset defines family relations in this way...
        * Sibling = brother, sister, stepbrother, stepsister
        * Spouse = husband, wife (mistresses and fiancés were ignored)
    * parch: The dataset defines family relations in this way...
        * Parent = mother, father
        * Child = daughter, son, stepdaughter, stepson
        * Some children travelled only with a nanny, therefore parch=0 for them.
    * embarked: 
        * C = Cherbourg
        * Q = Queenstown
        * S = Southampton  

In [None]:
# Load the datasets

# Columns of the test file, in file order
feature_columns = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
                   'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

# Training data: the same columns plus the 'Survived' label in second position.
# header=0 combined with names= replaces the header row of the file.
dstrain = pd.read_csv(
    "train.csv",
    sep=',',
    header=0,
    names=feature_columns[:1] + ['Survived'] + feature_columns[1:],
)

# Test data
dstest = pd.read_csv(
    "test.csv",
    sep=',',
    header=0,
    names=list(feature_columns),
)

# Submission template
dssubmission = pd.read_csv(
    "gender_submission.csv",
    sep=',',
    header=0,
    names=['PassengerId', 'Survived'],
)

In [None]:
# Display the first rows of the training data

dstrain.head(n=5)

In [None]:
# Check the first records of the training data
# NOTE(review): this shows the same five rows as the previous cell.

dstrain.iloc[:5]

In [None]:
# Count the missing values in each column

dstrain.isna().sum()

In [None]:
# Normalize the data and fill in the missing values
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning in recent pandas and does not modify the frame at all
# once Copy-on-Write is enabled.

# Replace missing ages with the mean age.
# NOTE(review): the test set is not filled here, so its missing ages are later
# handled by process_age (-0.5 -> "Missing") while the training set never
# reaches that category - confirm this asymmetry is intended.
dstrain['Age'] = dstrain['Age'].fillna(dstrain['Age'].mean())

# Replace a missing port of embarkation with the placeholder value N
dstrain['Embarked'] = dstrain['Embarked'].fillna('N')

# Replace a missing cabin with the placeholder value N00
dstrain['Cabin'] = dstrain['Cabin'].fillna('N00')


In [None]:
# Remove the columns that are considered irrelevant for the model

unused_columns = ['Name', 'Ticket', 'Cabin']
for frame in (dstrain, dstest):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# Tratar os missing values

def process_age(df,cut_points,label_names):
    """Return a copy of ``df`` with ages binned into labelled categories.

    Missing values in the "Age" column are replaced by -0.5 so that they fall
    into the first bin, then a new "Age_categories" column is built with
    ``pd.cut``.

    Parameters:
        df: DataFrame containing an "Age" column.
        cut_points: bin edges passed to ``pd.cut``.
        label_names: one label per bin (one fewer than ``cut_points``).

    Returns:
        A new DataFrame; the frame passed in is left untouched, so re-running
        the calling cell always starts from the same input.
    """
    binned = df.copy()
    binned["Age"] = binned["Age"].fillna(-0.5)
    binned["Age_categories"] = pd.cut(binned["Age"],cut_points,labels=label_names)
    return binned
 
# Bin edges and the label of each bin; (-1, 0] captures the -0.5 placeholder
# that process_age substitutes for missing ages.
cut_points = [-1,0,5,12,18,35,60,100]
label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]

# The frames loaded in this notebook are called dstrain / dstest; the names
# `train` and `test` used here before are never defined and raised NameError.
dstrain = process_age(dstrain,cut_points,label_names)
dstest = process_age(dstest,cut_points,label_names)

In [None]:
# Correlation shown as a table

# Correlation coefficient:
# +1  = strong positive correlation
# 0   = no correlation
# -1  = strong negative correlation
#
# Only the numeric columns are passed to corr(): the frame also carries
# non-numeric columns (e.g. the categorical Age_categories), and pandas >= 2.0
# raises on those instead of silently skipping them.
dstrain.select_dtypes(include='number').corr()

##### Exploração dos Dados

In [None]:
def fnPlotBar(dataset, column):
    """Draw a bar chart of 'Survived' aggregated per distinct value of `column`.

    The aggregation is whatever pivot_table applies by default (the mean).
    """
    dataset.pivot_table(values='Survived', index=column).plot.bar()
    
# One survival bar chart per categorical feature
for column_name in ('Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'):
    fnPlotBar(dstrain, column_name)

In [None]:
# Helper for a graphical look at one feature against the survival flag
def fnScatter(dataset, column):
    """Open a new figure and scatter `column` (x axis) against 'Survived' (y axis)."""
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.scatter(dataset[column], dataset['Survived'])
    axes.set_xlabel(column)
    axes.set_ylabel('Survived')

# Scatter plots for the continuous features
for continuous_column in ('Age', 'Fare'):
    fnScatter(dstrain, continuous_column)

In [None]:
def plotter(dataset, column):
    """Draw a bar chart of 'Survived' aggregated per distinct value of `column`.

    Parameters:
        dataset: frame that provides `column` and a 'Survived' column.
        column: name of the column whose values become the bars.

    The pivot table is built from the `dataset` argument; the previous body
    ignored it and always read the global `dstrain`. The result is no longer
    bound to a local called `plt`, which shadowed the matplotlib alias.
    """
    survival_by_group = dataset.pivot_table(index=column, values='Survived')
    survival_by_group.plot.bar()
    
# Survival bar chart for every feature, categorical ones first
columns_to_plot = ('Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Age', 'Fare')
for column_name in columns_to_plot:
    plotter(dstrain, column_name)

In [None]:
# Correlation between the variables, drawn as a heat map
#
# Only the numeric columns are correlated: the frame also carries non-numeric
# columns (e.g. the categorical Age_categories), and pandas >= 2.0 raises on
# those instead of silently skipping them.
fig = plt.figure(figsize=(18, 14))
corr = dstrain.select_dtypes(include='number').corr()
c = plt.pcolor(corr)
# Ticks are placed at the centre of each cell and labelled with the column names
plt.yticks(np.arange(0.5, len(corr.index), 1), corr.index)
plt.xticks(np.arange(0.5, len(corr.columns), 1), corr.columns)
fig.colorbar(c)

In [None]:
# Analysing the distributions of the variables

# NOTE(review): 'Sex' presumably still holds text labels at this point -
# confirm pairplot renders it as intended or encode it numerically first.
sns.pairplot(dstrain, vars=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'])

# 'Cabin' is dropped from dstrain in an earlier cell, so grouping on it raised
# a KeyError; the groups are now formed by port of embarkation only.
for name, group in dstrain.groupby("Embarked"):
    # Open a fresh figure so the scatter is not drawn onto the pairplot axes
    plt.figure()
    plt.title(name)
    plt.scatter(range(len(group)), group["Survived"])
    plt.show()
    break  # only the first group is plotted

In [None]:


# NOTE(review): `survival` is not assigned anywhere in the visible notebook, so
# this bare expression presumably raises NameError on a fresh kernel - confirm,
# then remove the cell or replace it with the intended expression.
survival