# 05 - Preparando os dados de teste

O propósito é ajustar os dados de teste nos formatos necessários para os algoritmos de aprendizado.

## Preparando o ambiente

In [1]:
import numpy as np
import pandas as pd

## Carregando os dados

In [2]:
titanic = pd.read_csv('../data/original/test.csv')
# del titanic['Unnamed: 0']
titanic.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Extraindo dados

In [3]:
titanic['Relateds'] = titanic['SibSp'] + titanic['Parch']

In [4]:
titanic.loc[(titanic.Cabin.notnull()), 'Cabin'] = titanic.loc[(titanic.Cabin.notnull()), 'Cabin'].astype(str).str[0]
titanic.rename(columns={'Cabin': 'Floor'}, inplace=True)

In [5]:
titanic.Floor.fillna('SC', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic.Floor.fillna('SC', inplace=True)


In [6]:
titanic['possui_cabine'] = titanic['Floor'] != 'SC'
titanic['acompanhado'] = titanic['Relateds'] > 0

In [7]:
titanic.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Floor,Embarked,Relateds,possui_cabine,acompanhado
158,1050,1,"Borebank, Mr. John James",male,42.0,0,0,110489,26.55,D,S,0,True,False
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,SC,S,0,False,False
228,1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,SC,S,0,False,False
261,1153,3,"Nilsson, Mr. August Ferdinand",male,21.0,0,0,350410,7.8542,SC,S,0,False,False
137,1029,2,"Schmidt, Mr. August",male,26.0,0,0,248659,13.0,SC,S,0,False,False


In [8]:
titanic.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age              86
SibSp             0
Parch             0
Ticket            0
Fare              1
Floor             0
Embarked          0
Relateds          0
possui_cabine     0
acompanhado       0
dtype: int64

### Tratando a idade
Separando através dos pronomes conforme a exploração inicial.

In [9]:
titles = []
for nome in titanic['Name']:
  titles.append(nome.split(',')[1].split('.')[0].strip())

titanic['title'] = titles
titanic.title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [10]:
titanic.groupby(['title', 'Sex']).Age.describe().sort_values(by='std', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
title,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mrs,female,62.0,38.903226,15.03407,16.0,26.0,36.5,49.5,76.0
Mr,male,183.0,32.0,11.804497,14.0,23.0,28.5,40.0,67.0
Miss,female,64.0,21.774844,10.457716,0.17,17.75,22.0,29.25,45.0
Rev,male,2.0,35.5,7.778175,30.0,32.75,35.5,38.25,41.0
Master,male,17.0,7.406471,4.67247,0.33,5.0,7.0,11.5,14.5
Col,male,2.0,50.0,4.242641,47.0,48.5,50.0,51.5,53.0
Dona,female,1.0,39.0,,39.0,39.0,39.0,39.0,39.0
Dr,male,1.0,53.0,,53.0,53.0,53.0,53.0,53.0
Ms,female,0.0,,,,,,,


In [11]:
pessoa_adulta = ['Mr', 'Mrs', 'Rev', 'Col', 'Dona', 'Dr']
mulher_solteira = ['Miss', 'Ms']
menino_crianca = ['Master']

In [12]:
faixa_etaria = []
for title in titanic['title']:
  if (title in menino_crianca):
    faixa_etaria.append('menino_crianca')
  elif (title in mulher_solteira):
    faixa_etaria.append('mulher_solteira')
  else:
    faixa_etaria.append('pessoa_adulta')

titanic['faixa'] = faixa_etaria
titanic.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Floor,Embarked,Relateds,possui_cabine,acompanhado,title,faixa
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,SC,Q,0,False,False,Mr,pessoa_adulta
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,SC,S,1,False,True,Mrs,pessoa_adulta
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,SC,Q,0,False,False,Mr,pessoa_adulta
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,SC,S,0,False,False,Mr,pessoa_adulta
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,SC,S,2,False,True,Mrs,pessoa_adulta


In [13]:
por_faixa = titanic.groupby(['faixa']).Age.describe()
por_faixa

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
faixa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
menino_crianca,17.0,7.406471,4.67247,0.33,5.0,7.0,11.5,14.5
mulher_solteira,64.0,21.774844,10.457716,0.17,17.75,22.0,29.25,45.0
pessoa_adulta,251.0,33.988048,13.015677,14.0,24.0,30.0,43.0,76.0


In [14]:
sem_idade = titanic.query("Age.isnull().values")
sem_idade.groupby(['faixa']).describe()

Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,Pclass,Pclass,...,Fare,Fare,Relateds,Relateds,Relateds,Relateds,Relateds,Relateds,Relateds,Relateds
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
faixa,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
menino_crianca,4.0,1228.0,70.94129,1136.0,1207.25,1233.5,1254.25,1309.0,4.0,3.0,...,22.631225,23.45,4.0,1.75,1.258306,0.0,1.5,2.0,2.25,3.0
mulher_solteira,15.0,1111.866667,108.349874,928.0,1035.5,1108.0,1169.5,1302.0,15.0,3.0,...,11.775,69.55,15.0,0.866667,2.587516,0.0,0.0,0.0,0.0,10.0
pessoa_adulta,67.0,1092.507463,113.283684,902.0,996.5,1091.0,1180.5,1308.0,67.0,2.656716,...,15.8396,69.55,67.0,0.58209,1.82698,0.0,0.0,0.0,0.0,10.0


In [15]:
def preenche_idade_por_faixa(nome_faixa):
  mc = por_faixa.query(f"faixa == '{nome_faixa}'")
  i = int((mc['mean'] - (mc['std'])).values[0] + 1)
  f = int((mc['mean'] + (mc['std'])).values[0] - 1)
  n = len(titanic.loc[(titanic.Age.isnull()) & (titanic.faixa == nome_faixa)])
  values = np.random.randint(i, f, n)
  titanic.loc[(titanic.Age.isnull()) & (titanic.faixa == nome_faixa), 'Age'] = values

In [16]:
preenche_idade_por_faixa('menino_crianca')
preenche_idade_por_faixa('mulher_solteira')
preenche_idade_por_faixa('pessoa_adulta')

In [17]:
titanic.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Floor            0
Embarked         0
Relateds         0
possui_cabine    0
acompanhado      0
title            0
faixa            0
dtype: int64

In [18]:
titanic['Age'] = titanic['Age'].astype(int)

### Faixas etárias pelo resultado do agrupamento

Relembrando os grupos etários:

**Faixas de idade identificadas:**

* _Criança/Adolescente_: 0 - 15 [crianca_adolescente]
* _Jovem/Adulto_: 16 - 41 [jovem_adulto]
* _Adulto/Idoso_: 42 - 80 [adulto_idoso]

In [19]:
faixa_etaria = []

for i in titanic['Age']:
    if(i < 16):
        faixa_etaria.append('crianca_adolescente')
    elif(i < 42):
        faixa_etaria.append('jovem_adulto')
    else:
        faixa_etaria.append('adulto_idoso')

titanic['faixa_etaria'] = faixa_etaria
titanic.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Floor,Embarked,Relateds,possui_cabine,acompanhado,title,faixa,faixa_etaria
403,1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,SC,S,0,False,False,Mr,pessoa_adulta,jovem_adulto
276,1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,SC,S,0,False,False,Mr,pessoa_adulta,jovem_adulto
257,1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,SC,S,0,False,False,Mr,pessoa_adulta,jovem_adulto
80,972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,SC,C,2,False,True,Master,menino_crianca,crianca_adolescente
134,1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,SC,S,0,False,False,Mr,pessoa_adulta,adulto_idoso


## Removendo colunas desnecessárias

In [20]:
titanic.drop(columns=['Name', 'Fare', 'title', 'faixa', 'Ticket', 'Embarked'], inplace=True)

## Salvando o resultado

In [21]:
titanic.to_csv('../data/processed/test_processed.csv')

In [22]:
titanic.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Floor,Relateds,possui_cabine,acompanhado,faixa_etaria
0,892,3,male,34,0,0,SC,0,False,False,jovem_adulto
1,893,3,female,47,1,0,SC,1,False,True,adulto_idoso
2,894,2,male,62,0,0,SC,0,False,False,adulto_idoso
3,895,3,male,27,0,0,SC,0,False,False,jovem_adulto
4,896,3,female,22,1,1,SC,2,False,True,jovem_adulto
