# Módulo 5 - Limpeza e preparação de dados

## 5.1 Identificando e tratando dados ausentes

### Identificando dados ausentes

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a 9x4 frame of random values in [0, 100), with rows labelled A..I
# and columns named coluna1..coluna4.
row_labels = [chr(code) for code in range(ord("A"), ord("J"))]
column_labels = [f"coluna{n}" for n in range(1, 5)]
df = pd.DataFrame(np.random.rand(9, 4) * 100, index=row_labels, columns=column_labels)
# Independent copy, used later by the renaming examples.
df_2 = df.copy()
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.635697,52.759081,96.847018,59.796397
B,90.163356,86.951265,4.648834,73.210462
C,75.907645,20.56534,34.591021,38.354876
D,11.774104,30.184424,2.949996,68.205526
E,67.493517,26.489613,78.756341,29.991654
F,22.672417,52.524952,92.034759,85.913331
G,58.281724,45.827446,33.998533,62.361662
H,78.241985,19.128832,25.492364,93.876845
I,39.470494,88.937735,87.780681,20.254174


In [3]:
# Data type of each column
df.dtypes

coluna1    float64
coluna2    float64
coluna3    float64
coluna4    float64
dtype: object

In [4]:
# Cast coluna1 to integer; the fractional part is dropped
# (e.g. 30.635697 becomes 30), then re-check the dtypes.
df['coluna1'] = df['coluna1'].astype(int)
df.dtypes

coluna1      int64
coluna2    float64
coluna3    float64
coluna4    float64
dtype: object

In [5]:
# Cast coluna3 to str; pandas reports the resulting string column as dtype object.
df['coluna3'] = df['coluna3'].astype(str)
df.dtypes

coluna1      int64
coluna2    float64
coluna3     object
coluna4    float64
dtype: object

In [6]:
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30,52.759081,96.84701816941651,59.796397
B,90,86.951265,4.648833603411962,73.210462
C,75,20.56534,34.59102121366824,38.354876
D,11,30.184424,2.9499963618994878,68.205526
E,67,26.489613,78.75634111885834,29.991654
F,22,52.524952,92.03475879518618,85.913331
G,58,45.827446,33.998532598187595,62.361662
H,78,19.128832,25.49236368867439,93.876845
I,39,88.937735,87.780680941862,20.254174


In [7]:
# Positional access by (row, column), zero-based: row 4 is E, column 2 is coluna3.
# The value comes back as a str because coluna3 was cast to str.
df.iloc[4, 2]

'78.75634111885834'

In [8]:
# Insert missing values (NaN) into the frame.
# coluna1 was cast to int64 earlier and an integer column cannot hold NaN.
# Older pandas upcast the column silently on assignment; pandas 2.1+ flags
# that as an incompatible-dtype assignment (PDEP-6), so convert to float64
# explicitly before writing NaN.
df['coluna1'] = df['coluna1'].astype('float64')
df.iloc[[1, 3, 4, 8], 0] = np.nan  # rows B, D, E, I of coluna1
df.iloc[[4, 6], 3] = np.nan        # rows E, G of coluna4

In [9]:
# NaN = Not a Number
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.0,52.759081,96.84701816941651,59.796397
B,,86.951265,4.648833603411962,73.210462
C,75.0,20.56534,34.59102121366824,38.354876
D,,30.184424,2.9499963618994878,68.205526
E,,26.489613,78.75634111885834,
F,22.0,52.524952,92.03475879518618,85.913331
G,58.0,45.827446,33.998532598187595,
H,78.0,19.128832,25.49236368867439,93.876845
I,,88.937735,87.780680941862,20.254174


In [10]:
# Flag missing values: True where a cell is NaN.
# isnull() is an alias of isna(). Only the last expression of a cell is
# rendered, so the result of the first call is discarded here.
df.isna()
df.isnull()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,False,False,False,False
B,True,False,False,False
C,False,False,False,False
D,True,False,False,False
E,True,False,False,True
F,False,False,False,False
G,False,False,False,True
H,False,False,False,False
I,True,False,False,False


In [11]:
# Missing-value mask for a single column
df['coluna1'].isna()

A    False
B     True
C    False
D     True
E     True
F    False
G    False
H    False
I     True
Name: coluna1, dtype: bool

In [12]:
# Keep only the rows whose coluna1 is NaN
df.loc[df['coluna1'].isna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
B,,86.951265,4.648833603411962,73.210462
D,,30.184424,2.9499963618994878,68.205526
E,,26.489613,78.75634111885834,
I,,88.937735,87.780680941862,20.254174


In [13]:
# Keep only the rows whose coluna1 is not NaN
df[df['coluna1'].notna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.0,52.759081,96.84701816941651,59.796397
C,75.0,20.56534,34.59102121366824,38.354876
F,22.0,52.524952,92.03475879518618,85.913331
G,58.0,45.827446,33.998532598187595,
H,78.0,19.128832,25.49236368867439,93.876845


In [14]:
# Number of NaN values in one column (True counts as 1 when summed)
df.coluna1.isna().sum()

4

In [15]:
# Number of NaN values per column of the frame.
# Summing the boolean mask works because True counts as 1 and False as 0.
df.isna().sum()

coluna1    4
coluna2    0
coluna3    0
coluna4    2
dtype: int64

In [16]:
# Fraction of missing values in each column.
# The mean of a boolean mask is (number of True) / (number of rows),
# i.e. the NaN count divided by the row count.

df.isna().mean()

coluna1    0.444444
coluna2    0.000000
coluna3    0.000000
coluna4    0.222222
dtype: float64

### Tratando dados ausentes

In [17]:
df['coluna1']

A    30.0
B     NaN
C    75.0
D     NaN
E     NaN
F    22.0
G    58.0
H    78.0
I     NaN
Name: coluna1, dtype: float64

In [18]:
# Replace missing values with zero.
# fillna returns a new Series; the result is not assigned, so df is unchanged.

df['coluna1'].fillna(0)

A    30.0
B     0.0
C    75.0
D     0.0
E     0.0
F    22.0
G    58.0
H    78.0
I     0.0
Name: coluna1, dtype: float64

In [19]:
# Replace missing values with the column mean
mean = df['coluna1'].mean() # computed over the non-NaN values only

df['coluna1'].fillna(mean)

A    30.0
B    52.6
C    75.0
D    52.6
E    52.6
F    22.0
G    58.0
H    78.0
I    52.6
Name: coluna1, dtype: float64

In [20]:
# Replace missing values with the column median.
# Series.median() skips NaN and does not require sorted input, so the
# sort_values() call that used to precede it was redundant work.

median = df['coluna1'].median()

df['coluna1'].fillna(median)

A    30.0
B    58.0
C    75.0
D    58.0
E    58.0
F    22.0
G    58.0
H    78.0
I    58.0
Name: coluna1, dtype: float64

### Substituindo usando ffill e bfill

In [21]:
df['coluna1']

A    30.0
B     NaN
C    75.0
D     NaN
E     NaN
F    22.0
G    58.0
H    78.0
I     NaN
Name: coluna1, dtype: float64

In [22]:
# Forward fill: each NaN takes the last valid value above it.
# Series.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in recent pandas releases.
df['coluna1'].ffill()

A    30.0
B    30.0
C    75.0
D    75.0
E    75.0
F    22.0
G    58.0
H    78.0
I    78.0
Name: coluna1, dtype: float64

In [23]:
# Backward fill: each NaN takes the next valid value below it.
# A trailing NaN (row I here) stays NaN because nothing follows it.
# Series.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna is deprecated in recent pandas releases.
df['coluna1'].bfill()

A    30.0
B    75.0
C    75.0
D    22.0
E    22.0
F    22.0
G    58.0
H    78.0
I     NaN
Name: coluna1, dtype: float64

### Dropando as linhas com missing

In [24]:
# Drop every row that contains at least one NaN.
# Returns a new frame; df itself is not modified.
df.dropna()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.0,52.759081,96.84701816941651,59.796397
C,75.0,20.56534,34.59102121366824,38.354876
F,22.0,52.524952,92.03475879518618,85.913331
H,78.0,19.128832,25.49236368867439,93.876845


### Dados Duplicados

In [25]:
# Build a frame with duplicated rows by stacking rows D..H onto df a second
# time, then sorting by index so the duplicates sit next to each other.
# pd.concat replaces DataFrame.append, which was deprecated (it emitted a
# FutureWarning) and removed in pandas 2.0.

df_dup = pd.concat([df, df.loc['D':'H', :]]).sort_index()
df_dup

  df_dup = df.append(df.loc['D': 'H',:]).sort_index()


Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.0,52.759081,96.84701816941651,59.796397
B,,86.951265,4.648833603411962,73.210462
C,75.0,20.56534,34.59102121366824,38.354876
D,,30.184424,2.9499963618994878,68.205526
D,,30.184424,2.9499963618994878,68.205526
E,,26.489613,78.75634111885834,
E,,26.489613,78.75634111885834,
F,22.0,52.524952,92.03475879518618,85.913331
F,22.0,52.524952,92.03475879518618,85.913331
G,58.0,45.827446,33.998532598187595,


In [26]:
# Drop fully duplicated rows, keeping the first occurrence of each.
# Returns a new frame; df_dup itself is not modified.
df_dup.drop_duplicates()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.0,52.759081,96.84701816941651,59.796397
B,,86.951265,4.648833603411962,73.210462
C,75.0,20.56534,34.59102121366824,38.354876
D,,30.184424,2.9499963618994878,68.205526
E,,26.489613,78.75634111885834,
F,22.0,52.524952,92.03475879518618,85.913331
G,58.0,45.827446,33.998532598187595,
H,78.0,19.128832,25.49236368867439,93.876845
I,,88.937735,87.780680941862,20.254174


In [27]:
# Drop duplicates looking at a single column only: rows that repeat a coluna1
# value already seen are removed. NaN values are treated as equal to each
# other, so only the first NaN row (B) is kept.
df_dup.drop_duplicates(subset=['coluna1'])

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,30.0,52.759081,96.84701816941651,59.796397
B,,86.951265,4.648833603411962,73.210462
C,75.0,20.56534,34.59102121366824,38.354876
F,22.0,52.524952,92.03475879518618,85.913331
G,58.0,45.827446,33.998532598187595,
H,78.0,19.128832,25.49236368867439,93.876845


### Mapeamentos (map)

In [28]:
# Fifteen random draws, each either 0 or 1 (the upper bound 2 is exclusive)

rand_list = [np.random.randint(0, 2) for _ in range(15)]

In [29]:
# Wrap the random 0/1 list in a Series so it can be mapped to labels
genero = pd.Series(rand_list)
genero

0     1
1     1
2     1
3     0
4     1
5     0
6     1
7     1
8     1
9     1
10    1
11    1
12    0
13    0
14    0
dtype: int64

In [30]:
# Series.map with a dict replaces each value by the entry stored under that key.
# Returns a new Series; genero itself is not modified.
genero.map({1: 'Feminino', 0: 'Masculino'})

0      Feminino
1      Feminino
2      Feminino
3     Masculino
4      Feminino
5     Masculino
6      Feminino
7      Feminino
8      Feminino
9      Feminino
10     Feminino
11     Feminino
12    Masculino
13    Masculino
14    Masculino
dtype: object

## 5.2 Renomeando índices e colunas

### Renomeando colunas

In [31]:
# Current column labels
df_2.columns

Index(['coluna1', 'coluna2', 'coluna3', 'coluna4'], dtype='object')

In [32]:
# Rename every column at once by assigning a new list of labels
# (it must have one entry per existing column, in order).
df_2.columns = ['segunda', 'terca', 'quarta', 'quinta']
df_2

Unnamed: 0,segunda,terca,quarta,quinta
A,30.635697,52.759081,96.847018,59.796397
B,90.163356,86.951265,4.648834,73.210462
C,75.907645,20.56534,34.591021,38.354876
D,11.774104,30.184424,2.949996,68.205526
E,67.493517,26.489613,78.756341,29.991654
F,22.672417,52.524952,92.034759,85.913331
G,58.281724,45.827446,33.998533,62.361662
H,78.241985,19.128832,25.492364,93.876845
I,39.470494,88.937735,87.780681,20.254174


### Renomeando índices

In [33]:
# Build the new row labels (LETRA_A .. LETRA_I) and store them in a column
list_index = [f"LETRA_{chr(code)}" for code in range(ord("A"), ord("J"))]

df_2['coluna_index'] = list_index
df_2

Unnamed: 0,segunda,terca,quarta,quinta,coluna_index
A,30.635697,52.759081,96.847018,59.796397,LETRA_A
B,90.163356,86.951265,4.648834,73.210462,LETRA_B
C,75.907645,20.56534,34.591021,38.354876,LETRA_C
D,11.774104,30.184424,2.949996,68.205526,LETRA_D
E,67.493517,26.489613,78.756341,29.991654,LETRA_E
F,22.672417,52.524952,92.034759,85.913331,LETRA_F
G,58.281724,45.827446,33.998533,62.361662,LETRA_G
H,78.241985,19.128832,25.492364,93.876845,LETRA_H
I,39.470494,88.937735,87.780681,20.254174,LETRA_I


In [34]:
# Use the new column as the row index.
# set_index returns a new frame; the result is only displayed, not assigned,
# so df_2 keeps its original A..I index and the coluna_index column.
df_2.set_index('coluna_index')

Unnamed: 0_level_0,segunda,terca,quarta,quinta
coluna_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LETRA_A,30.635697,52.759081,96.847018,59.796397
LETRA_B,90.163356,86.951265,4.648834,73.210462
LETRA_C,75.907645,20.56534,34.591021,38.354876
LETRA_D,11.774104,30.184424,2.949996,68.205526
LETRA_E,67.493517,26.489613,78.756341,29.991654
LETRA_F,22.672417,52.524952,92.034759,85.913331
LETRA_G,58.281724,45.827446,33.998533,62.361662
LETRA_H,78.241985,19.128832,25.492364,93.876845
LETRA_I,39.470494,88.937735,87.780681,20.254174


In [35]:
# Reset to the default 0..n-1 index; drop=True discards the current index
# (A..I) instead of turning it into a column. The result is not assigned
# back, so df_2 is unchanged.
df_2.reset_index(drop=True)

Unnamed: 0,segunda,terca,quarta,quinta,coluna_index
0,30.635697,52.759081,96.847018,59.796397,LETRA_A
1,90.163356,86.951265,4.648834,73.210462,LETRA_B
2,75.907645,20.56534,34.591021,38.354876,LETRA_C
3,11.774104,30.184424,2.949996,68.205526,LETRA_D
4,67.493517,26.489613,78.756341,29.991654,LETRA_E
5,22.672417,52.524952,92.034759,85.913331,LETRA_F
6,58.281724,45.827446,33.998533,62.361662,LETRA_G
7,78.241985,19.128832,25.492364,93.876845,LETRA_H
8,39.470494,88.937735,87.780681,20.254174,LETRA_I


## 5.3 Categorização e dummies

### Categorização

In [36]:
# Small example table: one person per row with their IMC (body mass index)
nomes = ['Fernando', 'Maria', 'Felipe', 'Pedro', 'Bianca', 'Beatriz', 'Patricia', 'Lucia']
valores_imc = [27, 26, 25, 16, 16.7, 17.5, 18.6, 24]
df_imc = pd.DataFrame({'nome': nomes, 'imc': valores_imc})
df_imc

Unnamed: 0,nome,imc
0,Fernando,27.0
1,Maria,26.0
2,Felipe,25.0
3,Pedro,16.0
4,Bianca,16.7
5,Beatriz,17.5
6,Patricia,18.6
7,Lucia,24.0


In [37]:
# Categorize each row by IMC using contiguous, half-open ranges:
#   imc < 17           -> muito_abaixo
#   17   <= imc < 18.5 -> abaixo
#   18.5 <= imc < 25   -> normal
#   imc >= 25          -> acima
# The previous bounds (<= 16.9, > 17, <= 18.4, > 18.5, <= 24.9) left gaps, so
# values such as 17.0, 18.5 or 24.95 matched no rule and ended up as NaN.
df_imc.loc[df_imc['imc'] < 17, 'imc_cat'] = 'muito_abaixo'
df_imc.loc[(df_imc['imc'] >= 17) & (df_imc['imc'] < 18.5), 'imc_cat'] = 'abaixo'
df_imc.loc[(df_imc['imc'] >= 18.5) & (df_imc['imc'] < 25), 'imc_cat'] = 'normal'
df_imc.loc[df_imc['imc'] >= 25, 'imc_cat'] = 'acima'

In [38]:
df_imc

Unnamed: 0,nome,imc,imc_cat
0,Fernando,27.0,acima
1,Maria,26.0,acima
2,Felipe,25.0,acima
3,Pedro,16.0,muito_abaixo
4,Bianca,16.7,muito_abaixo
5,Beatriz,17.5,abaixo
6,Patricia,18.6,normal
7,Lucia,24.0,normal


In [39]:
# Derive a two-level label from imc_cat: 'normal' rows are healthy,
# every other row (including rows without a category) is not.
eh_normal = df_imc['imc_cat'] == 'normal'
df_imc['imc_cat_2'] = eh_normal.map({True: 'saudavel', False: 'nao_saudavel'})

In [40]:
df_imc

Unnamed: 0,nome,imc,imc_cat,imc_cat_2
0,Fernando,27.0,acima,nao_saudavel
1,Maria,26.0,acima,nao_saudavel
2,Felipe,25.0,acima,nao_saudavel
3,Pedro,16.0,muito_abaixo,nao_saudavel
4,Bianca,16.7,muito_abaixo,nao_saudavel
5,Beatriz,17.5,abaixo,nao_saudavel
6,Patricia,18.6,normal,saudavel
7,Lucia,24.0,normal,saudavel


In [41]:
# Sort the IMC values in ascending order to see where the quartile cut
# points fall; the original row labels are kept on the left.

df_imc['imc'].sort_values()

3    16.0
4    16.7
5    17.5
6    18.6
7    24.0
2    25.0
1    26.0
0    27.0
Name: imc, dtype: float64

In [42]:
# Quartile cut points of imc: 25th, 50th (median) and 75th percentiles
df_imc['imc'].quantile([0.25, 0.5, 0.75])

0.25    17.30
0.50    21.30
0.75    25.25
Name: imc, dtype: float64

In [43]:
# Label each row with the IMC quartile it falls into.
# The cut points are computed from the data instead of being copied by hand
# from a previous output (17.3 / 21.3 / 25.25), so the labels stay correct
# if the rows of df_imc change.
q1, q2, q3 = df_imc['imc'].quantile([0.25, 0.5, 0.75])
df_imc.loc[df_imc['imc'] <= q1, 'imc_cat_3'] = '1o_quartil'
df_imc.loc[(df_imc['imc'] > q1) & (df_imc['imc'] <= q2), 'imc_cat_3'] = '2o_quartil'
df_imc.loc[(df_imc['imc'] > q2) & (df_imc['imc'] <= q3), 'imc_cat_3'] = '3o_quartil'
df_imc.loc[df_imc['imc'] > q3, 'imc_cat_3'] = '4o_quartil'

In [44]:
df_imc

Unnamed: 0,nome,imc,imc_cat,imc_cat_2,imc_cat_3
0,Fernando,27.0,acima,nao_saudavel,4o_quartil
1,Maria,26.0,acima,nao_saudavel,4o_quartil
2,Felipe,25.0,acima,nao_saudavel,3o_quartil
3,Pedro,16.0,muito_abaixo,nao_saudavel,1o_quartil
4,Bianca,16.7,muito_abaixo,nao_saudavel,1o_quartil
5,Beatriz,17.5,abaixo,nao_saudavel,2o_quartil
6,Patricia,18.6,normal,saudavel,2o_quartil
7,Lucia,24.0,normal,saudavel,3o_quartil


### Construindo dummies

In [45]:
# One indicator (dummy) column per distinct value of the categorical column;
# each row gets a 1 in the column matching its own category.
pd.get_dummies(df_imc['imc_cat_2'])

Unnamed: 0,nao_saudavel,saudavel
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,0,1
7,0,1


In [46]:
# Indicator columns for the four imc_cat categories, kept in a variable so
# they can be attached to df_imc.
dummies = pd.get_dummies(df_imc['imc_cat'])
dummies

Unnamed: 0,abaixo,acima,muito_abaixo,normal
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0
5,1,0,0,0
6,0,0,0,1
7,0,0,0,1


In [47]:
# Attach the dummy columns to the frame side by side (axis=1 joins on the index).
# NOTE: df_imc is rebound to the widened frame, so running this cell a second
# time appends the same dummy columns again.
df_imc = pd.concat([df_imc, dummies], axis=1)

In [48]:
df_imc

Unnamed: 0,nome,imc,imc_cat,imc_cat_2,imc_cat_3,abaixo,acima,muito_abaixo,normal
0,Fernando,27.0,acima,nao_saudavel,4o_quartil,0,1,0,0
1,Maria,26.0,acima,nao_saudavel,4o_quartil,0,1,0,0
2,Felipe,25.0,acima,nao_saudavel,3o_quartil,0,1,0,0
3,Pedro,16.0,muito_abaixo,nao_saudavel,1o_quartil,0,0,1,0
4,Bianca,16.7,muito_abaixo,nao_saudavel,1o_quartil,0,0,1,0
5,Beatriz,17.5,abaixo,nao_saudavel,2o_quartil,1,0,0,0
6,Patricia,18.6,normal,saudavel,2o_quartil,0,0,0,1
7,Lucia,24.0,normal,saudavel,3o_quartil,0,0,0,1


## 5.4 Amostragem