# Module 5 - Limpeza e preparação de dados

## 5.1 Identificando e tratando dados ausentes

### Identificando dados ausentes

In [1]:
import pandas as pd
import numpy as np

In [28]:
# construindo um dataframe
df = pd.DataFrame(
    np.random.rand(9, 4) * 100,
    index=[chr(i) for i in range(ord("A"), ord("J"))],
    columns=["coluna%s" %i for i in range(1, 5)]
)
df


Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.224568,5.334578,42.255191,17.274552
B,23.397801,75.997894,66.286743,50.555908
C,62.425611,14.712186,3.429024,97.32316
D,83.885714,71.14991,35.512545,28.286585
E,59.676419,56.793323,11.266395,4.236952
F,12.370253,83.181236,7.99077,17.765918
G,21.819332,77.891506,49.616287,62.071815
H,28.317081,14.235639,61.72276,36.772142
I,73.945177,1.565151,69.060792,89.420183


In [31]:
# tipos das colunas
df.dtypes

coluna1    float64
coluna2    float64
coluna3    float64
coluna4    float64
dtype: object

In [39]:
df['coluna1'] = df['coluna1'].astype(int)
df.dtypes

coluna1      int64
coluna2    float64
coluna3     object
coluna4    float64
dtype: object

In [40]:
df['coluna3'] = df['coluna3'].astype(str)
df.dtypes

coluna1      int64
coluna2    float64
coluna3     object
coluna4    float64
dtype: object

In [42]:
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19,5.334578,42.255190796214734,17.274552
B,23,75.997894,66.28674306770573,50.555908
C,62,14.712186,3.429024060428465,97.32316
D,83,71.14991,35.51254489778491,28.286585
E,59,56.793323,11.266395108539616,4.236952
F,12,83.181236,7.990770106266865,17.765918
G,21,77.891506,49.61628656373127,62.071815
H,28,14.235639,61.72276021602391,36.772142
I,73,1.565151,69.06079187224358,89.420183


In [45]:
# acessando valor por linha e coluna
df.iloc[4, 2]

'11.266395108539617'

In [56]:
# inserindo um dado faltante no dataframe
df.iloc[1, 0] = np.nan
df.iloc[4, 0] = np.nan
df.iloc[3, 0] = np.nan
df.iloc[8, 0] = np.nan
df.iloc[6, 3] = np.nan
df.iloc[4, 3] = np.nan

In [57]:
# NaN = Not a Number
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.0,5.334578,42.255190796214734,17.274552
B,,75.997894,66.28674306770573,50.555908
C,62.0,14.712186,3.429024060428465,97.32316
D,,71.14991,35.51254489778491,28.286585
E,,56.793323,,
F,12.0,83.181236,7.990770106266865,17.765918
G,21.0,77.891506,49.61628656373127,
H,28.0,14.235639,61.72276021602391,36.772142
I,,1.565151,69.06079187224358,


In [64]:
# identificando valores faltantes
df.isna()
df.isnull()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,False,False,False,False
B,True,False,False,False
C,False,False,False,False
D,True,False,False,False
E,True,False,True,True
F,False,False,False,False
G,False,False,False,True
H,False,False,False,False
I,True,False,False,True


In [66]:
# identificando apenas de uma coluna
df.coluna1.isna()

A    False
B     True
C    False
D     True
E     True
F    False
G    False
H    False
I     True
Name: coluna1, dtype: bool

In [71]:
# trazer todas as linhas onde a coluna é NaN
df[df.coluna1.isna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
B,,75.997894,66.28674306770573,50.555908
D,,71.14991,35.51254489778491,28.286585
E,,56.793323,,
I,,1.565151,69.06079187224358,


In [72]:
# trazer todas as linhas onde a coluna não é NaN
df[~df.coluna1.isna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.0,5.334578,42.255190796214734,17.274552
C,62.0,14.712186,3.429024060428465,97.32316
F,12.0,83.181236,7.990770106266865,17.765918
G,21.0,77.891506,49.61628656373127,
H,28.0,14.235639,61.72276021602391,36.772142


In [75]:
# identificando a quantidade de valores NaN em uma coluna
df.coluna1.isna().sum()

4

In [77]:
# identificando a quantidade de valores NaN no dataframe
# true = 1
# false = 0
df.isna().sum()

coluna1    4
coluna2    0
coluna3    1
coluna4    3
dtype: int64

In [88]:
# len(df.index) -> quantidade de linhas de um dataframe
# cáculo para saber a porcentagem de valores faltantes em cada coluna

df.isna().sum() / len(df.index)

coluna1    0.444444
coluna2    0.000000
coluna3    0.111111
coluna4    0.333333
dtype: float64

### Tratando dados ausentes

In [90]:
df['coluna1']

A    19.0
B     NaN
C    62.0
D     NaN
E     NaN
F    12.0
G    21.0
H    28.0
I     NaN
Name: coluna1, dtype: float64

In [95]:
# subistituindo por zero

df['coluna1'].fillna(0)

A    19.0
B     0.0
C    62.0
D     0.0
E     0.0
F    12.0
G    21.0
H    28.0
I     0.0
Name: coluna1, dtype: float64

In [100]:
# subistituindo pela media
mean = df['coluna1'].mean() # média

df['coluna1'].fillna(mean)

A    19.0
B    28.4
C    62.0
D    28.4
E    28.4
F    12.0
G    21.0
H    28.0
I    28.4
Name: coluna1, dtype: float64

In [109]:
# subistituindo pela mediana

median = df['coluna1'].sort_values().median()

df['coluna1'].fillna(median)

A    19.0
B    21.0
C    62.0
D    21.0
E    21.0
F    12.0
G    21.0
H    28.0
I    21.0
Name: coluna1, dtype: float64

### Subistituindo usando ffill e o bfill

In [113]:
df['coluna1']

A    19.0
B     NaN
C    62.0
D     NaN
E     NaN
F    12.0
G    21.0
H    28.0
I     NaN
Name: coluna1, dtype: float64

In [115]:
# pega o valor de cima e aplica para os de baixo
df['coluna1'].fillna(method='ffill')

A    19.0
B    19.0
C    62.0
D    62.0
E    62.0
F    12.0
G    21.0
H    28.0
I    28.0
Name: coluna1, dtype: float64

In [116]:
# pega o valor de baixo e aplica para os de cima
df['coluna1'].fillna(method='bfill')

A    19.0
B    62.0
C    62.0
D    12.0
E    12.0
F    12.0
G    21.0
H    28.0
I     NaN
Name: coluna1, dtype: float64

### Dropando as linhas com missing

In [120]:
# dropando todas as linhas que contem valores NaN
df.dropna()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.0,5.334578,42.255190796214734,17.274552
C,62.0,14.712186,3.429024060428465,97.32316
F,12.0,83.181236,7.990770106266865,17.765918
H,28.0,14.235639,61.72276021602391,36.772142


### Dados Duplicados

In [125]:
# criando um dataframe com dados duplicados

df_dup = df.append(df.loc['D': 'H',:]).sort_index()
df_dup

  df_dup = df.append(df.loc['D': 'H',:]).sort_index()


Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.0,5.334578,42.255190796214734,17.274552
B,,75.997894,66.28674306770573,50.555908
C,62.0,14.712186,3.429024060428465,97.32316
D,,71.14991,35.51254489778491,28.286585
D,,71.14991,35.51254489778491,28.286585
E,,56.793323,,
E,,56.793323,,
F,12.0,83.181236,7.990770106266865,17.765918
F,12.0,83.181236,7.990770106266865,17.765918
G,21.0,77.891506,49.61628656373127,


In [128]:
# dropando todas as linhas duplicadas
df_dup.drop_duplicates()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.0,5.334578,42.255190796214734,17.274552
B,,75.997894,66.28674306770573,50.555908
C,62.0,14.712186,3.429024060428465,97.32316
D,,71.14991,35.51254489778491,28.286585
E,,56.793323,,
F,12.0,83.181236,7.990770106266865,17.765918
G,21.0,77.891506,49.61628656373127,
H,28.0,14.235639,61.72276021602391,36.772142
I,,1.565151,69.06079187224358,


In [130]:
# dropando valores duplicados por analisando uma coluna
df_dup.drop_duplicates(subset=['coluna1'])

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,19.0,5.334578,42.255190796214734,17.274552
B,,75.997894,66.28674306770573,50.555908
C,62.0,14.712186,3.429024060428465,97.32316
F,12.0,83.181236,7.990770106266865,17.765918
G,21.0,77.891506,49.61628656373127,
H,28.0,14.235639,61.72276021602391,36.772142


### Mapeamentos (map)

In [144]:
# lista randomica de 0 ou 1 de quinze digitos

rand_list = [np.random.randint(0, 2) for i in range(0, 15)]

In [145]:
genero = pd.Series(rand_list)
genero

0     1
1     1
2     1
3     1
4     1
5     0
6     1
7     1
8     0
9     0
10    1
11    0
12    1
13    0
14    0
dtype: int64

In [146]:
# passa um dicionario para a série e troca os valores mapeados para o valor escolhido
genero.map({1: 'Feminino', 0: 'Masculino'})

0      Feminino
1      Feminino
2      Feminino
3      Feminino
4      Feminino
5     Masculino
6      Feminino
7      Feminino
8     Masculino
9     Masculino
10     Feminino
11    Masculino
12     Feminino
13    Masculino
14    Masculino
dtype: object

## 5.2 Renomeando indices e colunas