# Módulo 5 - Limpeza e preparação de dados

## 5.1 Identificando e tratando dados ausentes

### Identificando dados ausentes

In [70]:
import pandas as pd
import numpy as np

In [71]:
# build a 9x4 DataFrame of random floats in [0, 100), with rows labelled A..I
# seed the global NumPy generator so that re-running the notebook from a
# fresh kernel reproduces the same values
np.random.seed(42)
df = pd.DataFrame(
    np.random.rand(9, 4) * 100,
    index=[chr(code) for code in range(ord("A"), ord("J"))],
    columns=[f"coluna{n}" for n in range(1, 5)],
)
# untouched copy of the original values, used by the renaming examples
df_2 = df.copy()
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.179672,14.672962,57.490352,15.806837
B,14.213093,52.069113,37.434942,55.685979
C,73.565413,61.725587,86.465914,15.099232
D,25.293236,62.272979,33.775578,14.414189
E,17.667669,44.66596,22.983736,56.998505
F,15.206993,98.756317,2.321883,48.221539
G,81.134411,97.625692,27.721562,19.154007
H,80.141295,27.992726,51.985793,57.619315
I,69.61865,81.051657,29.704625,92.310521


In [72]:
# inspect the dtype of each column
df.dtypes

coluna1    float64
coluna2    float64
coluna3    float64
coluna4    float64
dtype: object

In [73]:
# cast coluna1 to integer; astype(int) drops the fractional part (no rounding)
df['coluna1'] = df['coluna1'].astype(int)
df.dtypes

coluna1      int64
coluna2    float64
coluna3    float64
coluna4    float64
dtype: object

In [74]:
# cast coluna3 to string; its dtype is then reported as object
df['coluna3'] = df['coluna3'].astype(str)
df.dtypes

coluna1      int64
coluna2    float64
coluna3     object
coluna4    float64
dtype: object

In [75]:
# show the frame after the dtype conversions
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99,14.672962,57.49035237811946,15.806837
B,14,52.069113,37.43494182300133,55.685979
C,73,61.725587,86.46591445211101,15.099232
D,25,62.272979,33.77557806831292,14.414189
E,17,44.66596,22.983735705888154,56.998505
F,15,98.756317,2.321882855099622,48.221539
G,81,97.625692,27.721561978687724,19.154007
H,80,27.992726,51.98579348728204,57.619315
I,69,81.051657,29.70462523817225,92.310521


In [76]:
# access a single value by zero-based row and column position
# (row 4, column 2 is a string here, since coluna3 was cast to str)
df.iloc[4, 2]

'22.983735705888154'

In [77]:
# insert missing values (NaN) at selected (row, column) positions
# coluna1 was cast to int earlier and an integer column cannot hold NaN;
# cast it to float explicitly instead of relying on pandas' implicit
# int -> float upcast on assignment (deprecated in recent pandas versions)
df['coluna1'] = df['coluna1'].astype(float)
df.iloc[1, 0] = np.nan
df.iloc[4, 0] = np.nan
df.iloc[3, 0] = np.nan
df.iloc[8, 0] = np.nan
df.iloc[6, 3] = np.nan
df.iloc[4, 3] = np.nan

In [78]:
# NaN = Not a Number; the inserted missing values show up as NaN
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.0,14.672962,57.49035237811946,15.806837
B,,52.069113,37.43494182300133,55.685979
C,73.0,61.725587,86.46591445211101,15.099232
D,,62.272979,33.77557806831292,14.414189
E,,44.66596,22.983735705888154,
F,15.0,98.756317,2.321882855099622,48.221539
G,81.0,97.625692,27.721561978687724,
H,80.0,27.992726,51.98579348728204,57.619315
I,,81.051657,29.70462523817225,92.310521


In [79]:
# flag missing values cell by cell (True where the value is NaN)
# isnull() is an alias of isna(), and only the last expression of a cell is
# displayed, so a single call is enough
df.isna()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,False,False,False,False
B,True,False,False,False
C,False,False,False,False
D,True,False,False,False
E,True,False,False,True
F,False,False,False,False
G,False,False,False,True
H,False,False,False,False
I,True,False,False,False


In [80]:
# boolean mask of the missing values in a single column
df['coluna1'].isna()

A    False
B     True
C    False
D     True
E     True
F    False
G    False
H    False
I     True
Name: coluna1, dtype: bool

In [81]:
# select every row whose coluna1 value is NaN
df.loc[df['coluna1'].isna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
B,,52.069113,37.43494182300133,55.685979
D,,62.272979,33.77557806831292,14.414189
E,,44.66596,22.983735705888154,
I,,81.051657,29.70462523817225,92.310521


In [82]:
# select every row whose coluna1 value is present (not NaN)
df[df['coluna1'].notna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.0,14.672962,57.49035237811946,15.806837
C,73.0,61.725587,86.46591445211101,15.099232
F,15.0,98.756317,2.321882855099622,48.221539
G,81.0,97.625692,27.721561978687724,
H,80.0,27.992726,51.98579348728204,57.619315


In [83]:
# number of NaN values in one column (each True in the mask counts as 1)
df['coluna1'].isna().sum()

4

In [84]:
# count the NaN values in every column of the DataFrame
# summing booleans treats True as 1
# and False as 0
df.isna().sum()

coluna1    4
coluna2    0
coluna3    0
coluna4    2
dtype: int64

In [85]:
# fraction of missing values in each column (a value between 0 and 1,
# not yet multiplied by 100)
# the mean of a boolean mask is the number of True divided by the row count

df.isna().mean()

coluna1    0.444444
coluna2    0.000000
coluna3    0.000000
coluna4    0.222222
dtype: float64

### Tratando dados ausentes

In [86]:
# current state of coluna1, still containing NaN
df['coluna1']

A    99.0
B     NaN
C    73.0
D     NaN
E     NaN
F    15.0
G    81.0
H    80.0
I     NaN
Name: coluna1, dtype: float64

In [87]:
# replace missing values with zero
# (the filled Series is only displayed, not assigned back to df)

df['coluna1'].fillna(0)

A    99.0
B     0.0
C    73.0
D     0.0
E     0.0
F    15.0
G    81.0
H    80.0
I     0.0
Name: coluna1, dtype: float64

In [88]:
# replace missing values with the column mean
mean = df['coluna1'].mean() # mean of the non-missing values

df['coluna1'].fillna(mean)

A    99.0
B    69.6
C    73.0
D    69.6
E    69.6
F    15.0
G    81.0
H    80.0
I    69.6
Name: coluna1, dtype: float64

In [89]:
# replace missing values with the column median
# median() works on unsorted data, so no sort_values() is needed beforehand

median = df['coluna1'].median()

df['coluna1'].fillna(median)

A    99.0
B    80.0
C    73.0
D    80.0
E    80.0
F    15.0
G    81.0
H    80.0
I    80.0
Name: coluna1, dtype: float64

### Substituindo usando ffill e bfill

In [90]:
# coluna1 before filling, for comparison with the results below
df['coluna1']

A    99.0
B     NaN
C    73.0
D     NaN
E     NaN
F    15.0
G    81.0
H    80.0
I     NaN
Name: coluna1, dtype: float64

In [91]:
# forward fill: each NaN takes the last valid value above it
# (Series.ffill() replaces the deprecated fillna(method='ffill'))
df['coluna1'].ffill()

A    99.0
B    99.0
C    73.0
D    73.0
E    73.0
F    15.0
G    81.0
H    80.0
I    80.0
Name: coluna1, dtype: float64

In [92]:
# backward fill: each NaN takes the next valid value below it;
# a NaN in the last row stays NaN because nothing follows it
# (Series.bfill() replaces the deprecated fillna(method='bfill'))
df['coluna1'].bfill()

A    99.0
B    73.0
C    73.0
D    15.0
E    15.0
F    15.0
G    81.0
H    80.0
I     NaN
Name: coluna1, dtype: float64

### Dropando as linhas com missing

In [93]:
# drop every row that contains at least one NaN value
df.dropna()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.0,14.672962,57.49035237811946,15.806837
C,73.0,61.725587,86.46591445211101,15.099232
F,15.0,98.756317,2.321882855099622,48.221539
H,80.0,27.992726,51.98579348728204,57.619315


### Dados Duplicados

In [94]:
# build a DataFrame with duplicated rows by stacking rows 'D'..'H' onto df
# pd.concat replaces DataFrame.append, which is deprecated and was removed
# in pandas 2.0

df_dup = pd.concat([df, df.loc['D':'H', :]]).sort_index()
df_dup

  df_dup = df.append(df.loc['D': 'H',:]).sort_index()


Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.0,14.672962,57.49035237811946,15.806837
B,,52.069113,37.43494182300133,55.685979
C,73.0,61.725587,86.46591445211101,15.099232
D,,62.272979,33.77557806831292,14.414189
D,,62.272979,33.77557806831292,14.414189
E,,44.66596,22.983735705888154,
E,,44.66596,22.983735705888154,
F,15.0,98.756317,2.321882855099622,48.221539
F,15.0,98.756317,2.321882855099622,48.221539
G,81.0,97.625692,27.721561978687724,


In [95]:
# drop rows that are exact duplicates of an earlier row
df_dup.drop_duplicates()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.0,14.672962,57.49035237811946,15.806837
B,,52.069113,37.43494182300133,55.685979
C,73.0,61.725587,86.46591445211101,15.099232
D,,62.272979,33.77557806831292,14.414189
E,,44.66596,22.983735705888154,
F,15.0,98.756317,2.321882855099622,48.221539
G,81.0,97.625692,27.721561978687724,
H,80.0,27.992726,51.98579348728204,57.619315
I,,81.051657,29.70462523817225,92.310521


In [96]:
# drop duplicates judged by a single column: only the first row for each
# distinct coluna1 value is kept (the NaN rows count as one repeated value)
df_dup.drop_duplicates(subset=['coluna1'])

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,99.0,14.672962,57.49035237811946,15.806837
B,,52.069113,37.43494182300133,55.685979
C,73.0,61.725587,86.46591445211101,15.099232
F,15.0,98.756317,2.321882855099622,48.221539
G,81.0,97.625692,27.721561978687724,
H,80.0,27.992726,51.98579348728204,57.619315


### Mapeamentos (map)

In [97]:
# random list of fifteen values, each 0 or 1 (the upper bound 2 is exclusive)
# drawn in one vectorized call instead of fifteen separate calls;
# tolist() keeps rand_list a plain Python list of ints

rand_list = np.random.randint(0, 2, size=15).tolist()

In [98]:
# wrap the random 0/1 list in a Series
genero = pd.Series(rand_list)
genero

0     1
1     0
2     1
3     1
4     0
5     0
6     0
7     0
8     0
9     1
10    0
11    0
12    0
13    1
14    1
dtype: int64

In [99]:
# translate each numeric code into its label through a dict lookup;
# map returns a new Series and leaves genero itself untouched
genero.map({0: 'Masculino', 1: 'Feminino'})

0      Feminino
1     Masculino
2      Feminino
3      Feminino
4     Masculino
5     Masculino
6     Masculino
7     Masculino
8     Masculino
9      Feminino
10    Masculino
11    Masculino
12    Masculino
13     Feminino
14     Feminino
dtype: object

## 5.2 Renomeando índices e colunas

### Renomeando colunas

In [102]:
# list the current column labels
df_2.columns

Index(['coluna1', 'coluna2', 'coluna3', 'coluna4'], dtype='object')

In [104]:
# rename all four columns at once by assigning a new list of labels
df_2.columns = ['segunda', 'terca', 'quarta', 'quinta']
df_2

Unnamed: 0,segunda,terca,quarta,quinta
A,99.179672,14.672962,57.490352,15.806837
B,14.213093,52.069113,37.434942,55.685979
C,73.565413,61.725587,86.465914,15.099232
D,25.293236,62.272979,33.775578,14.414189
E,17.667669,44.66596,22.983736,56.998505
F,15.206993,98.756317,2.321883,48.221539
G,81.134411,97.625692,27.721562,19.154007
H,80.141295,27.992726,51.985793,57.619315
I,69.61865,81.051657,29.704625,92.310521


### Renomeando índices

In [109]:
# build the new labels (LETRA_A .. LETRA_I) and store them in a regular column
list_index = [f"LETRA_{chr(code)}" for code in range(ord("A"), ord("J"))]

df_2['coluna_index'] = list_index
df_2

Unnamed: 0,segunda,terca,quarta,quinta,coluna_index
A,99.179672,14.672962,57.490352,15.806837,LETRA_A
B,14.213093,52.069113,37.434942,55.685979,LETRA_B
C,73.565413,61.725587,86.465914,15.099232,LETRA_C
D,25.293236,62.272979,33.775578,14.414189,LETRA_D
E,17.667669,44.66596,22.983736,56.998505,LETRA_E
F,15.206993,98.756317,2.321883,48.221539,LETRA_F
G,81.134411,97.625692,27.721562,19.154007,LETRA_G
H,80.141295,27.992726,51.985793,57.619315,LETRA_H
I,69.61865,81.051657,29.704625,92.310521,LETRA_I


In [113]:
# use the newly created column as the index
# NOTE: the result is only displayed, not assigned, so df_2 itself keeps
# its original index and still has coluna_index as a regular column
df_2.set_index('coluna_index')

Unnamed: 0_level_0,segunda,terca,quarta,quinta
coluna_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LETRA_A,99.179672,14.672962,57.490352,15.806837
LETRA_B,14.213093,52.069113,37.434942,55.685979
LETRA_C,73.565413,61.725587,86.465914,15.099232
LETRA_D,25.293236,62.272979,33.775578,14.414189
LETRA_E,17.667669,44.66596,22.983736,56.998505
LETRA_F,15.206993,98.756317,2.321883,48.221539
LETRA_G,81.134411,97.625692,27.721562,19.154007
LETRA_H,80.141295,27.992726,51.985793,57.619315
LETRA_I,69.61865,81.051657,29.704625,92.310521


In [117]:
# reset to a default integer index; drop=True discards the previous index
# instead of keeping it as a column (the result is only displayed, not assigned)
df_2.reset_index(drop=True)

Unnamed: 0,segunda,terca,quarta,quinta,coluna_index
0,99.179672,14.672962,57.490352,15.806837,LETRA_A
1,14.213093,52.069113,37.434942,55.685979,LETRA_B
2,73.565413,61.725587,86.465914,15.099232,LETRA_C
3,25.293236,62.272979,33.775578,14.414189,LETRA_D
4,17.667669,44.66596,22.983736,56.998505,LETRA_E
5,15.206993,98.756317,2.321883,48.221539,LETRA_F
6,81.134411,97.625692,27.721562,19.154007,LETRA_G
7,80.141295,27.992726,51.985793,57.619315,LETRA_H
8,69.61865,81.051657,29.704625,92.310521,LETRA_I


## 5.3 Categorização e dummies

### Categorização