# Importação do Pandas

In [21]:
import pandas

# Visualização dos dados

In [22]:
# Read the sport-activity CSV (path relative to the notebook's working
# directory) into the DataFrame `df`, which the following cells reuse.
df = pandas.read_csv("data-aulapandas/sport-activity2.csv")

# A bare expression on the last line makes the notebook render the DataFrame.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


# Dados incorretos

Eventualmente, os dados não são nulos nem estão em formato incorreto: eles simplesmente contêm valores errados, seja por erro de digitação ou por algum outro fator.

Ao observar a linha 7 da coluna 'Duration', nota-se o valor 450; contudo, considerando os demais valores dessa coluna, esse valor provavelmente deveria ser 45.

Logo, o ajuste desse valor incorreto pode ser realizado por meio de:

In [23]:
# Overwrite the single mistyped cell: row label 7, column 'Duration' (450 -> 45).
# `.at` is the label-based scalar accessor; for one existing cell it performs
# the same assignment as `.loc`.
df.at[7, 'Duration'] = 45

# Render the corrected DataFrame as the cell output.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,45,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


Em casos em que o dataset é pequeno, é possível realizar a substituição manual conforme o código anterior.

Contudo, para datasets maiores, pode-se utilizar regras de substituição de valores:

In [24]:
# Reload the raw file so this example starts from the unmodified data.
df = pandas.read_csv("data-aulapandas/sport-activity2.csv")

# Rule-based replacement: cap every "Pulse" reading above 100 at 100.
# The boolean mask selects all offending rows at once, so a single vectorized
# assignment replaces the row-by-row Python loop. Missing values compare as
# False and are therefore left untouched, exactly as the loop did.
df.loc[df["Pulse"] > 100, "Pulse"] = 100

# Render the adjusted DataFrame as the cell output.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',100,130,409.1
1,60,'2020/12/02',100,145,479.0
2,60,'2020/12/03',100,135,340.0
3,45,'2020/12/04',100,175,282.4
4,45,'2020/12/05',100,148,406.0
5,60,'2020/12/06',100,127,300.0
6,60,'2020/12/07',100,136,374.0
7,450,'2020/12/08',100,134,253.3
8,30,'2020/12/09',100,133,195.1
9,60,'2020/12/10',98,124,269.0


Ou pode-se remover as linhas que contêm valores incorretos (aqui, aquelas em que 'Pulse' é maior que 100):

In [25]:
# Reload the raw file so this example starts from the unmodified data.
df = pandas.read_csv("data-aulapandas/sport-activity2.csv")

# Remove every row whose "Pulse" is above 100 in one vectorized step instead of
# dropping rows one at a time inside a loop. Selecting the offending labels and
# passing them to `drop` keeps the original index labels of the surviving rows
# and retains rows with a missing "Pulse" (NaN > 100 is False).
df = df.drop(index=df[df["Pulse"] > 100].index)

# Render the filtered DataFrame as the cell output.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
9,60,'2020/12/10',98,124,269.0
11,60,'2020/12/12',100,120,250.7
12,60,'2020/12/12',100,120,250.7
15,60,'2020/12/15',98,123,275.0
16,60,'2020/12/16',98,120,215.2
17,60,'2020/12/17',100,120,300.0
18,45,'2020/12/18',90,112,
20,45,'2020/12/20',97,125,243.0
22,45,,100,119,282.0
26,60,20201226,100,120,250.0


# Remoção de linhas duplicadas

Para identificar registros duplicados, o método **duplicated()** de um DataFrame retorna uma Series booleana que indica, para cada linha, se ela é uma repetição idêntica de uma linha anterior:

In [26]:
# Reload the raw file so this example starts from the unmodified data.
df = pandas.read_csv("data-aulapandas/sport-activity2.csv")

# Boolean Series with one entry per row; True marks a row that pandas
# considers a duplicate (default settings of `duplicated`).
duplicate_mask = df.duplicated()

# Each header literal ends with "\n" and `sep="\n"` adds one more newline,
# so a blank line separates the header from the object printed after it.
print("----------- Original DataFrame -----------\n", df, sep="\n")
print("\n----------- Duplicated Rows -----------\n", duplicate_mask, sep="\n")

----------- Original DataFrame -----------

    Duration          Date  Pulse  Maxpulse  Calories
0         60  '2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45  '2020/12/05'    117       148     406.0
5         60  '2020/12/06'    102       127     300.0
6         60  '2020/12/07'    110       136     374.0
7        450  '2020/12/08'    104       134     253.3
8         30  '2020/12/09'    109       133     195.1
9         60  '2020/12/10'     98       124     269.0
10        60  '2020/12/11'    103       147     329.3
11        60  '2020/12/12'    100       120     250.7
12        60  '2020/12/12'    100       120     250.7
13        60  '2020/12/13'    106       128     345.3
14        60  '2020/12/14'    104       132     379.3
15        60  '2020/12/15'     98       123     275.0
16        60  '2020/12/16'     98     

# Exemplo de adição de uma nova coluna com a informação de duplicados

In [27]:
# Reload the raw file and, in the same chain, append a boolean "Duplicated"
# column holding the result of `duplicated()` for each row.
df = pandas.read_csv("data-aulapandas/sport-activity2.csv").assign(
    Duplicated=lambda frame: frame.duplicated()
)

# Render the DataFrame, now including the new column, as the cell output.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Duplicated
0,60,'2020/12/01',110,130,409.1,False
1,60,'2020/12/02',117,145,479.0,False
2,60,'2020/12/03',103,135,340.0,False
3,45,'2020/12/04',109,175,282.4,False
4,45,'2020/12/05',117,148,406.0,False
5,60,'2020/12/06',102,127,300.0,False
6,60,'2020/12/07',110,136,374.0,False
7,450,'2020/12/08',104,134,253.3,False
8,30,'2020/12/09',109,133,195.1,False
9,60,'2020/12/10',98,124,269.0,False


# Removendo linhas duplicadas com base na nova coluna criada

In [28]:
# Relies on `df` already carrying the boolean "Duplicated" column.
# Drop every flagged row in one vectorized step: the boolean column is used
# directly as a mask (no `== True` comparison, no per-row loop), and the
# surviving rows keep their original index labels.
df = df.drop(index=df[df['Duplicated']].index)

# Render the de-duplicated DataFrame as the cell output.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Duplicated
0,60,'2020/12/01',110,130,409.1,False
1,60,'2020/12/02',117,145,479.0,False
2,60,'2020/12/03',103,135,340.0,False
3,45,'2020/12/04',109,175,282.4,False
4,45,'2020/12/05',117,148,406.0,False
5,60,'2020/12/06',102,127,300.0,False
6,60,'2020/12/07',110,136,374.0,False
7,450,'2020/12/08',104,134,253.3,False
8,30,'2020/12/09',109,133,195.1,False
9,60,'2020/12/10',98,124,269.0,False


A remoção das linhas duplicadas pode ser realizada por meio do método **drop_duplicates()**:

In [29]:
# Reload the raw file and remove duplicated rows in a single chained
# expression; `drop_duplicates()` returns the de-duplicated frame, which is
# bound to `df`.
df = pandas.read_csv("data-aulapandas/sport-activity2.csv").drop_duplicates()

# Render the de-duplicated DataFrame as the cell output.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0
