In [None]:
# Import the required libraries
import os
import sys

import pandas as pd

# Put the directory two levels above the working directory on sys.path before the
# project import below (presumably the repository root that holds `scripts/` — TODO confirm)
sys.path.append(os.path.abspath("../../"))
from scripts.utils import to_snake_case, clean_whitespace

In [2]:
# Make pandas display every column instead of truncating wide frames
pd.options.display.max_columns = None

---

# Carregar data set

In [None]:
# First attempt: read the raw file with pandas' default separator.
# The preview shows every field collapsed into a single column.
raw_csv_path = '../../data/raw/transactional_data.csv'
df_transactional = pd.read_csv(raw_csv_path)
df_transactional.head()

Unnamed: 0,Patient Id;Gender;Date of birth;Date of service;Service Id;Lab Id;CodItem;Testing Cost
10210830256-BIO003;F;01/08/1976 00:00:00;2019-01-07;571904533475-38;L133;70003237;9,0
10210830256-BIO003;F;01/08/1976 00:00:00;2019-01-07;571904533475-38;L133;70000638;13,0
10210830256-BIO003;F;01/08/1976 00:00:00;2019-01-07;571904533475-38;L133;70001597;49,0
10210830256-BIO003;F;01/08/1976 00:00:00;2019-01-07;571904533475-38;L133;70000103;11,0
10210830256-BIO003;F;01/08/1976 00:00:00;2019-01-07;571904533475-38;L133;70000224;10,0


In [None]:
# The CSV uses ';' as its field separator, so it must be passed explicitly when reading the file.
# 'Testing Cost' uses a decimal comma (e.g. '9,0'), so it is loaded as text here and converted later.
df_transactional = pd.read_csv('../../data/raw/transactional_data.csv', sep=';')
df_transactional.head()

Unnamed: 0,Patient Id,Gender,Date of birth,Date of service,Service Id,Lab Id,CodItem,Testing Cost
0,10210830256-BIO003,F,01/08/1976 00:00:00,2019-01-07,571904533475-38,L133,70003237,90
1,10210830256-BIO003,F,01/08/1976 00:00:00,2019-01-07,571904533475-38,L133,70000638,130
2,10210830256-BIO003,F,01/08/1976 00:00:00,2019-01-07,571904533475-38,L133,70001597,490
3,10210830256-BIO003,F,01/08/1976 00:00:00,2019-01-07,571904533475-38,L133,70000103,110
4,10210830256-BIO003,F,01/08/1976 00:00:00,2019-01-07,571904533475-38,L133,70000224,100


---

# Verificar valores duplicados

In [5]:
# Count the rows flagged as duplicates (duplicated() compares all columns by default)
df_transactional.duplicated().sum()

np.int64(14519)

Como o método utilizado para verificar a quantidade de linhas duplicadas foi o `duplicated()`, infere-se que, nessas linhas, os valores de todas as colunas são iguais. Portanto, pode-se deletar as linhas duplicadas.

In [6]:
# Remove the duplicated rows, keeping the first occurrence of each.
# Rebinding the name (instead of inplace=True) keeps the cell a plain assignment;
# the original index labels are preserved, exactly as before.
df_transactional = df_transactional.drop_duplicates()

In [7]:
# Count duplicated rows again to confirm that none remain
df_transactional.duplicated().sum()

np.int64(0)

---

# Analisar os tipos de dados

In [8]:
# Inspect the index, column dtypes and memory usage of the frame
df_transactional.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2340722 entries, 0 to 2355240
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Patient Id       object
 1   Gender           object
 2   Date of birth    object
 3   Date of service  object
 4   Service Id       object
 5   Lab Id           object
 6   CodItem          int64 
 7   Testing Cost     object
dtypes: int64(1), object(7)
memory usage: 160.7+ MB


- `Date of birth` e `Date of service` devem ser transformados para `datetime`
- `Testing Cost` deve ser transformado para `float`

In [9]:
# Apply the required type conversions
# 'Date of birth' is parsed with the explicit day/month/year + time format
df_transactional['Date of birth'] = pd.to_datetime(df_transactional['Date of birth'], format='%d/%m/%Y %H:%M:%S')
# 'Date of service' is parsed with the explicit year-month-day format
df_transactional['Date of service'] = pd.to_datetime(df_transactional['Date of service'], format='%Y-%m-%d')
# 'Testing Cost' uses a decimal comma (e.g. '9,0'): swap it for a dot, then cast to float.
# regex=False forces a literal replacement regardless of the installed pandas version's default.
df_transactional['Testing Cost'] = df_transactional['Testing Cost'].str.replace(',', '.', regex=False).astype(float)

In [10]:
# Check the column dtypes again after the conversions
df_transactional.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2340722 entries, 0 to 2355240
Data columns (total 8 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Patient Id       object        
 1   Gender           object        
 2   Date of birth    datetime64[ns]
 3   Date of service  datetime64[ns]
 4   Service Id       object        
 5   Lab Id           object        
 6   CodItem          int64         
 7   Testing Cost     float64       
dtypes: datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 160.7+ MB


---

# Verificar valores nulos e ausentes

In [11]:
# Count the null values in each column
df_transactional.isnull().sum()

Patient Id         0
Gender             0
Date of birth      1
Date of service    0
Service Id         0
Lab Id             0
CodItem            0
Testing Cost       0
dtype: int64

In [12]:
# Count the missing values in each column.
# NOTE(review): pandas documents isnull() as an alias of isna(), so this repeats the
# previous check (the two outputs are identical) — consider keeping only one of them.
df_transactional.isna().sum()

Patient Id         0
Gender             0
Date of birth      1
Date of service    0
Service Id         0
Lab Id             0
CodItem            0
Testing Cost       0
dtype: int64

In [13]:
# Display the row(s) whose date of birth is missing
df_transactional.loc[df_transactional['Date of birth'].isna()]

Unnamed: 0,Patient Id,Gender,Date of birth,Date of service,Service Id,Lab Id,CodItem,Testing Cost
1253947,7659985004-1,F,NaT,2020-02-22,7659985004-2-1,L152,70000107,48.0


Como o dado aparenta ser relevante, pode-se verificar se o paciente já realizou outro exame em que a data de nascimento esteja preenchida

In [14]:
# Look for any other exam record belonging to the same patient
df_transactional.loc[df_transactional['Patient Id'].eq('7659985004-1')]

Unnamed: 0,Patient Id,Gender,Date of birth,Date of service,Service Id,Lab Id,CodItem,Testing Cost
1253947,7659985004-1,F,NaT,2020-02-22,7659985004-2-1,L152,70000107,48.0


In [15]:
# The patient has no other record, so the missing date of birth is filled with the
# median of the 'Date of birth' column.
# NOTE(review): the median is computed over all rows (one per exam line), not over
# distinct patients — confirm this is the intended reference value.
median_birth_date = df_transactional['Date of birth'].median()
df_transactional['Date of birth'] = df_transactional['Date of birth'].fillna(median_birth_date)

In [16]:
# Confirm the patient's record now carries a date of birth
df_transactional.loc[df_transactional['Patient Id'].eq('7659985004-1')]

Unnamed: 0,Patient Id,Gender,Date of birth,Date of service,Service Id,Lab Id,CodItem,Testing Cost
1253947,7659985004-1,F,1977-08-11,2020-02-22,7659985004-2-1,L152,70000107,48.0


---

# Transformações Gerais

In [17]:
# Convert the column names to snake_case (helper imported from scripts.utils).
# The return value is not assigned, so the helper presumably modifies the frame in place — TODO confirm.
# NOTE(review): the describe() output below shows names such as `patient__id` with a
# double underscore — verify how the helper handles names that contain spaces.
to_snake_case(df_transactional)

# Clean up whitespace (helper imported from scripts.utils; presumably strips the
# string values in place — TODO confirm against its implementation)
clean_whitespace(df_transactional)

---

# Validação Final

In [18]:
# Final sanity check: summary statistics for every column, not only the numeric ones
df_transactional.describe(include='all')

Unnamed: 0,patient__id,gender,date_of_birth,date_of_service,service__id,lab__id,cod_item,testing__cost
count,2340722,2340722,2340722,2340722,2340722,2340722,2340722.0,2340722.0
unique,501446,3,,,660059,118,,
top,9999999-1,F,,,6004395771-8-1,L133,,
freq,1409,1491030,,,87,121886,,
mean,,,1978-01-24 18:56:22.388681664,2020-01-12 14:34:25.793375744,,,70003220.0,51.10238
min,,,1859-12-28 00:00:00,2019-01-01 00:00:00,,,70000040.0,0.0
25%,,,1965-04-06 00:00:00,2019-07-11 00:00:00,,,70001130.0,11.0
50%,,,1977-08-11 00:00:00,2020-01-15 00:00:00,,,70003760.0,24.0
75%,,,1989-08-05 00:00:00,2020-07-14 00:00:00,,,70004460.0,49.0
max,,,2020-02-08 00:00:00,2021-02-12 00:00:00,,,70009000.0,9500.0


---

# Exportar data set limpo

In [19]:
# Write the cleaned frame to the interim data folder, leaving out the index column
clean_csv_path = '../../data/interim/transactional_data_clean.csv'
df_transactional.to_csv(clean_csv_path, index=False)