In [None]:
# Import the required libraries
import os
import sys

import pandas as pd

# Add the directory two levels above the current working directory to the
# import path; presumably this is the project root that holds the `scripts`
# package imported below.
# NOTE(review): abspath resolves "../../" against the current working
# directory, so this assumes the kernel runs from the notebook's own
# folder — confirm.
sys.path.append(os.path.abspath("../../"))
from scripts.utils import to_snake_case, clean_whitespace

In [2]:
# Show every column when a DataFrame is displayed (None removes the limit)
pd.options.display.max_columns = None

---

# Carregar data set

In [3]:
# Read the raw exams table and preview its first rows
raw_exams_path = '../../data/raw/exams_data.csv'
df_exams = pd.read_csv(filepath_or_buffer=raw_exams_path)
df_exams.head()

Unnamed: 0,CodItem,Desc Item,Category,Family,Speciality,Testing Cost
0,70003237,"MAGNESIO, SORO (EXEMPLO DE EXPRESSAO DE RESULT...",CA,CORELAB,BIOCHEMISTRY,1.78
1,70000638,"HEMOGRAMA, AUTOMATIZADO, SANGUE",CA,HEMATO,BLOOD COUNT,2.46
2,70001597,"FERRITINA, SORO",CA,CORELAB,IMMUNOHORMONE,2.11
3,70000103,"FERRO, DOSAGEM, SORO (EXEMPLO DE EXPRESSAO DE ...",CA,CORELAB,BIOCHEMISTRY,0.8
4,70000224,"CALCIO, DOSAGEM, SORO (EXEMPLO DE EXPRESSAO DE...",CA,CORELAB,BIOCHEMISTRY,1.02


---

# Verificar valores duplicados

In [4]:
# Count rows that are exact copies of an earlier row (every column equal)
n_duplicated_rows = df_exams.duplicated().sum()

# A whole-row comparison misses an identifier that is repeated on rows which
# differ in some other column. The final describe() output of this notebook
# reports 2001 unique identifiers for 2002 rows, so list every row whose
# CodItem occurs more than once and let the reviewer decide which to keep.
repeated_ids = df_exams[df_exams.duplicated(subset='CodItem', keep=False)]
display(repeated_ids.sort_values('CodItem'))

n_duplicated_rows

np.int64(0)

---

# Analisar os tipos de dados

In [5]:
# Print each column's name, non-null count and dtype to review the inferred types
df_exams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002 entries, 0 to 2001
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CodItem       2002 non-null   int64  
 1   Desc Item     2002 non-null   object 
 2   Category      2002 non-null   object 
 3   Family        2002 non-null   object 
 4   Speciality    2002 non-null   object 
 5   Testing Cost  2002 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 94.0+ KB


- Por se tratar de um identificador, o ideal é que a coluna `CodItem` seja uma string.

In [6]:
# Identifiers are labels, not quantities, so store CodItem as text
df_exams = df_exams.astype({'CodItem': str})

---

# Verificar valores nulos e ausentes

In [7]:
# Count the null (NaN/None) entries in each column
df_exams.isnull().sum()

CodItem         0
Desc Item       0
Category        0
Family          0
Speciality      0
Testing Cost    0
dtype: int64

In [8]:
# `isna` is an alias of `isnull`, so calling it here would only repeat the
# null count from the previous cell. Instead, count the missing values a null
# check cannot see: cells that are empty or contain only whitespace.
# Every column is viewed as text for the comparison, so numeric columns
# simply report 0.
df_exams.apply(lambda column: column.astype(str).str.strip().eq('').sum())

CodItem         0
Desc Item       0
Category        0
Family          0
Speciality      0
Testing Cost    0
dtype: int64

---

# Transformações Gerais

In [9]:
# Convert the column names to snake_case
# NOTE(review): the describe() output further down shows `desc__item` and
# `testing__cost` with a double underscore — presumably the helper emits one
# underscore for the space and another for the following capital letter;
# confirm in scripts.utils before other notebooks depend on these names.
to_snake_case(df_exams)

# Clean up whitespace
# NOTE(review): neither helper's return value is assigned, so both appear to
# modify df_exams in place; what exactly is trimmed is not visible from this
# notebook — confirm.
clean_whitespace(df_exams)

---

# Validação Final

In [10]:
# Final check: summary statistics for every column, numeric and non-numeric
df_exams.describe(include='all')

Unnamed: 0,cod_item,desc__item,category,family,speciality,testing__cost
count,2002.0,2002,2002,2002,2002,2002.0
unique,2001.0,1997,7,23,77,
top,70004701.0,"PLAQUETAS, ANTICORPOS IGG E IGM, SORO",CA,CORELAB,IMMUNOHORMONE,
freq,2.0,2,1578,990,472,
mean,,,,,,77.767453
std,,,,,,146.066382
min,,,,,,0.0
25%,,,,,,4.8125
50%,,,,,,21.19
75%,,,,,,79.27


---

# Exportar data set limpo

In [11]:
# Write the cleaned table to the interim data folder, leaving out the index
clean_exams_path = '../../data/interim/exams_data_clean.csv'

# to_csv does not create missing parent folders, so make sure the target
# directory exists (e.g. on a fresh clone where data/interim is absent).
os.makedirs(os.path.dirname(clean_exams_path), exist_ok=True)

df_exams.to_csv(clean_exams_path, index=False)