In [42]:
# Importando as bibliotecas necessárias
import os
import sys

import pandas as pd

sys.path.append(os.path.abspath("../../"))
from pipe.preprocess.utils import to_snake_case, clean_whitespace

In [43]:
# Configurar pandas para exibir todas as colunas
pd.set_option('display.max_columns', None)

---

# Carregar data set

In [44]:
df_geocode = pd.read_csv('../../data/raw/df_geocode.csv')
df_geocode.head()

Unnamed: 0.1,Unnamed: 0,Lab Id,Address,Location,Zipcode
0,0,L152,"3800 PLEASANT HILL RD STE 1, DULUTH, GA 30096","34.000319,-84.1629724",30096.0
1,1,L520,"1614 N JAMES ST, ROME, NY 13440","43.2311327,-75.4445363",13440.0
2,2,L141,"12911 120TH AVE NE STE D60, KIRKLAND, WA 98034","47.7162786,-122.1838152",98034.0
3,3,L524,"5667 PEACHTREE DUNWOODY RD 250, ATLANTA, GA 30342","33.9093875,-84.3529096",30342.0
4,4,L545,"1204 IL HWY 164, OQUAWKA, IL 61469","40.9309925,-90.9437598",61469.0


In [45]:
# Deletar a coluna 'Unnamed: 0' (apenas é um índice)
df_geocode.drop(columns=['Unnamed: 0'], inplace=True)

---

# Verificar valores duplicados

In [46]:
# Verificando a quantidade de linhas duplicadas
df_geocode.duplicated().sum()

np.int64(0)

---

# Analisar os tipos de dados

In [47]:
df_geocode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Lab Id    119 non-null    object 
 1   Address   119 non-null    object 
 2   Location  119 non-null    object 
 3   Zipcode   118 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.8+ KB


- `Zipcode` deve ser transformada para `str` por se tratar de uma variável de identificação

In [48]:
# Antes de realiza a transformação, verificar se há valores nulos
df_geocode.isnull().sum()

Lab Id      0
Address     0
Location    0
Zipcode     1
dtype: int64

In [49]:
# Verificando valor nulo
df_geocode[df_geocode['Zipcode'].isnull()]

Unnamed: 0,Lab Id,Address,Location,Zipcode
83,-2,Unavailable,Unavailable,


In [50]:
# Valores não fazem sentido, então deleta-se
df_geocode = df_geocode[~df_geocode['Zipcode'].isnull()]

In [51]:
# Validando
df_geocode.isnull().sum()

Lab Id      0
Address     0
Location    0
Zipcode     0
dtype: int64

In [52]:
# Transformando zipcode para string
df_geocode['Zipcode'] = df_geocode['Zipcode'].astype(str).str.replace('.0', '')

In [53]:
# Verificando novamente o tipo de dados
df_geocode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118 entries, 0 to 118
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Lab Id    118 non-null    object
 1   Address   118 non-null    object
 2   Location  118 non-null    object
 3   Zipcode   118 non-null    object
dtypes: object(4)
memory usage: 4.6+ KB


---

# Organizar os dados

In [54]:
# Separa a latitude e longitude em colunas separadas e transformar para float
df_geocode[['latitude', 'longitude']] = df_geocode['Location'].str.split(',', expand=True)

# Deletando a coluna Location, sem mais necessidade
df_geocode.drop(columns=['Location'], inplace=True)

df_geocode.head()

Unnamed: 0,Lab Id,Address,Zipcode,latitude,longitude
0,L152,"3800 PLEASANT HILL RD STE 1, DULUTH, GA 30096",30096,34.000319,-84.1629724
1,L520,"1614 N JAMES ST, ROME, NY 13440",13440,43.2311327,-75.4445363
2,L141,"12911 120TH AVE NE STE D60, KIRKLAND, WA 98034",98034,47.7162786,-122.1838152
3,L524,"5667 PEACHTREE DUNWOODY RD 250, ATLANTA, GA 30342",30342,33.9093875,-84.3529096
4,L545,"1204 IL HWY 164, OQUAWKA, IL 61469",61469,40.9309925,-90.9437598


In [55]:
# Separar, também, o endereço entre rua, cidade e estado-zipcode 
df_geocode[['street', 'city', 'state_zipcode']] = df_geocode['Address'].str.split(',', expand=True)

# Separrar o estado e o zipcode
df_geocode[['state', '_zipcode']] = df_geocode['state_zipcode'].str.strip().str.split(' ', expand=True)

# Deletar a coluna de _zipcode, Address e state_zipcode (sem mais necessidade)
df_geocode.drop(columns=['Address', 'state_zipcode', '_zipcode'], inplace=True)

df_geocode.head()

Unnamed: 0,Lab Id,Zipcode,latitude,longitude,street,city,state
0,L152,30096,34.000319,-84.1629724,3800 PLEASANT HILL RD STE 1,DULUTH,GA
1,L520,13440,43.2311327,-75.4445363,1614 N JAMES ST,ROME,NY
2,L141,98034,47.7162786,-122.1838152,12911 120TH AVE NE STE D60,KIRKLAND,WA
3,L524,30342,33.9093875,-84.3529096,5667 PEACHTREE DUNWOODY RD 250,ATLANTA,GA
4,L545,61469,40.9309925,-90.9437598,1204 IL HWY 164,OQUAWKA,IL


In [56]:
# Verificando a organização
df_geocode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118 entries, 0 to 118
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Lab Id     118 non-null    object
 1   Zipcode    118 non-null    object
 2   latitude   118 non-null    object
 3   longitude  118 non-null    object
 4   street     118 non-null    object
 5   city       118 non-null    object
 6   state      118 non-null    object
dtypes: object(7)
memory usage: 7.4+ KB


---

# Remover outliers

In [57]:
df_geocode.describe()

Unnamed: 0,Lab Id,Zipcode,latitude,longitude,street,city,state
count,118,118,118.0,118.0,118,118,118
unique,118,105,116.0,116.0,118,79,13
top,L152,98034,37.7754429,-121.9669886,3800 PLEASANT HILL RD STE 1,PHILADELPHIA,WA
freq,1,3,2.0,2.0,1,13,20


Como o data set representa a localização de cada laboratório, não tem muita aplicabilidade a remoção de outliers

---

# Transformações Gerais

In [58]:
# Transformando as colunas em snake_case
to_snake_case(df_geocode)

# Limpando os espaços em branco
clean_whitespace(df_geocode)

In [59]:
df_geocode.head()

Unnamed: 0,lab_id,zipcode,latitude,longitude,street,city,state
0,L152,30096,34.000319,-84.1629724,3800 PLEASANT HILL RD STE 1,DULUTH,GA
1,L520,13440,43.2311327,-75.4445363,1614 N JAMES ST,ROME,NY
2,L141,98034,47.7162786,-122.1838152,12911 120TH AVE NE STE D60,KIRKLAND,WA
3,L524,30342,33.9093875,-84.3529096,5667 PEACHTREE DUNWOODY RD 250,ATLANTA,GA
4,L545,61469,40.9309925,-90.9437598,1204 IL HWY 164,OQUAWKA,IL


---

# Validação Final

In [60]:
df_geocode.describe(include='all')

Unnamed: 0,lab_id,zipcode,latitude,longitude,street,city,state
count,118,118,118.0,118.0,118,118,118
unique,118,105,116.0,116.0,118,79,13
top,L152,98034,37.7754429,-121.9669886,3800 PLEASANT HILL RD STE 1,PHILADELPHIA,WA
freq,1,3,2.0,2.0,1,13,20


---

# Exportar data set limpo

In [61]:
df_geocode.to_csv('../../data/processed/geocode_data_clean.csv', index=False)