In [31]:
# Import the required libraries
import os
import sys

import pandas as pd

# Add the directory two levels up to the import path so the `pipe` package
# can be imported. "../../" is resolved against the current working directory.
# NOTE(review): presumably the notebook is always run from its own folder — confirm.
sys.path.append(os.path.abspath("../../"))
from pipe.preprocess.utils import to_snake_case, clean_whitespace

In [32]:
# Show every column when a DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

---

# Carregar data set

In [33]:
# Read the raw economic data CSV and preview its first rows.
raw_csv_path = '../../data/raw/EconomicData_ZCTAs.csv'
df_economic = pd.read_csv(raw_csv_path)
df_economic.head()

Unnamed: 0.1,Unnamed: 0,id,Geographic Area Name,TotalHouseholds_LessThan$10.000,TotalHouseholds_$10.000to$14.999,TotalHouseholds_$15.000to$24.999,TotalHouseholds_$25.000to$34.999,TotalHouseholds_$35.000to$49.999,TotalHouseholds_$50.000to$74.999,TotalHouseholds_$75.000to$99.999,TotalHouseholds_$100.000to$149.999,TotalHouseholds_$150.000to$199.999,TotalHouseholds_$200.000OrMore
0,0,8600000US35004,ZCTA5 35004,198,71,298,513,647,1117,529,945,245,61
1,1,8600000US35005,ZCTA5 35005,188,184,318,293,353,562,299,407,67,26
2,2,8600000US35006,ZCTA5 35006,71,20,117,104,154,176,124,194,51,7
3,3,8600000US35007,ZCTA5 35007,396,208,670,462,1173,1854,1578,2224,473,254
4,4,8600000US35010,ZCTA5 35010,700,610,1093,957,1056,1512,807,749,254,249


In [34]:
# Drop 'Unnamed: 0', which is only a row index carried over from the CSV.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so that
# re-running this cell after the column is already gone does not raise KeyError.
df_economic = df_economic.drop(columns=['Unnamed: 0'], errors='ignore')

---

# Verificar valores duplicados

In [35]:
# Count the rows that exactly repeat an earlier row.
duplicate_mask = df_economic.duplicated()
duplicate_mask.sum()

np.int64(596160)

In [36]:
# Keep only the first occurrence of each repeated row.
df_economic = df_economic.drop_duplicates()

In [37]:
# Re-count exact duplicate rows; the expected result after de-duplication is 0.
remaining_duplicates = df_economic.duplicated()
remaining_duplicates.sum()

np.int64(0)

---

# Analisar os tipos de dados

In [38]:
# Print the index range, column names, non-null counts and dtypes of the frame.
df_economic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33120 entries, 0 to 33119
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   id                                  33120 non-null  object
 1   Geographic Area Name                33120 non-null  object
 2   TotalHouseholds_LessThan$10.000     33120 non-null  int64 
 3   TotalHouseholds_$10.000to$14.999    33120 non-null  int64 
 4   TotalHouseholds_$15.000to$24.999    33120 non-null  int64 
 5   TotalHouseholds_$25.000to$34.999    33120 non-null  int64 
 6   TotalHouseholds_$35.000to$49.999    33120 non-null  int64 
 7   TotalHouseholds_$50.000to$74.999    33120 non-null  int64 
 8   TotalHouseholds_$75.000to$99.999    33120 non-null  int64 
 9   TotalHouseholds_$100.000to$149.999  33120 non-null  int64 
 10  TotalHouseholds_$150.000to$199.999  33120 non-null  int64 
 11  TotalHouseholds_$200.000OrMore      33120 non-null  int64 


Todas as colunas aparentam ter a tipagem correta. No entanto, pensando na criação da pipeline, pode-se forçar a tipagem numérica das colunas cujo nome começa com `TotalHouseholds`.

In [39]:
# Collect the household-count columns by their shared name prefix.
num_cols = list(filter(lambda name: name.startswith('TotalHouseholds'), df_economic.columns))

# Force each of them to a numeric dtype; values that cannot be parsed become NaN.
for count_col in num_cols:
    df_economic[count_col] = pd.to_numeric(df_economic[count_col], errors='coerce')

---

# Remover outliers

Os valores em `TotalHouseholds` representam uma contagem real dos domicílios em cada região: valores altos ou baixos refletem as características demográficas do zipcode, e não erros ou ruídos que distorçam uma análise estatística. Por isso, nenhum outlier é removido.

---

# Organizar os dados

Como a coluna `Geographic Area Name` contém o zipcode, é realizada a sua extração para uma coluna própria.

In [40]:
# Take the first run of five consecutive digits in 'Geographic Area Name' as the zipcode.
area_names = df_economic['Geographic Area Name']
df_economic['zipcode'] = area_names.str.extract(r'(\d{5})', expand=False)

# The source column is no longer needed once the zipcode has been extracted.
df_economic = df_economic.drop(columns='Geographic Area Name')

df_economic.head()

Unnamed: 0,id,TotalHouseholds_LessThan$10.000,TotalHouseholds_$10.000to$14.999,TotalHouseholds_$15.000to$24.999,TotalHouseholds_$25.000to$34.999,TotalHouseholds_$35.000to$49.999,TotalHouseholds_$50.000to$74.999,TotalHouseholds_$75.000to$99.999,TotalHouseholds_$100.000to$149.999,TotalHouseholds_$150.000to$199.999,TotalHouseholds_$200.000OrMore,zipcode
0,8600000US35004,198,71,298,513,647,1117,529,945,245,61,35004
1,8600000US35005,188,184,318,293,353,562,299,407,67,26,35005
2,8600000US35006,71,20,117,104,154,176,124,194,51,7,35006
3,8600000US35007,396,208,670,462,1173,1854,1578,2224,473,254,35007
4,8600000US35010,700,610,1093,957,1056,1512,807,749,254,249,35010


In [41]:
# Reshape from wide to long: the columns listed in num_cols collapse into a
# 'household_range' label column plus a 'TotalHouseholds' value column, which
# reduces the number of columns and simplifies the analysis.
df_economic = df_economic.melt(
    id_vars=['id', 'zipcode'],
    value_vars=num_cols,
    var_name='household_range',
    value_name='TotalHouseholds',
)

In [42]:
# Make the range labels easier to read by applying literal (non-regex)
# substitutions, in this order.
# NOTE(review): 'OrMore' becomes a trailing ' >=' (e.g. '$200.000 >='), while
# 'LessThan' becomes a leading '< ' — confirm the trailing operator is intended.
label_rewrites = [
    ('TotalHouseholds_', ''),
    ('to', ' - '),
    ('LessThan', '< '),
    ('OrMore', ' >='),
]

range_labels = df_economic['household_range']
for old_text, new_text in label_rewrites:
    range_labels = range_labels.str.replace(old_text, new_text, regex=False)

df_economic['household_range'] = range_labels

In [43]:
# Check the result: preview the first rows of the frame.
df_economic.head()

Unnamed: 0,id,zipcode,household_range,TotalHouseholds
0,8600000US35004,35004,< $10.000,198
1,8600000US35005,35005,< $10.000,188
2,8600000US35006,35006,< $10.000,71
3,8600000US35007,35007,< $10.000,396
4,8600000US35010,35010,< $10.000,700


# Verificar valores nulos e ausentes

In [44]:
# Count the null values in each column.
df_economic.isnull().sum()

id                 0
zipcode            0
household_range    0
TotalHouseholds    0
dtype: int64

In [45]:
# Count blank values (empty or whitespace-only text) in each column.
# isna() is an alias of isnull(), so it cannot reveal anything beyond a null
# count; blank strings are the "missing" values a null count does not see.
# Returns one integer per column, indexed by column name.
df_economic.apply(lambda column: column.astype(str).str.strip().eq('').sum())

id                 0
zipcode            0
household_range    0
TotalHouseholds    0
dtype: int64

---

# Transformações Gerais

In [46]:
# Convert the column names to snake_case.
# NOTE(review): the return value is discarded, so this helper presumably
# modifies df_economic in place — confirm in pipe.preprocess.utils.
to_snake_case(df_economic)

# Clean up whitespace.
# NOTE(review): the return value is discarded here as well; presumably in place,
# and which columns/values are affected is not visible from this notebook — confirm.
clean_whitespace(df_economic)

In [47]:
# Preview the first rows after the helper calls.
df_economic.head()

Unnamed: 0,id,zipcode,household_range,total_households
0,8600000US35004,35004,< $10.000,198
1,8600000US35005,35005,< $10.000,188
2,8600000US35006,35006,< $10.000,71
3,8600000US35007,35007,< $10.000,396
4,8600000US35010,35010,< $10.000,700


---

# Validação Final

In [48]:
# Final sanity check: summary statistics for every column, numeric or not.
df_economic.describe(include='all')

Unnamed: 0,id,zipcode,household_range,total_households
count,331200,331200.0,331200,331200.0
unique,33120,33120.0,10,
top,8600000US00906,906.0,< $10.000,
freq,10,10.0,33120,
mean,,,,368.194876
std,,,,654.321233
min,,,,0.0
25%,,,,17.0
50%,,,,87.0
75%,,,,402.0


---

# Exportar data set limpo

In [49]:
# Write the cleaned table to the processed-data folder, without the index.
# The folder is created first because to_csv does not create missing parent
# directories and would fail on a fresh checkout without data/processed.
output_path = '../../data/processed/economic_data_clean.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_economic.to_csv(output_path, index=False)