In [None]:
# Import the required libraries
import os
import sys

import pandas as pd

# Add the directory two levels above the current working directory to the import
# path so the project-local `scripts` package used below can be resolved
# (presumably the repository root — confirm against the project layout).
sys.path.append(os.path.abspath("../../"))
from scripts.utils import to_snake_case, clean_whitespace

In [41]:
# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None

---

# Carregar data set

In [42]:
# Read the raw ZCTA demographic CSV and preview its first five rows
df_demographic = pd.read_csv(filepath_or_buffer='../../data/raw/DemographicData_ZCTAs.csv')
df_demographic.head(n=5)

Unnamed: 0.1,Unnamed: 0,Id,GeographicAreaName,TotalPopulation,SexRatio(males per 100 females),Population_Under5Years,Population_5to9Years,Population_10to14Years,Population_15to19Years,Population_20to24Years,Population_25to34Years,Population_35to44Years,Population_45to54Years,Population_55to59Years,Population_60to64Years,Population_65to74Years,Population_75to84Years,Population_85YearsAndOver,MedianAgeInYears
0,0,8600000US35004,ZCTA5 35004,12045,94.1,805,1075,898,477,578,2088,1628,1200,886,683,1017,534,176,35.5
1,1,8600000US35005,ZCTA5 35005,7344,86.1,504,453,511,499,214,788,795,968,612,561,798,485,156,44.0
2,2,8600000US35006,ZCTA5 35006,2883,108.2,96,153,303,129,156,183,367,430,296,260,280,201,29,47.2
3,3,8600000US35007,ZCTA5 35007,26332,95.0,1936,1992,1837,1762,1376,3119,3849,3907,1665,1323,2096,1106,364,37.7
4,4,8600000US35010,ZCTA5 35010,20613,90.5,1306,1465,944,1217,1128,2513,2106,2950,1512,1472,2421,1155,424,42.6


In [43]:
# Drop the 'Unnamed: 0' column: it only repeats the row index that was saved into the CSV.
# The name is rebound (no inplace=True) and errors='ignore' is passed so this cell can be
# re-run after the column is already gone without raising a KeyError.
df_demographic = df_demographic.drop(columns=['Unnamed: 0'], errors='ignore')

---

# Verificar valores duplicados

In [44]:
# Count the rows that exactly repeat an earlier row (every column is compared)
df_demographic.duplicated(subset=None, keep='first').sum()

np.int64(0)

---

# Analisar os tipos de dados

In [45]:
# Print each column's dtype and non-null count to review the inferred types
df_demographic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 0 to 33119
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Id                               33120 non-null  object 
 1   GeographicAreaName               33120 non-null  object 
 2   TotalPopulation                  33120 non-null  int64  
 3   SexRatio(males per 100 females)  32620 non-null  float64
 4   Population_Under5Years           33120 non-null  int64  
 5   Population_5to9Years             33120 non-null  int64  
 6   Population_10to14Years           33120 non-null  int64  
 7   Population_15to19Years           33120 non-null  int64  
 8   Population_20to24Years           33120 non-null  int64  
 9   Population_25to34Years           33120 non-null  int64  
 10  Population_35to44Years           33120 non-null  int64  
 11  Population_45to54Years           33120 non-null  int64  
 12  Population_55to59Y

Todas as colunas aparentam ter a tipagem correta. No entanto, pensando na criação da pipeline, pode-se forçar a tipagem numérica das colunas cujo nome começa com `Population`.

In [46]:
# Age-bracket columns are the ones whose name starts with 'Population'
# ('TotalPopulation' does not match the prefix and is left out).
num_cols = [name for name in df_demographic.columns if name.startswith('Population')]

# Force a numeric dtype on each bracket column; values that cannot be parsed become NaN
df_demographic[num_cols] = df_demographic[num_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [47]:
# Check the column dtypes again after the numeric coercion
df_demographic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 0 to 33119
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Id                               33120 non-null  object 
 1   GeographicAreaName               33120 non-null  object 
 2   TotalPopulation                  33120 non-null  int64  
 3   SexRatio(males per 100 females)  32620 non-null  float64
 4   Population_Under5Years           33120 non-null  int64  
 5   Population_5to9Years             33120 non-null  int64  
 6   Population_10to14Years           33120 non-null  int64  
 7   Population_15to19Years           33120 non-null  int64  
 8   Population_20to24Years           33120 non-null  int64  
 9   Population_25to34Years           33120 non-null  int64  
 10  Population_35to44Years           33120 non-null  int64  
 11  Population_45to54Years           33120 non-null  int64  
 12  Population_55to59Y

---

# Verificar valores nulos e ausentes

In [48]:
# Count the null values in each column
df_demographic.isnull().sum()

Id                                   0
GeographicAreaName                   0
TotalPopulation                      0
SexRatio(males per 100 females)    500
Population_Under5Years               0
Population_5to9Years                 0
Population_10to14Years               0
Population_15to19Years               0
Population_20to24Years               0
Population_25to34Years               0
Population_35to44Years               0
Population_45to54Years               0
Population_55to59Years               0
Population_60to64Years               0
Population_65to74Years               0
Population_75to84Years               0
Population_85YearsAndOver            0
MedianAgeInYears                   569
dtype: int64

In [49]:
# Count the missing values in each column.
# NOTE(review): isna() is an alias of isnull(), so this repeats the previous check
# (the two outputs are identical).
df_demographic.isna().sum()

Id                                   0
GeographicAreaName                   0
TotalPopulation                      0
SexRatio(males per 100 females)    500
Population_Under5Years               0
Population_5to9Years                 0
Population_10to14Years               0
Population_15to19Years               0
Population_20to24Years               0
Population_25to34Years               0
Population_35to44Years               0
Population_45to54Years               0
Population_55to59Years               0
Population_60to64Years               0
Population_65to74Years               0
Population_75to84Years               0
Population_85YearsAndOver            0
MedianAgeInYears                   569
dtype: int64

In [50]:
# Inspect the rows whose 'SexRatio(males per 100 females)' value is missing
df_demographic.loc[df_demographic['SexRatio(males per 100 females)'].isna()]

Unnamed: 0,Id,GeographicAreaName,TotalPopulation,SexRatio(males per 100 females),Population_Under5Years,Population_5to9Years,Population_10to14Years,Population_15to19Years,Population_20to24Years,Population_25to34Years,Population_35to44Years,Population_45to54Years,Population_55to59Years,Population_60to64Years,Population_65to74Years,Population_75to84Years,Population_85YearsAndOver,MedianAgeInYears
5,8600000US35013,ZCTA5 35013,46,,0,0,0,0,0,0,15,31,0,0,0,0,0,47.9
47,8600000US35082,ZCTA5 35082,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
142,8600000US35457,ZCTA5 35457,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
403,8600000US36267,ZCTA5 36267,7,,0,0,0,0,0,0,0,0,7,0,0,0,0,
539,8600000US36590,ZCTA5 36590,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32920,8600000US82715,ZCTA5 82715,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
32955,8600000US82936,ZCTA5 82936,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
32981,8600000US83121,ZCTA5 83121,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,
33099,8600000US00950,ZCTA5 00950,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,


Como o dado aparenta não ser relevante, pode-se deletá-lo.

In [51]:
# Remove every row whose 'SexRatio(males per 100 females)' value is missing.
# NOTE(review): this discards whole ZCTAs, and the preview of these rows includes
# some with a non-zero TotalPopulation — confirm that dropping them is intended.
df_demographic.dropna(subset=['SexRatio(males per 100 females)'], inplace=True)

In [52]:
# Re-check: no row should remain with a missing 'SexRatio(males per 100 females)'
df_demographic.loc[df_demographic['SexRatio(males per 100 females)'].isna()]

Unnamed: 0,Id,GeographicAreaName,TotalPopulation,SexRatio(males per 100 females),Population_Under5Years,Population_5to9Years,Population_10to14Years,Population_15to19Years,Population_20to24Years,Population_25to34Years,Population_35to44Years,Population_45to54Years,Population_55to59Years,Population_60to64Years,Population_65to74Years,Population_75to84Years,Population_85YearsAndOver,MedianAgeInYears


---

# Organizar os dados

Como a coluna `GeographicAreaName` contém o valor de zipcode, é realizada a extração desse valor.

In [53]:
# Take the first run of five digits from the area name (e.g. 'ZCTA5 35004' -> '35004').
# expand=False returns a Series, so the result is assigned directly as one column;
# the extracted values stay strings.
df_demographic['zipcode'] = (
    df_demographic['GeographicAreaName']
    .str.extract(r'(\d{5})', expand=False)
)

In [54]:
# 'GeographicAreaName' is no longer needed once the zipcode has been extracted, so remove it
df_demographic = df_demographic.drop(columns=['GeographicAreaName'])

In [55]:
# Reshape from wide to long so the many 'Population_*' bracket columns collapse into two:
# 'AgeGroup' holds the former column name and 'group_population' holds its value.
# The id_vars columns are repeated on every resulting row.
df_demographic = df_demographic.melt(
    id_vars=['Id', 'zipcode', 'TotalPopulation', 'SexRatio(males per 100 females)', 'MedianAgeInYears'],
    value_vars=num_cols,
    var_name='AgeGroup',
    value_name='group_population',
)

In [56]:
# Inspect the last rows of the reshaped (long-format) frame
df_demographic.tail()

Unnamed: 0,Id,zipcode,TotalPopulation,SexRatio(males per 100 females),MedianAgeInYears,AgeGroup,group_population
424055,8600000US00979,979,16130,91.6,48.7,Population_85YearsAndOver,546
424056,8600000US00982,982,14346,81.0,44.8,Population_85YearsAndOver,595
424057,8600000US00983,983,33947,81.4,40.8,Population_85YearsAndOver,1010
424058,8600000US00985,985,31682,82.4,43.6,Population_85YearsAndOver,901
424059,8600000US00987,987,54241,87.3,41.2,Population_85YearsAndOver,1102


In [57]:
# Make the age-group labels easier to read by applying literal (non-regex) replacements
# in sequence. Examples traced through the chain:
#   'Population_5to9Years'      -> '5 - 9 Years'
#   'Population_Under5Years'    -> '< 5 Years'
#   'Population_85YearsAndOver' -> '85 Years >='
df_demographic['AgeGroup'] = (
    df_demographic['AgeGroup']
    .str.replace('Population_', '', regex=False)  # strip the shared prefix
    .str.replace('to', ' - ', regex=False)        # range separator; replaces every literal 'to'
    .str.replace('Under', '< ', regex=False)
    .str.replace('AndOver', ' >=', regex=False)   # the operator ends up after 'Years'
    .str.replace('Years', ' Years', regex=False)  # put a space before the unit
)

# Transformações Gerais

In [58]:
# Convert the column names to snake_case.
# NOTE(review): the return value is not assigned, so this presumably mutates the
# frame in place — confirm against scripts.utils.
to_snake_case(df_demographic)

# Clean up whitespace in the frame (same in-place assumption as above — TODO confirm)
clean_whitespace(df_demographic)

---

# Validação Final

In [59]:
# Final sanity check: summary statistics for every column, numeric and non-numeric alike
df_demographic.describe(include='all')

Unnamed: 0,id,zipcode,total_population,sex_ratiomales_per_100_females,median_age_in_years,age_group,group_population
count,424060,424060.0,424060.0,424060.0,422539.0,424060,424060.0
unique,32620,32620.0,,,,13,
top,8600000US00987,987.0,,,,< 5 Years,
freq,13,13.0,,,,32620,
mean,,,10054.024372,116.842275,42.701018,,773.38649
std,,,14775.293911,1483.139397,9.091367,,1365.445008
min,,,3.0,0.5,1.9,,0.0
25%,,,751.0,91.3,37.1,,45.0
50%,,,2924.0,98.4,42.0,,199.0
75%,,,13773.0,107.9,47.6,,895.0


---

# Exportar data set limpo

In [61]:
# Write the cleaned long-format data set to the interim data folder.
# The target directory is created first so the export also works on a fresh checkout
# where 'data/interim' does not exist yet.
# NOTE(review): 'zipcode' holds strings; readers of this CSV presumably need to load it
# with a string dtype to keep any leading zeros — confirm in the consuming notebooks.
output_path = '../../data/interim/demographic_data_clean.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_demographic.to_csv(output_path, index=False)