In [3]:
import pandas as pd
import requests

# Population
Ejemplo de tratamiento de datos de CENSUS. Se extraen los datos de la API.

In [118]:
# ACS 1-year subject-table endpoint for 2023: requests every variable of group S0101
# for the geographies selected by the `ucgid` pseudo-geography filter
# (the response previewed below holds one row per state-level GEO_ID).
url = r"https://api.census.gov/data/2023/acs/acs1/subject?get=group(S0101)&ucgid=pseudo(0100000US$0400000)"

In [119]:
# Fetch the raw payload. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() reports HTTP errors here instead of letting them
# show up later as a confusing JSON decoding error.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [120]:
# Build a DataFrame straight from the decoded JSON body of the response.
df = pd.DataFrame(data=response.json())

In [110]:
# Quick look at the untreated frame: integer column labels, with the variable codes
# still stored in row 0.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,905,906,907,908,909,910,911,912,913,914
0,GEO_ID,NAME,S0101_C01_001E,S0101_C01_001EA,S0101_C01_001M,S0101_C01_001MA,S0101_C01_002E,S0101_C01_002EA,S0101_C01_002M,S0101_C01_002MA,...,S0101_C06_036MA,S0101_C06_037E,S0101_C06_037EA,S0101_C06_037M,S0101_C06_037MA,S0101_C06_038E,S0101_C06_038EA,S0101_C06_038M,S0101_C06_038MA,ucgid
1,0400000US01,Alabama,5108468,,-555555555,*****,288019,,3288,,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),0400000US01
2,0400000US02,Alaska,733406,,-555555555,*****,45211,,1366,,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),0400000US02
3,0400000US04,Arizona,7431344,,-555555555,*****,391142,,1573,,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),0400000US04
4,0400000US05,Arkansas,3067732,,-555555555,*****,176908,,2503,,...,(X),-888888888,(X),-888888888,(X),-888888888,(X),-888888888,(X),0400000US05


In [90]:
# Same S0101 state-level query, this time against the 2022 ACS 1-year release.
url = r"https://api.census.gov/data/2022/acs/acs1/subject?get=group(S0101)&ucgid=pseudo(0100000US$0400000)"
# Bound the wait and fail loudly on HTTP errors rather than on JSON decoding.
response = requests.get(url, timeout=30)
response.raise_for_status()
df = pd.DataFrame(data=response.json())

In [96]:
# Example of the table treatment: promote the first row (the column codes returned by
# the API) to the header and keep only the data rows.
# The raw `df` is left untouched and the result is bound to a new name, so this cell can
# be re-run safely and `df` can still be passed to `df_treatment` further down.
df_example = df.iloc[1:].copy()
df_example.columns = df.iloc[0].tolist()

## Función de tratamiento
Con base en el proceso anterior, definimos una función.

In [4]:
def df_treatment(df, n_columns=3):
    """Promote the first row of a raw Census API frame to the header.

    The API payload arrives with the column codes stored in the first row. This helper
    returns a new frame whose columns are named after that row, without the header row
    itself, truncated to the leading ``n_columns`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose first row holds the column names. It is not modified.
    n_columns : int, default 3
        Number of leading columns to keep. With the default, the sample output in this
        notebook shows ``GEO_ID``, ``NAME`` and the first variable of the group.

    Returns
    -------
    pandas.DataFrame
        Copy of the data rows, with their original row labels preserved.

    Raises
    ------
    IndexError
        If ``df`` has no rows, since there is no header row to promote.
    """
    header = df.iloc[0, :n_columns].tolist()
    # Select by position so the header row is removed even when the index does not
    # start at label 0 (a label-based drop(0) would fail or remove the wrong row).
    treated = df.iloc[1:, :n_columns].copy()
    treated.columns = header
    return treated

In [125]:
df_copy = df_treatment(df)

In [150]:
df_copy.head()

Unnamed: 0,GEO_ID,NAME,S0101_C01_001E
1,0400000US01,Alabama,5108468
2,0400000US02,Alaska,733406
3,0400000US04,Arizona,7431344
4,0400000US05,Arkansas,3067732
5,0400000US06,California,38965193


# Iteración

In [5]:
import time

In [6]:
# Definimos los años.
# Years to query: 2014 through 2023 (the upper bound of range() is exclusive).
year_range = range(2014, 2024)

In [7]:
merged_df = pd.DataFrame()
iter_count = 0  # number of years successfully added to merged_df

for year in year_range:
    # Same S0101 state-level query, one ACS 1-year release per iteration.
    url = rf"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S0101)&ucgid=pseudo(0100000US$0400000)"

    try:
        # Fetch and treat this year's table. The timeout bounds the wait, and
        # raise_for_status() reports a missing release as an HTTP error instead of an
        # opaque JSON decoding failure.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # A loop-local name is used so the notebook-level raw `df` is not clobbered.
        year_df = df_treatment(pd.DataFrame(data=response.json()))
        value_column = year_df.columns[2]
        year_df = year_df.rename(columns={value_column: 'population ' + str(year)}).drop(columns='GEO_ID')

        # The first successful year seeds the result; later years are joined on the
        # state name.
        if iter_count == 0:
            merged_df = year_df.copy()
        else:
            merged_df = pd.merge(merged_df, year_df, on='NAME')
        iter_count += 1
    except Exception as e:
        # Best effort: a year that cannot be fetched, parsed or merged is reported and
        # skipped so the remaining years are still processed.
        print(e)

    # Progress log: years merged so far, and the year just processed.
    print(iter_count, year)

    # Pause 1 second between requests, including after a failed one.
    time.sleep(1)
    

1 2014
2 2015
3 2016
4 2017
5 2018
6 2019
Expecting value: line 1 column 1 (char 0)
6 2020
7 2021
8 2022
9 2023


# Tratamiento final

In [8]:
# Damos forma al dataset
# Reshape from wide (one 'population <year>' column per year) to long format:
# one row per state and year.
# Sorting on both keys makes the row order deterministic; sorting on 'NAME' alone uses a
# non-stable algorithm, so the order of the years within a state was not guaranteed.
state_population_df = pd.melt(
    merged_df,
    id_vars='NAME',
    value_vars=merged_df.columns[1:],
    var_name='year',
    value_name='population',
).sort_values(by=['NAME', 'year'])

In [9]:
# Convertimos los datos de la columna year.
# Keep only the year from labels such as 'population 2014'.
# Taking the last space-separated token (instead of the token at position 1) makes the
# cell safe to re-run: an already-converted value like '2014' is left unchanged rather
# than raising IndexError.
state_population_df['year'] = state_population_df['year'].str.split(' ').str[-1]

In [10]:
# Discard the current row labels and renumber the rows from 0.
state_population_df = state_population_df.reset_index(drop=True)

In [11]:
# Give the geography column a descriptive name: 'NAME' -> 'state'.
state_population_df = state_population_df.rename(columns={'NAME': 'state'})

In [12]:
# Inspect the final structure; per the recorded output below, all three columns are
# still object dtype at this point.
state_population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   state       468 non-null    object
 1   year        468 non-null    object
 2   population  468 non-null    object
dtypes: object(3)
memory usage: 11.1+ KB


# Carga a BigQuery

In [None]:
from google.cloud import bigquery
import os
# Point the BigQuery client at the service-account key file.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already configured in
# the environment, so the notebook can run on other machines; the local path below is
# only a fallback for the original workstation.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"F:\DataScience\PF - DataNova\gcp key\bamboo-zone-445202-a3-be5b705efba8.json",
)

In [184]:
# No arguments are passed, so the client relies on the ambient configuration
# (presumably the GOOGLE_APPLICATION_CREDENTIALS key set in the previous cell).
client = bigquery.Client()

In [185]:
# Destination table for the load job.
# NOTE(review): looks like '<project>.<dataset>.<table>' — the first segment matches the
# project reported in the recorded job output below.
table_full_id = 'bamboo-zone-445202-a3.Datanova_CoffeeShops.population_per_state'

In [186]:
# Describe the load: overwrite whatever the destination table holds and do not ask
# BigQuery to autodetect a schema.
load_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    autodetect=False,
)

# Submit the DataFrame as a load job targeting the destination table.
job = client.load_table_from_dataframe(state_population_df, table_full_id, job_config=load_config)

# Block until the load job has finished.
job.result()

LoadJob<project=bamboo-zone-445202-a3, location=US, id=841b3a4e-78c2-4222-8d25-57922dee84a4>

# Guardado local

In [None]:
# Also keep a local CSV copy of the long-format table.
output_csv_path = r'F:\DataScience\PF - DataNova\Aux_scripts\output\population.csv'
state_population_df.to_csv(output_csv_path)