# main.ipynb

## modules

### common

In [1]:
# Trabaja con ficheros y carpetas
import os
# Ofrece operaciones de alto nivel en archivos
import shutil
# Trabaja con datos de API .json
import json
# Implementa identificadores únicos de recursos
from uuid import uuid4
# Permite crear carpetas y rutas
from pathlib import Path

### requirements

In [2]:
# Performs REST (HTTP) requests against APIs
import requests
# Facilita la manipulación de DateTime
import pendulum
# Manejo y análisis de estructuras de datos
import pandas as pd
# Generates command-line interfaces (CLIs) from Python objects
import fire

## 1. pendulum

In [3]:
# > variables
# Region/city string handed to pendulum.now() to pick the zone whose date is wanted
region_city: str = 'America/Lima'

In [4]:
# Ask pendulum for the current date in the configured region, as a date string.
# `rc` is a short alias of the region; it is printed next to the date below.
rc = region_city
c_date = pendulum.now(rc).to_date_string()

print(c_date, '---', rc)

2022-06-04 --- America/Lima


## 2. requests

In [5]:
# > variables
# Request headers sent with every call
headers = {'Content-Type': 'application/json'}
# Code of the series to download
serie = 'PM04902AA'
# BCRP statistics API endpoint for that series, JSON format, 2020 through 2022
api_url = f'https://estadisticas.bcrp.gob.pe/estadisticas/series/api/{serie}/json/2020/2022/'

In [6]:
# > request + try-except
try:
    # GET the series with a 60-second timeout
    response = requests.get(url=api_url, headers=headers, timeout=60)
    # Fail fast on 4xx/5xx answers instead of trying to parse an error body
    response.raise_for_status()
    # Force the body to be decoded as UTF-8
    response.encoding = 'utf-8'
    # Parse the JSON text into Python objects with json.loads
    api_data = json.loads(response.text)
except Exception:
    print('[INFO] something went wrong...')
    # Re-raise so execution stops here instead of continuing without api_data
    raise

In [7]:
# Pretty-print the parsed API payload with a 2-space indent
print(json.dumps(api_data, indent=2))

{
  "config": {
    "title": "Ingreso nacional disponible (millones S/ 2007)",
    "series": [
      {
        "name": "Ingreso nacional disponible (millones S/ 2007) - Renta de Factores",
        "dec": "0"
      }
    ]
  },
  "periods": [
    {
      "name": "2020",
      "values": [
        "-14748.8775165815"
      ]
    },
    {
      "name": "2021",
      "values": [
        "-46048.2056891399"
      ]
    }
  ]
}


## 3. pandas: dataframes

In [8]:
# > variables
# Key of the API payload that holds the list of per-period records
record_path = 'periods'
# Mapping from the JSON field names to the column names used from here on
columnas = dict(name='YEAR', values=serie.upper())

In [9]:
# > pandas from json
# Flatten the records found under `record_path` into a DataFrame
df = pd.json_normalize(data=api_data, record_path=record_path)
# Preview the first rows
df.head()

Unnamed: 0,name,values
0,2020,[-14748.8775165815]
1,2021,[-46048.2056891399]


In [10]:
# > pandas rename column
# Apply the JSON-name -> column-name mapping along the column axis
df = df.rename(columnas, axis='columns')
# Preview the renamed frame
df.head()

Unnamed: 0,YEAR,PM04902AA
0,2020,[-14748.8775165815]
1,2021,[-46048.2056891399]


In [11]:
# > pandas format
# The value column was renamed with `columnas['values']`, so address it through
# that same key rather than the raw `serie` string (they only match while the
# code happens to be upper-case already).
# Each cell holds a list; take its first element and cast it to float.
# The dtype guard makes this cell safe to re-run: once the column is numeric
# there is nothing left to unpack and `.str` would raise.
if df[columnas['values']].dtype == object:
    df[columnas['values']] = df[columnas['values']].str[0].astype('float')

df.head()

Unnamed: 0,YEAR,PM04902AA
0,2020,-14748.877517
1,2021,-46048.205689


## 4. pandas: exporting data

In [12]:
# > variables
# Unique identifier for this execution, used as a folder name
exec_uuid = str(uuid4())
# Local aliases for the date and the series code
curr_date, s = c_date, serie
# Destination of the per-series .csv file
save_path = f'./data/current/{curr_date}/{exec_uuid}/{s}.csv'
# Show the resulting path
print(save_path)

./data/current/2022-06-04/fc29d1fc-99fe-4d0a-a74b-d7375769eba9/PM04902AA.csv


In [14]:
# > split save path
# Keep everything before the last '/', i.e. the folder that will hold the file
save_folder = save_path.rpartition('/')[0]
# Show the folder
print(save_folder)

./data/current/2022-06-04/fc29d1fc-99fe-4d0a-a74b-d7375769eba9


In [15]:
# > make save directory if not exists
# Create the folder together with any missing parents; an existing folder is fine
Path(save_folder).mkdir(exist_ok=True, parents=True)

In [16]:
# > export data from dataframe to csv
# Work on an independent (deep) copy of the frame
dfc = df.copy()
# Write it as ';'-separated ISO-8859-1 text; index=False leaves the row index out
dfc.to_csv(save_path, index=False, sep=';', encoding='iso-8859-1')

## 5. main scenario

### 5.1. multiple requests

In [31]:
# > fixed
# Series codes to download; every file written by this run shares one execution id
series = ['PM04901AA', 'PM04902AA', 'PM04903AA', 'PM04904AA', 'PM04905AA', 'PM04906AA', 'PM04907AA']
exec_uuid = str(uuid4())

# > loop-invariant settings, computed once instead of on every iteration
headers = {'Content-Type': 'application/json'}
record_path = 'periods'
curr_date = c_date
save_folder = f'./data/current/{curr_date}/{exec_uuid}'

for serie in series:
    # > variables
    api_url = f'https://estadisticas.bcrp.gob.pe/estadisticas/series/api/{serie}/json/2020/2022/'

    # > request + try-except
    try:
        response = requests.get(url=api_url, headers=headers, timeout=60)
        # Fail fast on 4xx/5xx answers instead of trying to parse an error body
        response.raise_for_status()
        response.encoding = 'utf-8'
        api_data = json.loads(response.text)
    except Exception:
        print('[INFO] something went wrong...')
        # Re-raise so the loop stops at the first series that cannot be fetched
        raise

    # > pandas from json
    columnas = {'name': 'YEAR', 'values': serie.upper()}
    df = pd.json_normalize(api_data, record_path=record_path)
    df = df.rename(columns=columnas)
    # Address the value column through the rename key so the lookup always
    # matches the renamed column; take the first list element and cast to float
    df[columnas['values']] = df[columnas['values']].str[0].astype('float')

    # > variables
    s = serie
    save_path = f'{save_folder}/{s}.csv'

    # > make save directory if not exists
    # Done inside the loop so no folder is created before a download succeeds
    Path(save_folder).mkdir(parents=True, exist_ok=True)

    # > export data from dataframe to csv
    dfc = df.copy(deep=True)
    dfc.to_csv(save_path, sep=';', encoding='iso-8859-1', index=False)

### 5.2. dataframe auxiliar

In [32]:
# > variables
# Run date and its year component (the text before the first '-')
curr_date = c_date
year = curr_date.partition('-')[0]
# Folder holding the per-series files: `save_path` minus its file name
read_path = save_path.rpartition('/')[0]

In [33]:
# > dataframe dummy
# Scaffold with one row per year from 1940 through the current year, each
# stamped with the load date and indexed by YEAR
tdf = pd.DataFrame(
    {'YEAR': range(1940, int(year) + 1), 'LOAD_DATE': curr_date}
).set_index('YEAR')

tdf.head()

Unnamed: 0_level_0,LOAD_DATE
YEAR,Unnamed: 1_level_1
1940,2022-06-04
1941,2022-06-04
1942,2022-06-04
1943,2022-06-04
1944,2022-06-04


In [34]:
# > files to join
# os.listdir() returns entries in arbitrary order, so sort for a stable listing
files = sorted(os.listdir(read_path))
for f in files:
    print(f)

PM04901AA.csv
PM04902AA.csv
PM04903AA.csv
PM04904AA.csv
PM04905AA.csv
PM04906AA.csv
PM04907AA.csv


In [35]:
# > complete table
# Only .csv entries are read, in sorted order, so the column order of the
# consolidated table does not depend on the arbitrary order of os.listdir()
files = sorted(f for f in os.listdir(read_path) if f.endswith('.csv'))
for f in files:
    df = pd.read_csv(f'{read_path}/{f}', sep=';', encoding='iso-8859-1')
    df = df.set_index('YEAR')
    # Drop columns left by a previous run of this cell before joining, so a
    # re-run refreshes them instead of raising on overlapping column names
    tdf = tdf.drop(columns=tdf.columns.intersection(df.columns)).join(df)

In [36]:
# > nulls introduced by the dummy table
# Count missing values per column, then show the last rows
print(tdf.isna().sum())
tdf.tail()

LOAD_DATE     0
PM04901AA    81
PM04902AA    81
PM04903AA    81
PM04904AA    81
PM04905AA    81
PM04906AA    81
PM04907AA    81
dtype: int64


Unnamed: 0_level_0,LOAD_DATE,PM04901AA,PM04902AA,PM04903AA,PM04904AA,PM04905AA,PM04906AA,PM04907AA
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018,2022-06-04,,,,,,,
2019,2022-06-04,,,,,,,
2020,2022-06-04,486402.086278,-14748.877517,471653.208762,4700.533284,476353.742045,11922.8519,488276.593945
2021,2022-06-04,551829.119782,-46048.205689,505780.914093,20360.512843,526141.426936,13619.526712,539760.953647
2022,2022-06-04,,,,,,,


In [37]:
# > reset index + drop nulls
# Only move YEAR back into a column while it is still the index; this keeps a
# re-run of the cell from adding a stray 'index' column
if tdf.index.name == 'YEAR':
    tdf = tdf.reset_index()
# Drop rows with a missing value in any series column. The series columns are
# selected by name (everything except YEAR and LOAD_DATE), not by position.
tdf = tdf.dropna(subset=tdf.columns.difference(['YEAR', 'LOAD_DATE']))

In [38]:
# > output
# Confirm no missing values remain, then preview the cleaned table
print(tdf.isna().sum())
tdf.head()

YEAR         0
LOAD_DATE    0
PM04901AA    0
PM04902AA    0
PM04903AA    0
PM04904AA    0
PM04905AA    0
PM04906AA    0
PM04907AA    0
dtype: int64


Unnamed: 0,YEAR,LOAD_DATE,PM04901AA,PM04902AA,PM04903AA,PM04904AA,PM04905AA,PM04906AA,PM04907AA
80,2020,2022-06-04,486402.086278,-14748.877517,471653.208762,4700.533284,476353.742045,11922.8519,488276.593945
81,2021,2022-06-04,551829.119782,-46048.205689,505780.914093,20360.512843,526141.426936,13619.526712,539760.953647


### 5.3. exportar datos

In [39]:
# > variables
# Separator and encoding used for the consolidated file
sep, encoding = '|', 'iso-8859-1'
# Consolidated output file for this run and the folder that will hold it
export_path = f'./data/output/{curr_date}/output_{exec_uuid}.csv'
save_folder = export_path.rpartition('/')[0]

In [40]:
# > export full table
# Make sure the output folder (and any missing parents) exists
Path(save_folder).mkdir(exist_ok=True, parents=True)
# Write the consolidated table without its row index
tdf.to_csv(export_path, index=False, sep=sep, encoding=encoding)

In [41]:
# > move current to historic
current_path = read_path
# Swap only the first '/current/' path segment; an unanchored replace would
# also rewrite any other occurrence of the word inside the path
historic_path = read_path.replace('/current/', '/historic/', 1)
# Guarantee the destination's parent folder exists before moving
Path(historic_path).parent.mkdir(parents=True, exist_ok=True)
shutil.move(current_path, historic_path)

'./data/historic/2022-06-04/83be990f-7afa-43ca-b58b-754a9918b6f1'

In [42]:
# Display the final consolidated table
tdf

Unnamed: 0,YEAR,LOAD_DATE,PM04901AA,PM04902AA,PM04903AA,PM04904AA,PM04905AA,PM04906AA,PM04907AA
80,2020,2022-06-04,486402.086278,-14748.877517,471653.208762,4700.533284,476353.742045,11922.8519,488276.593945
81,2021,2022-06-04,551829.119782,-46048.205689,505780.914093,20360.512843,526141.426936,13619.526712,539760.953647
