In [1]:
import pandas as pd
from datetime import datetime, timedelta
import os

In [2]:
# Shared helpers used by the scraping cells below.

# Year of yesterday's date as a four-digit string; subtracting one day means a
# run on 1 January still reports the previous year.
_yesterday = datetime.now() - timedelta(days=1)
current_year = _yesterday.strftime('%Y')

# Portuguese three-letter month abbreviations mapped to zero-padded month numbers.
months = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        ['Jan', 'Fev', 'Mar', 'Abr', 'Mai', 'Jun',
         'Jul', 'Ago', 'Set', 'Out', 'Nov', 'Dez'],
        start=1,
    )
}

In [3]:
# Build a DataFrame of monthly CDI values scraped from vriconsulting.com.br,
# walking the paginated table from the newest page until the page that
# contains January 2000 has been collected.

all_dfs = []

for page_number in range(0, 100):
    page_url = f"https://www.vriconsulting.com.br/indices/cdi.php?pagina={page_number}"
    page_df = pd.read_html(page_url)[0].copy()
    all_dfs.append(page_df)
    # Stop paging as soon as the page holding 'Jan/2000' has been stored.
    if 'Jan/2000' in page_df['Mês/Ano'].values:
        break

# ignore_index=True gives every row a unique positional label. Without it each
# page keeps its own 0..k labels, so label 0 would refer to the first row of
# *every* page instead of only the newest row.
cdi_df = pd.concat(all_dfs, ignore_index=True)
cdi_df = cdi_df.rename(columns={"Índice do mês (em %)": "CDI"})
cdi_df = cdi_df[["CDI", "Mês/Ano"]].copy()

# Avoid loss of data: overwrite the newest row's CDI with the value of the row
# right below it. A single .loc assignment is used instead of chained indexing,
# which is not guaranteed to write through to the frame.
# NOTE(review): presumably the newest month has no published value yet - confirm.
cdi_df.loc[0, "CDI"] = cdi_df.loc[1, "CDI"]
cdi_df["CDI"] = cdi_df["CDI"].astype('float')

# Transform column "Mês/Ano" (e.g. 'Jan/2000') into a timestamp index:
# normalise the text, swap the month abbreviation for its number, then parse.
cdi_df["Mês/Ano"] = cdi_df["Mês/Ano"].str.capitalize().str.strip()
cdi_df["Mês/Ano"] = cdi_df["Mês/Ano"].str[:3].map(months) + cdi_df["Mês/Ano"].str[3:]
# NOTE: the parsed month-start timestamps are localized as UTC and then
# converted to America/Sao_Paulo, which moves each one to 21:00/22:00 of the
# previous day (month MM/YYYY ends up labelled with the last day of the
# preceding month). The cutoff literal below is written in that shifted form.
cdi_df["Mês/Ano"] = (
    pd.to_datetime(cdi_df['Mês/Ano'], format='%m/%Y', errors='coerce')
    .dt.tz_localize('UTC')
    .dt.tz_convert('America/Sao_Paulo')
)
cdi_df = cdi_df.set_index("Mês/Ano")

# Keep the rows selected by the cutoff label and rescale the values for later analysis.
cdi_df = cdi_df.loc['2000-01-31 22:00:00-02:00':]
# NOTE(review): presumably read_html parsed the comma-decimal figures as
# integers (its default thousands separator is ','), so dividing by 10000
# restores values such as 0.9160 - confirm against the raw page.
cdi_df = cdi_df / 10000
# NOTE: no annual-to-monthly conversion is applied here; the values come from
# the column the site labels "Índice do mês (em %)".





In [4]:
# Build a DataFrame of monthly Brazilian CPI inflation scraped from
# inflation.eu, one page per year from 2000 through current_year (inclusive).
all_dfs = []

# Iterating straight up to current_year removes the hard-coded 2040 ceiling.
for year in range(2000, int(current_year) + 1):
    # The table at position 10 of the page is used; only its first two columns
    # are kept and its first row (label 0) is dropped.
    page_df = pd.read_html(f"https://www.inflation.eu/pt/taxas-de-inflacao/brasil/inflacao-historica/ipc-inflacao-brasil-{year}.aspx")[10].copy()
    page_df = page_df[[0, 1]]
    page_df = page_df.loc[1:]
    all_dfs.append(page_df)

inflation_df = pd.concat(all_dfs)
inflation_df = inflation_df.rename(columns={0: "Inflation/Month", 1: "Inflation"})

# Transform column "Inflation/Month" into a timestamp index: keep the text
# before the first '-', map the three-letter month prefix to its number and
# append the last five characters of the label (presumably the year plus one
# surrounding space, removed by the strip below - TODO confirm on the raw page).
inflation_df['Inflation/Month'] = inflation_df['Inflation/Month'].str.capitalize().str.split('-').str[0]
inflation_df['Inflation/Month'] = inflation_df['Inflation/Month'].str[:3].map(months) + '/' + inflation_df['Inflation/Month'].str[-5:]
inflation_df['Inflation/Month'] = inflation_df['Inflation/Month'].str.strip()
inflation_df["Inflation/Month"] = pd.to_datetime(inflation_df["Inflation/Month"], format='%m/%Y', errors='coerce')
# NOTE: localizing as UTC and converting to America/Sao_Paulo moves each
# month-start timestamp to 21:00/22:00 of the previous day; the cutoff literal
# below is written in that shifted form.
inflation_df["Inflation/Month"] = inflation_df["Inflation/Month"].dt.tz_localize('UTC').dt.tz_convert('America/Sao_Paulo')
inflation_df = inflation_df.set_index('Inflation/Month', drop=True)

# Keep the rows selected by the cutoff label, then only the rows that hold a
# percentage. na=False treats missing cells as non-matching instead of raising,
# and .copy() makes the filtered frame independent so the column assignments
# below act on it directly.
inflation_df = inflation_df.loc['2000-01-31 22:00:00-02:00':]
inflation_df = inflation_df[inflation_df['Inflation'].str.contains('%', na=False)].copy()
# Convert text such as '0,13 %' to a float: drop '%', use '.' as the decimal mark.
inflation_df['Inflation'] = inflation_df['Inflation'].str.replace('%', "").str.replace(',', '.').str.strip()
# Cast the column itself rather than assigning the whole frame to the column.
inflation_df['Inflation'] = inflation_df['Inflation'].astype('float')


In [5]:
# Show the cleaned inflation frame; as the cell's last expression it is
# rendered as the cell output.
inflation_df

Unnamed: 0_level_0,Inflation
Inflation/Month,Unnamed: 1_level_1
2000-01-31 22:00:00-02:00,0.13
2000-02-29 21:00:00-03:00,0.22
2000-03-31 21:00:00-03:00,0.42
2000-04-30 21:00:00-03:00,0.01
2000-05-31 21:00:00-03:00,0.23
...,...
2023-06-30 21:00:00-03:00,0.12
2023-07-31 21:00:00-03:00,0.23
2023-08-31 21:00:00-03:00,0.26
2023-09-30 21:00:00-03:00,0.24


In [6]:
# Show the cleaned CDI frame; as the cell's last expression it is rendered as
# the cell output.
cdi_df

Unnamed: 0_level_0,CDI
Mês/Ano,Unnamed: 1_level_1
2023-11-30 21:00:00-03:00,0.9160
2023-10-31 21:00:00-03:00,0.9160
2023-09-30 21:00:00-03:00,0.9976
2023-08-31 21:00:00-03:00,0.9729
2023-07-31 21:00:00-03:00,1.1375
...,...
2000-05-31 21:00:00-03:00,1.3852
2000-04-30 21:00:00-03:00,1.4881
2000-03-31 21:00:00-03:00,1.2838
2000-02-29 21:00:00-03:00,1.4389


In [7]:
# Combine both series on their timestamp indexes; the outer join keeps every
# month that appears in either frame.
df = inflation_df.merge(
    cdi_df,
    how='outer',
    left_index=True,
    right_index=True,
)

In [8]:
# Summarise the merged frame: index range, non-null count and dtype per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 287 entries, 2000-01-31 22:00:00-02:00 to 2023-11-30 21:00:00-03:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Inflation  286 non-null    float64
 1   CDI        287 non-null    float64
dtypes: float64(2)
memory usage: 6.7 KB


In [9]:
# Fill months that have no inflation value with the last known one.
# The result is assigned back to the column: an inplace ffill on the
# df['Inflation'] selection acts on an intermediate object and is not
# guaranteed to update df itself.
df['Inflation'] = df['Inflation'].ffill()
df

Unnamed: 0,Inflation,CDI
2000-01-31 22:00:00-02:00,0.13,1.4405
2000-02-29 21:00:00-03:00,0.22,1.4389
2000-03-31 21:00:00-03:00,0.42,1.2838
2000-04-30 21:00:00-03:00,0.01,1.4881
2000-05-31 21:00:00-03:00,0.23,1.3852
...,...,...
2023-07-31 21:00:00-03:00,0.23,1.1375
2023-08-31 21:00:00-03:00,0.26,0.9729
2023-09-30 21:00:00-03:00,0.24,0.9976
2023-10-31 21:00:00-03:00,0.28,0.9160


In [10]:
# Write the merged frame, including its timestamp index, to a file named
# 'Inflation_CDI_data' in the current working directory.
FILE_FOLDER = os.path.abspath('')
# os.path.join builds the path with the platform's separator instead of a
# hard-coded '/'.
df.to_csv(os.path.join(FILE_FOLDER, 'Inflation_CDI_data'), index=True)
