In [134]:
import yfinance as yf
from datetime import datetime, timedelta
from get_all_tickers import get_tickers as gt
import seaborn as sns
import pandas as pd
import numpy as np

## Step 1: Collect and transforming the data from yfinance


In [135]:
br_df = pd.read_html("https://www.dadosdemercado.com.br/bolsa/acoes").copy()
tickers = br_df[0]['Ticker']
tickers = [tickers + ".SA" for tickers in tickers]

In [136]:
# def get_data(stock_df_name, per=None ,start=None , end=None ):
# get all major data to the project
big_data = yf.Ticker('ITUB4.SA')
# start_date = '1990-01-01'
# end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

stock_df = big_data.history(period='max' ).copy()

# Transforming the data to analysis
columns = ["Close", 'Volume', 'Open', 'High', 'Low', 'Dividends']

stock_df = pd.DataFrame(stock_df[columns])
stock_df['Yield'] = stock_df['Dividends'] / stock_df['Close']
stock_df = stock_df.astype('float')

# return stock_df

In [137]:
today = (datetime.now() - timedelta(days=1) ).strftime('%Y-%m-%d')
month_1 = (datetime.now() - timedelta(days=31)).strftime('%Y-%m-%d')
year_1 = (datetime.now() - timedelta(days=366)).strftime('%Y-%m-%d')
year_5 = (datetime.now() - timedelta(days=(365 * 5  + 1))).strftime('%Y-%m-%d')

In [138]:
stock_df


Unnamed: 0_level_0,Close,Volume,Open,High,Low,Dividends,Yield
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-12-21 00:00:00-02:00,1.688831,74224.0,1.606090,1.704794,1.605901,0.00000,0.000000
2000-12-22 00:00:00-02:00,1.615630,23312.0,1.671830,1.671830,1.605712,0.00000,0.000000
2000-12-25 00:00:00-02:00,1.615630,0.0,1.615630,1.615630,1.615630,0.00000,0.000000
2000-12-26 00:00:00-02:00,1.615157,7851.0,1.606751,1.617046,1.591544,0.00000,0.000000
2000-12-27 00:00:00-02:00,1.669941,26996.0,1.643494,1.669941,1.615158,0.00000,0.000000
...,...,...,...,...,...,...,...
2023-12-27 00:00:00-03:00,33.862396,18176500.0,33.592536,33.932360,33.552556,0.00000,0.000000
2023-12-28 00:00:00-03:00,33.952351,12555300.0,33.862397,33.952351,33.722471,0.00000,0.000000
2024-01-02 00:00:00-03:00,33.520000,18948200.0,33.889999,33.889999,33.240002,0.01765,0.000527
2024-01-03 00:00:00-03:00,33.150002,18530900.0,33.509998,33.770000,33.150002,0.00000,0.000000


In [139]:
# initial and final values from time series to create a condicional line chart color
init_value = stock_df['Close'].iloc[0]
last_value = stock_df['Close'].iloc[-1]
line_color = "red" if init_value > last_value else "green"

## Step 2: Collect and transforming the data from vriconsulting


In [140]:


all_dfs = []
# Create a df with a web data and get all values from currenty month until january 2020

for i in range(0,100):
    url = f"https://www.vriconsulting.com.br/indices/cdi.php?pagina={i}"
    df = pd.read_html(url)[0].copy()
    all_dfs.append(df)
    if 'Jan/2000' in df['Mês/Ano'].values:
        break
cdi_df = pd.concat(all_dfs)
cdi_df.rename(columns={"Índice do mês (em %)": "CDI/Month"}, inplace=True)
cdi_df = cdi_df[["CDI/Month", "Mês/Ano"]]

# Avoid lost of data
cdi_df["CDI/Month"][0] = cdi_df["CDI/Month"][1]
cdi_df["CDI/Month"] =  cdi_df["CDI/Month"].astype('float')


In [141]:
cdi_df["Mês/Ano"].str[:3]

0     dez
1     Nov
2     Out
3     Set
4     Ago
     ... 
35    Ago
36    Jul
37    Jun
38    Mai
39    Abr
Name: Mês/Ano, Length: 321, dtype: object

In [142]:
cdi_df

Unnamed: 0,CDI/Month,Mês/Ano
0,9160.0,dez/2023
1,9160.0,Nov/2023
2,9976.0,Out/2023
3,9729.0,Set/2023
4,11375.0,Ago/2023
...,...,...
35,15808.0,Ago/1997
36,16059.0,Jul/1997
37,15919.0,Jun/1997
38,15784.0,Mai/1997


In [143]:
months = {
    'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
    'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
    'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
}

cdi_df["Mês/Ano"] = cdi_df["Mês/Ano"].str.capitalize().str.strip()
cdi_df["Mês/Ano"] = cdi_df["Mês/Ano"].str[:3].map(months) + cdi_df["Mês/Ano"].str[3:]
cdi_df["Mês/Ano"] = pd.to_datetime(cdi_df['Mês/Ano'], format='%m/%Y', errors='coerce').dt.tz_localize('UTC').dt.tz_convert('America/Sao_Paulo')




In [144]:
cdi_df['Mês/Ano']

0    2023-11-30 21:00:00-03:00
1    2023-10-31 21:00:00-03:00
2    2023-09-30 21:00:00-03:00
3    2023-08-31 21:00:00-03:00
4    2023-07-31 21:00:00-03:00
                ...           
35   1997-07-31 21:00:00-03:00
36   1997-06-30 21:00:00-03:00
37   1997-05-31 21:00:00-03:00
38   1997-04-30 21:00:00-03:00
39   1997-03-31 21:00:00-03:00
Name: Mês/Ano, Length: 321, dtype: datetime64[ns, America/Sao_Paulo]

In [145]:
stock_df.index

DatetimeIndex(['2000-12-21 00:00:00-02:00', '2000-12-22 00:00:00-02:00',
               '2000-12-25 00:00:00-02:00', '2000-12-26 00:00:00-02:00',
               '2000-12-27 00:00:00-02:00', '2000-12-28 00:00:00-02:00',
               '2000-12-29 00:00:00-02:00', '2001-01-01 00:00:00-02:00',
               '2001-01-02 00:00:00-02:00', '2001-01-03 00:00:00-02:00',
               ...
               '2023-12-19 00:00:00-03:00', '2023-12-20 00:00:00-03:00',
               '2023-12-21 00:00:00-03:00', '2023-12-22 00:00:00-03:00',
               '2023-12-26 00:00:00-03:00', '2023-12-27 00:00:00-03:00',
               '2023-12-28 00:00:00-03:00', '2024-01-02 00:00:00-03:00',
               '2024-01-03 00:00:00-03:00', '2024-01-04 00:00:00-03:00'],
              dtype='datetime64[ns, America/Sao_Paulo]', name='Date', length=5776, freq=None)

In [146]:
cdi_df = cdi_df.set_index("Mês/Ano")



In [147]:
# fill the dataframe with day values

cdi_df = cdi_df.resample('D').ffill()
cdi_df

Unnamed: 0_level_0,CDI/Month
Mês/Ano,Unnamed: 1_level_1
1997-03-31 00:00:00-03:00,
1997-04-01 00:00:00-03:00,16563.0
1997-04-02 00:00:00-03:00,16563.0
1997-04-03 00:00:00-03:00,16563.0
1997-04-04 00:00:00-03:00,16563.0
...,...
2023-11-26 00:00:00-03:00,9160.0
2023-11-27 00:00:00-03:00,9160.0
2023-11-28 00:00:00-03:00,9160.0
2023-11-29 00:00:00-03:00,9160.0


In [148]:

cdi_df = cdi_df[cdi_df.index.isin(stock_df.index)]


In [149]:
cdi_df = (cdi_df.astype('float') / 100000)


In [150]:
# convert the CDI rate from anually to daily 
cdi_df['CDI/Month'] = (1 + cdi_df['CDI/Month'] ) ** (1/365) - 1
cdi_df

Unnamed: 0_level_0,CDI/Month
Mês/Ano,Unnamed: 1_level_1
2000-12-21 00:00:00-02:00,0.000309
2000-12-22 00:00:00-02:00,0.000309
2000-12-25 00:00:00-02:00,0.000309
2000-12-26 00:00:00-02:00,0.000309
2000-12-27 00:00:00-02:00,0.000309
...,...
2023-11-24 00:00:00-03:00,0.000240
2023-11-27 00:00:00-03:00,0.000240
2023-11-28 00:00:00-03:00,0.000240
2023-11-29 00:00:00-03:00,0.000240


In [151]:
stock_df

Unnamed: 0_level_0,Close,Volume,Open,High,Low,Dividends,Yield
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-12-21 00:00:00-02:00,1.688831,74224.0,1.606090,1.704794,1.605901,0.00000,0.000000
2000-12-22 00:00:00-02:00,1.615630,23312.0,1.671830,1.671830,1.605712,0.00000,0.000000
2000-12-25 00:00:00-02:00,1.615630,0.0,1.615630,1.615630,1.615630,0.00000,0.000000
2000-12-26 00:00:00-02:00,1.615157,7851.0,1.606751,1.617046,1.591544,0.00000,0.000000
2000-12-27 00:00:00-02:00,1.669941,26996.0,1.643494,1.669941,1.615158,0.00000,0.000000
...,...,...,...,...,...,...,...
2023-12-27 00:00:00-03:00,33.862396,18176500.0,33.592536,33.932360,33.552556,0.00000,0.000000
2023-12-28 00:00:00-03:00,33.952351,12555300.0,33.862397,33.952351,33.722471,0.00000,0.000000
2024-01-02 00:00:00-03:00,33.520000,18948200.0,33.889999,33.889999,33.240002,0.01765,0.000527
2024-01-03 00:00:00-03:00,33.150002,18530900.0,33.509998,33.770000,33.150002,0.00000,0.000000


In [152]:
df = pd.merge(cdi_df, stock_df, left_index=True, right_index=True, how='outer')
df['CDI/Month'].ffill(inplace=True)


In [153]:
df['Return_Stock'] = (df['Close'] + df['Yield'] - df['Close'].shift(1) ) / df['Close'].iloc[0] 
df['Return_CDI'] = (df['Close'].iloc[0] * df['CDI/Month'] )  / df['Close'].iloc[0]


In [154]:
df['Return_Stock'] = df['Return_Stock'].cumsum()
df['Return_CDI'] = df['Return_CDI'].cumsum()

In [155]:
df

Unnamed: 0,CDI/Month,Close,Volume,Open,High,Low,Dividends,Yield,Return_Stock,Return_CDI
2000-12-21 00:00:00-02:00,0.000309,1.688831,74224.0,1.606090,1.704794,1.605901,0.00000,0.000000,,0.000309
2000-12-22 00:00:00-02:00,0.000309,1.615630,23312.0,1.671830,1.671830,1.605712,0.00000,0.000000,-0.043344,0.000618
2000-12-25 00:00:00-02:00,0.000309,1.615630,0.0,1.615630,1.615630,1.615630,0.00000,0.000000,-0.043344,0.000927
2000-12-26 00:00:00-02:00,0.000309,1.615157,7851.0,1.606751,1.617046,1.591544,0.00000,0.000000,-0.043624,0.001236
2000-12-27 00:00:00-02:00,0.000309,1.669941,26996.0,1.643494,1.669941,1.615158,0.00000,0.000000,-0.011185,0.001545
...,...,...,...,...,...,...,...,...,...,...
2023-12-27 00:00:00-03:00,0.000240,33.862396,18176500.0,33.592536,33.932360,33.552556,0.00000,0.000000,19.495746,1.420402
2023-12-28 00:00:00-03:00,0.000240,33.952351,12555300.0,33.862397,33.952351,33.722471,0.00000,0.000000,19.549010,1.420642
2024-01-02 00:00:00-03:00,0.000240,33.520000,18948200.0,33.889999,33.889999,33.240002,0.01765,0.000527,19.293316,1.420883
2024-01-03 00:00:00-03:00,0.000240,33.150002,18530900.0,33.509998,33.770000,33.150002,0.00000,0.000000,19.074231,1.421123


In [156]:

def transform_data(stock_df_name, per='None', start=None, end=None):
    # get all major data to the project
    big_data = yf.Ticker(stock_df_name)
    # start_date = '1990-01-01'
    # end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    stock_df = big_data.history(period=per, start=start, end=end).copy()

    # Transforming the data to analysis
    columns = ["Close", 'Volume', 'Open', 'High', 'Low', 'Dividends']
    stock_df = pd.DataFrame(stock_df[columns])
    stock_df['Yield'] = stock_df['Dividends'] / stock_df['Close']
    stock_df = stock_df.astype('float')

    df = pd.merge(cdi_df, stock_df, left_index=True,
                  right_index=True, how='outer')
    df['CDI/Month'] = df['CDI/Month'].ffill()
    df = df.iloc[0:]
    df['Return_Stock'] = (df['Close'] + df['Yield'] -
                    df['Close'].shift(1)) / df['Close'].iloc[0]
    df['Return_CDI'] = (df['Close'].iloc[0] * df['CDI/Month'] )  / df['Close'].iloc[0]

    df['Return_CDI'] = df['Return_CDI'].cumsum()
    df['Return_Stock'] = df['Return_Stock'].cumsum()
    df = df.iloc[1:]
    return df



In [157]:
df = transform_data("ITUB4.SA", 'max')
df

Unnamed: 0,CDI/Month,Close,Volume,Open,High,Low,Dividends,Yield,Return_Stock,Return_CDI
2000-12-22 00:00:00-02:00,0.000309,1.615631,23312.0,1.671830,1.671830,1.605713,0.00000,0.000000,-0.043344,0.000618
2000-12-25 00:00:00-02:00,0.000309,1.615631,0.0,1.615631,1.615631,1.615631,0.00000,0.000000,-0.043344,0.000927
2000-12-26 00:00:00-02:00,0.000309,1.615158,7851.0,1.606751,1.617046,1.591544,0.00000,0.000000,-0.043624,0.001236
2000-12-27 00:00:00-02:00,0.000309,1.669941,26996.0,1.643494,1.669941,1.615158,0.00000,0.000000,-0.011185,0.001545
2000-12-28 00:00:00-02:00,0.000309,1.747392,77425.0,1.660495,1.785174,1.633954,0.00000,0.000000,0.034676,0.001854
...,...,...,...,...,...,...,...,...,...,...
2023-12-27 00:00:00-03:00,0.000240,33.862396,18176500.0,33.592536,33.932360,33.552556,0.00000,0.000000,19.495749,1.420402
2023-12-28 00:00:00-03:00,0.000240,33.952351,12555300.0,33.862397,33.952351,33.722471,0.00000,0.000000,19.549013,1.420642
2024-01-02 00:00:00-03:00,0.000240,33.520000,18948200.0,33.889999,33.889999,33.240002,0.01765,0.000527,19.293319,1.420883
2024-01-03 00:00:00-03:00,0.000240,33.150002,18530900.0,33.509998,33.770000,33.150002,0.00000,0.000000,19.074233,1.421123
