In [7]:
import re
from dataclasses import dataclass, field
from pathlib import Path
from pickle import dump
from typing import Set, List

import pandas as pd


@dataclass
class B3Service:
    """
    B3Service: A service for obtaining B3 tickers from portfolio composition files.

    Args:
    - portfolio_composition_path (Path): The path to the directory containing portfolio composition files.

    Attributes:
    - portfolio_composition_path (Path): The path to the directory containing portfolio composition files.
    - tickers (Set[str]): Valid tickers collected so far. Starts empty and is extended by every call to get_b3_tickers.
    """

    portfolio_composition_path: Path
    tickers: Set[str] = field(default_factory=set)

    def _is_valid_ticker(self, ticker: str) -> bool:
        """
        Checks if a ticker is valid.

        A ticker is valid when it consists of exactly four uppercase letters (A-Z)
        followed by one or two digits, e.g. "PETR4" or "ALUP11".

        Args:
        - ticker (str): Ticker to be validated. Non-string values (such as the NaN
          that pandas produces for an empty cell) are tolerated and reported as invalid.

        Returns:
        - bool: True if the ticker is valid, False otherwise.
        """
        # Guard first: the re module raises TypeError when given a float NaN.
        if not isinstance(ticker, str):
            return False
        # fullmatch, unlike match with "$", also rejects a trailing newline.
        return re.fullmatch(r"[A-Z]{4}\d{1,2}", ticker) is not None

    def _read_tickers_from_file(self, file: Path) -> pd.Series:
        """
        Reads the first column of a given file.

        The file is parsed as ";"-delimited text encoded in ISO-8859-1. Only the
        first column is loaded, and the first line of the file is consumed as the
        header row (pandas default).

        Args:
        - file (Path): The file from which tickers need to be read.

        Returns:
        - pd.Series: Series containing the raw, not yet validated, first-column values.
        """
        first_column = pd.read_csv(
            str(file.absolute()), encoding="iso-8859-1", delimiter=";", index_col=False, usecols=[0]
        )
        return first_column.iloc[:, 0]

    def _update_tickers(self, new_tickers: Set[str]) -> None:
        """
        Updates the set of tickers.

        Args:
        - new_tickers (Set[str]): New tickers to be added.
        """
        self.tickers.update(new_tickers)

    def get_b3_tickers(self) -> List[str]:
        """
        Gets B3 tickers from portfolio composition files.

        Every regular file with a ".csv" suffix directly inside
        portfolio_composition_path is read; other entries are skipped. Valid
        tickers are merged into self.tickers, so results accumulate across calls.

        Returns:
        - List[str]: Alphabetically sorted list of the B3 tickers collected so far.
        """
        for file in self.portfolio_composition_path.iterdir():
            if not file.is_file() or file.suffix != ".csv":
                continue

            raw_tickers = self._read_tickers_from_file(file)

            valid_tickers = {ticker for ticker in raw_tickers if self._is_valid_ticker(ticker)}
            self._update_tickers(valid_tickers)

        return sorted(self.tickers)

    def get_b3_tickers_in_yahoo_format(self) -> List[str]:
        """
        Gets B3 tickers with the ".SA" suffix appended to each one.

        This calls get_b3_tickers, so the portfolio composition files are read
        again on every invocation.

        Returns:
        - List[str]: Tickers in the same order as get_b3_tickers, each followed by ".SA".
        """
        return [f"{ticker}.SA" for ticker in self.get_b3_tickers()]


In [8]:
# Collect the tickers found in the portfolio composition CSV files.
portfolio_composition_path = Path("data/B3/portfolio_composition")
b3_service = B3Service(portfolio_composition_path)

b3_tickers = b3_service.get_b3_tickers()
yahoo_tickers = b3_service.get_b3_tickers_in_yahoo_format()

# The Yahoo list is derived element by element from the sorted B3 list, so
# pairing the two by position maps each B3 ticker to its ".SA" counterpart.
tickers = dict(zip(b3_tickers, yahoo_tickers))

tickers

{'AALR3': 'AALR3.SA',
 'ABCB4': 'ABCB4.SA',
 'ABEV3': 'ABEV3.SA',
 'AERI3': 'AERI3.SA',
 'AESB3': 'AESB3.SA',
 'AGRO3': 'AGRO3.SA',
 'ALOS3': 'ALOS3.SA',
 'ALPA4': 'ALPA4.SA',
 'ALUP11': 'ALUP11.SA',
 'AMBP3': 'AMBP3.SA',
 'ANIM3': 'ANIM3.SA',
 'ARML3': 'ARML3.SA',
 'ARZZ3': 'ARZZ3.SA',
 'ASAI3': 'ASAI3.SA',
 'AURE3': 'AURE3.SA',
 'AZUL4': 'AZUL4.SA',
 'BBAS3': 'BBAS3.SA',
 'BBDC3': 'BBDC3.SA',
 'BBDC4': 'BBDC4.SA',
 'BBSE3': 'BBSE3.SA',
 'BEEF3': 'BEEF3.SA',
 'BHIA3': 'BHIA3.SA',
 'BLAU3': 'BLAU3.SA',
 'BMOB3': 'BMOB3.SA',
 'BPAC11': 'BPAC11.SA',
 'BPAN4': 'BPAN4.SA',
 'BRAP4': 'BRAP4.SA',
 'BRFS3': 'BRFS3.SA',
 'BRKM5': 'BRKM5.SA',
 'BRSR6': 'BRSR6.SA',
 'CAML3': 'CAML3.SA',
 'CASH3': 'CASH3.SA',
 'CBAV3': 'CBAV3.SA',
 'CCRO3': 'CCRO3.SA',
 'CEAB3': 'CEAB3.SA',
 'CIEL3': 'CIEL3.SA',
 'CLSA3': 'CLSA3.SA',
 'CMIG3': 'CMIG3.SA',
 'CMIG4': 'CMIG4.SA',
 'CMIN3': 'CMIN3.SA',
 'COGN3': 'COGN3.SA',
 'CPFE3': 'CPFE3.SA',
 'CPLE3': 'CPLE3.SA',
 'CPLE6': 'CPLE6.SA',
 'CRFB3': 'CRFB3.SA',
 'CSAN

In [9]:
# Persist the B3 -> Yahoo ticker mapping as a pickle file.
with Path("data/tickers.pkl").open("wb") as file:
    dump(tickers, file)