# Evaluation of Data Scientist Agents for Finance

## Load results

In [1]:
import json
import os

import pandas as pd

# Machine-specific default location of the gold-standard CSV. Set the
# GOLD_STANDARD_CSV environment variable to run the notebook on another machine.
GOLD_STANDARD_CSV = os.environ.get(
    "GOLD_STANDARD_CSV",
    "/Users/thiagocastroferreira/Desktop/kubernetes/mcp-tutorial/fundamental_analysis/2025-04-17/fundamental_analysis.csv",
)

# Indicators that are evaluated both as a 12-month and as a 3-month figure.
# The bare name is relabelled with a "(12 Meses)" suffix, and the same mapping
# is applied to the term list and to the gold-standard columns so they agree.
TWELVE_MONTH_RENAMES = {
    name: f"{name} (12 Meses)" for name in ("Receita Líquida", "EBIT", "Lucro Líquido")
}

# Terms to evaluate: the catalogue from terms.csv (with the 12-month relabelling
# applied) followed by the three 3-month indicators.
terms = [
    TWELVE_MONTH_RENAMES.get(term, term)
    for term in pd.read_csv("fundamental_analysis/terms.csv")["Termo"].tolist()
] + [
    "Receita Líquida (3 meses)",
    "EBIT (3 meses)",
    "Lucro Líquido (3 meses)",
]

ENERGY_COMPANIES = [
    {"id": "ALUP11", "cnpj": "08.364.948/0001-38", "name": "Alupar Investimento"},
    {"id": "AURE3", "cnpj": "28.594.234/0001-23", "name": "Auren Energia"},
    {
        "id": "CPLE3",
        "cnpj": "76.483.817/0001-20",
        "name": "Companhia Paranaense de Energia",
    },
    {"id": "EGIE3", "cnpj": "02.474.103/0001-19", "name": "Engie Brasil Energia"},
    {"id": "ELET3", "cnpj": "00.001.180/0001-26", "name": "Eletrobrás"},
    {"id": "ENEV3", "cnpj": "04.423.567/0001-21", "name": "Eneva"},
    {"id": "ENGI3", "cnpj": "00.864.214/0001-06", "name": "Energisa"},
    {"id": "EQTL3", "cnpj": "03.220.438/0001-73", "name": "Equatorial"},
    {"id": "ISAE3", "cnpj": "02.998.611/0001-04", "name": "ISA Energia Brasil"},
    {"id": "LIGT3", "cnpj": "03.378.521/0001-75", "name": "Light"},
    {"id": "NEOE3", "cnpj": "01.083.200/0001-18", "name": "Neoenergia"},
    {"id": "RNEW11", "cnpj": "08.534.605/0001-74", "name": "Renova Energia"},
    {"id": "SRNA3", "cnpj": "42.500.384/0001-51", "name": "Serena Energia"},
]
stocks = [company["id"] for company in ENERGY_COMPANIES]

# Keep only the evaluated tickers, ordered by ticker. Built as one chain so the
# cell can be re-run without depending on an earlier in-place mutation.
gold_standard = (
    pd.read_csv(GOLD_STANDARD_CSV)
    .rename(columns=TWELVE_MONTH_RENAMES)
    .loc[lambda frame: frame["Papel"].isin(stocks)]
    .sort_values(by="Papel")
)

## Mean Absolute Error Formula

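Absolute errors are computed after min-max scaling both series to the gold-standard range. Note: the clipping of errors to 100 mentioned here is not applied by `calculate_mae` below.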

In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler


def calculate_mae(y_true: list, y_pred: list):
    """Return the per-sample absolute errors between two series after min-max scaling.

    A ``MinMaxScaler`` with default settings is fitted on ``y_true`` only and
    the same transform is then applied to both series, so predictions are
    expressed relative to the spread of the gold values. Despite the name, the
    errors are not averaged here and no clipping is applied; callers aggregate
    the returned list themselves.

    Args:
        y_true: Gold-standard values, one per sample.
        y_pred: Predicted values, in the same order as ``y_true``.

    Returns:
        A list with one absolute scaled error per sample.

    Raises:
        ValueError: If the two series differ in length. Previously a longer
            ``y_pred`` was silently truncated and a shorter one raised a bare
            ``IndexError``.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )

    # The scaler expects a 2-D (n_samples, 1) column.
    gold_column = np.array(y_true).reshape(-1, 1)
    pred_column = np.array(y_pred).reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(gold_column)

    scaled_errors = np.abs(
        scaler.transform(gold_column) - scaler.transform(pred_column)
    )
    # Flatten back to one value per sample; callers extend plain lists with it.
    return list(scaled_errors.ravel())

## Calculating Error

Calculating error for framework, task and server

In [3]:
# Machine-specific default root of the experiment outputs. Set the RESULTS_DIR
# environment variable to run the notebook on another machine.
PATH = os.environ.get(
    "RESULTS_DIR", "/Users/thiagocastroferreira/Desktop/kubernetes/results"
)

# Experiment runs to evaluate, as "<llm>/<strategy>" sub-directories of
# f"{PATH}/experiments".
MODEL_RUNS = [
    "gpt-5-mini/agent",
    "gpt-5-mini/workflow",
    "gpt-5-mini/workflow_new",
    "gpt-4.1-mini/workflow",
    "gpt-4.1-mini/agent",
]

# One (run name, DataFrame) pair per run; each frame has one row per stock.
models = []
for model in MODEL_RUNS:
    path = f"{PATH}/experiments/{model}"

    records = []
    for stock in stocks:
        # Indicator names contain non-ASCII characters, so decode explicitly as
        # UTF-8 instead of relying on the platform default encoding.
        with open(f"{path}/{stock}_output.json", "r", encoding="utf-8") as f:
            output = json.load(f)

        indicators = {
            entry["indicator"]: entry["value"] for entry in output["indicators"]
        }
        indicators["stock"] = stock
        indicators["model"] = model
        # An indicator the run did not report is scored as a prediction of 0.
        for term in terms:
            indicators.setdefault(term, 0)
        records.append(indicators)

    models.append((model, pd.DataFrame(records)))

In [4]:
from tabulate import tabulate

# One row per (term, model); the "MAE" cell holds the list of per-stock scaled
# absolute errors returned by calculate_mae, not a single averaged number.
headers = ["Termo", "Model", "MAE"]
error_rows = []
for term in terms:
    # Loop-invariant across models, so read the gold column once per term.
    y_gold = gold_standard[term].to_list()

    for model_name, df in models:
        y_pred = df[term].to_list()

        # Values are compared positionally: gold rows are sorted by ticker and
        # predictions follow the order of `stocks`.
        # NOTE(review): assumes `stocks` is itself in sorted ticker order and
        # that every ticker has exactly one gold row -- verify.
        if len(y_gold) != len(y_pred):
            # Report the skip instead of dropping the pair silently; a missing
            # pair otherwise only shows up later as a hole in the tables.
            print(
                f"Skipping {term!r} for {model_name}: "
                f"{len(y_gold)} gold values vs {len(y_pred)} predictions"
            )
            continue

        error_rows.append(
            {
                "Termo": term,
                "Model": model_name,
                "MAE": calculate_mae(y_pred=y_pred, y_true=y_gold),
            }
        )

results_df = pd.DataFrame(error_rows, columns=headers)

## Evaluation of Models

In [5]:
# Pool every per-stock error of every term for each model, then print the
# model key next to the mean of that pool rounded to two decimals.
grouped = results_df[["Model", "MAE"]].groupby(by=["Model"])

for group_keys, group_df in grouped:
    pooled_errors = [error for errors in group_df["MAE"] for error in errors]
    mean_error = sum(pooled_errors) / len(pooled_errors)
    print(group_keys, round(mean_error, 2))

('gpt-4.1-mini/agent',) 5.84
('gpt-4.1-mini/workflow',) 9.9
('gpt-5-mini/agent',) 6.12
('gpt-5-mini/workflow',) 1.53
('gpt-5-mini/workflow_new',) 0.59


## Evaluation of Models per Term

In [11]:
# Display name of each experiment run in the report.
map2model = {
    "gpt-4.1-mini/agent": "Agent 4.1-mini",
    "gpt-4.1-mini/workflow": "Workflow 4.1-mini",
    "gpt-5-mini/agent": "Agent 5-mini",
    "gpt-5-mini/workflow": "Workflow 5-mini",
    "gpt-5-mini/workflow_new": "Workflow 5-mini NEW",
}
# Column order of the report; every entry after "Termo" is a display name
# from map2model.
headers = [
    "Termo",
    "Agent 4.1-mini",
    "Agent 5-mini",
    "Workflow 4.1-mini",
    "Workflow 5-mini",
    "Workflow 5-mini NEW",
]
grouped = results_df[["Model", "Termo", "MAE"]].groupby(by=["Model", "Termo"])

# terms_mae[term][display name] -> mean scaled absolute error, two decimals.
terms_mae = {}
for (model_name, term_name), group_df in grouped:
    pooled_errors = [error for errors in group_df["MAE"] for error in errors]
    mean_error = round(sum(pooled_errors) / len(pooled_errors), 2)
    terms_mae.setdefault(term_name, {})[map2model[model_name]] = mean_error

# Build each row from the header order so the two cannot drift apart. A
# (term, model) pair that has no errors in results_df is shown as nan instead
# of aborting the whole table with a KeyError.
results = [
    [term_name] + [scores.get(column, float("nan")) for column in headers[1:]]
    for term_name, scores in terms_mae.items()
]

print(tabulate(results, headers=headers))

Termo                         Agent 4.1-mini    Agent 5-mini    Workflow 4.1-mini    Workflow 5-mini    Workflow 5-mini NEW
--------------------------  ----------------  --------------  -------------------  -----------------  ---------------------
Ativo                                   0.07            0                    0                  0                      0
Ativo Circulante                        0.03            0                    0                  0                      0
Disponibilidades                        0.07            0.02                 0                  0.02                   0.03
Div Br/ Patrim                          0.37            0.08                 0.12               0.08                   0.07
Dív. Bruta                              0.18            0                    0.1                0                      0
Dív. Líquida                            0.29            0.02                 0.18               0.01                   0.03
EBIT (12 Meses)  

## Checking examples

In [7]:
# Side-by-side view of gold vs. predicted indicators for a single run and stock.
model = "gpt-4.1-mini/workflow"
stock = "LIGT3"

path = f"{PATH}/experiments/{model}"
# Indicator names contain non-ASCII characters, so decode explicitly as UTF-8
# instead of relying on the platform default encoding.
with open(f"{path}/{stock}_output.json", "r", encoding="utf-8") as f:
    output = json.load(f)

indicators = {entry["indicator"]: entry["value"] for entry in output["indicators"]}

stock_gold_standard = gold_standard[(gold_standard["Papel"] == stock)]
if len(stock_gold_standard) != 1:
    raise ValueError(
        f"Expected exactly one gold-standard row for {stock}, "
        f"found {len(stock_gold_standard)}"
    )
# Take the single row as a Series so each lookup yields a scalar. Calling
# float() on a one-element Series is deprecated in pandas and emits a warning.
gold_row = stock_gold_standard.iloc[0]

print("Price: ", float(gold_row["Cotação"]))
print("Number Stocks: ", float(gold_row["Nro. Ações"]))

headers = ["Termo", "Gold", "Pred"]
# An indicator missing from the run's output is shown as a prediction of 0.
results = [
    [term, float(gold_row[term]), indicators.get(term, 0)] for term in terms
]

print(tabulate(results, headers=headers))

Price:  4.67
Number Stocks:  372555000.0
Termo                               Gold          Pred
--------------------------  ------------  ------------
Ativo                        2.53437e+10   2.53437e+10
Disponibilidades             3.08952e+09   3.08952e+09
Ativo Circulante             7.15898e+09   7.15898e+09
Dív. Bruta                   9.50584e+09   9.50584e+09
Dív. Líquida                 6.41632e+09   6.41632e+09
Patrim. Líq                  5.21846e+09   5.21846e+09
Receita Líquida (12 Meses)   1.48763e+10   1.48763e+10
EBIT (12 Meses)              1.40472e+09   1.17706e+09
Lucro Líquido (12 Meses)     1.64378e+09   1.64378e+09
P/L                          1.06          1.05844
P/VP                         0.33          0.3334
P/EBIT                       1.24          1.47812
PSR                          0.12          0.116953
P/Ativos                     0.07          0.0686496
P/Cap. Giro                  0.82          0.818843
P/Ativ Circ Liq             -0.13          0.

  print("Price: ", float(stock_gold_standard["Cotação"]))
  print("Number Stocks: ", float(stock_gold_standard["Nro. Ações"]))
  gold_term = float(stock_gold_standard[term])
