In [15]:
import pandas as pd
import json
from datetime import timedelta

In [16]:
# Base ML dataset that the news sentiment features will be joined onto.
master = pd.read_parquet("master_ml_dataset.parquet")

# Tolerant JSONL load: blank lines are skipped and lines that are not valid
# JSON are reported (first 100 characters) and ignored instead of aborting.
news_records = []
with open("data/output/investing_news_structured.jsonl", "r", encoding="utf-8") as f:
    for raw_line in f:
        payload = raw_line.strip()
        if payload:
            try:
                record = json.loads(payload)
            except json.JSONDecodeError:
                # Show only the head of the offending line to keep the log short.
                print("Linha inválida ignorada:", payload[:100], "...")
            else:
                news_records.append(record)

# One row per successfully parsed JSON line.
news = pd.DataFrame(news_records)
print(f"Linhas válidas carregadas: {len(news)}")


Linha inválida ignorada: {"raw": {"id": "2908f666-1901-592c-b10c-60f28d574f4f", "datetime": "2021-01-15T21:08:45-03:00", "sou ...
Linhas válidas carregadas: 100309


In [17]:
# Lookup table from the ticker symbol found in a news record to the ticker
# stored in the `ticker_br` column. Several spellings of the same symbol may
# point to one target (e.g. the Alphabet and Berkshire variants below).
# NOTE(review): the "...34" targets look like B3 BDR codes — confirm.
MANUAL_NEWS_ALIASES = {
    # Alphabet variants
    "GOOGL": "GOGL34",
    "GOOG": "GOGL34",
    "GOGL35": "GOGL34",
    "GOGL34": "GOGL34",
    # Berkshire Hathaway class B variants
    "BRK.B": "BERK34",
    "BRKB": "BERK34",
    "BRK-B": "BERK34",
    "BRK B": "BERK34",
    # One more Alphabet spelling (kept in its original position)
    "GOOG34": "GOGL34",
    # One-to-one symbols
    "CRM": "SSFO34",
    "TSLA": "TSLA34",
    "META": "M1TA34",
    "NVDA": "NVDC34",
    "AMZN": "AMZO34",
    "AAPL": "AAPL34",
    "MSFT": "MSFT34",
    "JD": "JDCO34",
    "TSM": "TSMC34",
    "AVGO": "AVGO34",
    "BABA": "BABA34",
    "LLY": "LILY34",
    "V": "VISA34",
    "JPM": "JPMC34",
    "XOM": "EXXO34",
    "JNJ": "JNJB34",
    "MA": "MSCD34",
    "PG": "PGCO34",
    "COST": "COWC34",
    "BAC": "BOAC34",
    "NFLX": "NFLX34",
    "AMD": "A1MD34",
    "KO": "COCA34",
    "PEP": "PEPB34",
    "WMT": "WALM34",
    "MCD": "MCDC34",
    "DIS": "DISB34",
    "CAT": "CATP34",
    "INTC": "ITLC34",
    "CSCO": "CSCO34",
    "ORCL": "ORCL34",
    "ADBE": "ADBE34",
    "NKE": "NIKE34",
    "SBUX": "SBUB34",
    "BA": "BOEI34",
    "GS": "GSGI34",
    "MS": "MSBR34",
    "F": "FDMO34",
    "GM": "GMCO34",
    "PFE": "PFIZ34",
    "CVX": "CHVX34",
    "PYPL": "PYPL34",
    "COIN": "C2OI34",
    "UBER": "U1BE34",
    "ABNB": "A1BN34",
}

In [18]:
# Flatten the nested "raw" payload of every news record into columns.
# Records whose "raw" field is not a dict cannot be flattened, so they are
# skipped here instead of aborting the whole cell.
has_raw = pd.Series(
    [isinstance(raw, dict) for raw in news['raw']],
    index=news.index,
    dtype=bool,
)
news_valid = news.loc[has_raw].reset_index(drop=True)
# Build from a plain list so rows line up positionally with `news_valid`.
news_df = pd.json_normalize(news_valid['raw'].tolist())

# Pull the overall sentiment label out of "structured_event". A record whose
# event is null / not a dict, or that lacks the key, yields None rather than
# raising TypeError/KeyError and killing the run over one incomplete record.
news_df['sentimento_geral'] = [
    event.get('sentimento_geral') if isinstance(event, dict) else None
    for event in news_valid['structured_event']
]

# Parse timestamps; values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): if the input ever carries more than one UTC offset, to_datetime
# without utc=True may not return a datetime64 column and a later `.dt`
# access would fail — confirm all records share one offset.
news_df['datetime'] = pd.to_datetime(news_df['datetime'], format='mixed', errors='coerce')

# Translate the news ticker through the alias table; unknown tickers become NaN.
news_df['ticker_br'] = news_df['ticker'].map(MANUAL_NEWS_ALIASES)

# Keep only rows with a recognised ticker. The explicit copy gives later cells
# an independent frame to add columns to.
news_df = news_df.dropna(subset=['ticker_br']).copy()

In [19]:
# Numeric encoding of the sentiment label: positive = 1, neutral = 0,
# negative = -1.
sent_map = {
    'positivo': 1,
    'neutro': 0,
    'negativo': -1,
}

# Labels outside the table (including missing ones) map to NaN and are then
# scored as 0, i.e. the same value as 'neutro'.
mapped_scores = news_df['sentimento_geral'].map(sent_map)
news_df['sent_score'] = mapped_scores.fillna(0)

In [20]:
# Calendar date of each news item, taken from its parsed timestamp.
news_df['date'] = news_df['datetime'].dt.date

# Mean sentiment score per (ticker, date) pair, exposed as `sent_diario`.
score_by_ticker_day = news_df.groupby(['ticker_br', 'date'])['sent_score'].mean()
daily_sentiment = score_by_ticker_day.reset_index().rename(
    columns={'sent_score': 'sent_diario'}
)

In [21]:
def _trailing_mean(daily_scores):
    """Mean over the current row and up to four preceding rows of one ticker."""
    return daily_scores.rolling(5, min_periods=1).mean()


# Order chronologically within each ticker so the rolling window looks back.
daily_sentiment = daily_sentiment.sort_values(['ticker_br', 'date'])

# NOTE(review): the window counts rows, not calendar days, so when a ticker
# has gaps between rows the window spans more than five days — confirm this
# is the intended meaning of "5d".
daily_sentiment = daily_sentiment.assign(
    sent_ult_5d=daily_sentiment.groupby('ticker_br')['sent_diario'].transform(_trailing_mean)
)

In [22]:

# Interpret the stored `datetime` values as epoch milliseconds.
master_timestamps = pd.to_datetime(master['datetime'], unit='ms')
master['datetime'] = master_timestamps
master['date'] = master_timestamps.dt.date

# The part of `pair` before the first underscore is the BR-side ticker
# (e.g. 'AAPL34_AAPL' -> 'AAPL34').
pair_parts = master['pair'].str.split('_')
master['ticker_br'] = pair_parts.str[0]

In [23]:
# Left join: every master row is kept; rows with no sentiment entry for the
# same (ticker_br, date) get NaN in the sentiment columns.
join_keys = ['ticker_br', 'date']
gold = master.merge(daily_sentiment, how='left', on=join_keys)

In [24]:
# Final feature names for the two sentiment columns.
sentiment_feature_names = {
    'sent_diario': 'sentimento_agregado_1d',
    'sent_ult_5d': 'sentimento_agregado_5d',
}
gold = gold.rename(columns=sentiment_feature_names)


In [25]:
# Write the enriched dataset to disk without the DataFrame index, then
# report where it went and its final (rows, columns) shape.
gold.to_parquet("gold_janus_dataset.parquet", index=False)
final_shape = gold.shape
print("Gold dataset salvo como gold_janus_dataset.parquet")
print("Shape final:", final_shape)

Gold dataset salvo como gold_janus_dataset.parquet
Shape final: (40249, 54)
