**setting up the directory**

In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the project folder is reachable.
# force_remount=True asks for a fresh mount even if Drive is already mounted in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/topic-modelling


/content/drive/MyDrive/topic-modelling


loading packages

In [3]:
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os




**Loading actual prices**

- btc

In [4]:
# Daily BTC prices (path is relative to the working directory set above).
# NOTE(review): `performance by day` looks like the % change in `price` vs. the previous row
# (empty for the first row) — confirm against the script that produced btc_parsed.csv.
btc_prices = pd.read_csv('price_datasets/btc_parsed.csv')
btc_prices

Unnamed: 0,date,datetime_iso,timezone,symbol,price,performance by day
0,2025-08-19,2025-08-19T00:00:00+00:00,UTC,BTCUSDT,116209.95,
1,2025-08-20,2025-08-20T00:00:00+00:00,UTC,BTCUSDT,112949.21,-2.8059
2,2025-08-21,2025-08-21T00:00:00+00:00,UTC,BTCUSDT,114210.36,1.11656
3,2025-08-22,2025-08-22T00:00:00+00:00,UTC,BTCUSDT,112548.06,-1.45547
4,2025-08-23,2025-08-23T00:00:00+00:00,UTC,BTCUSDT,116928.37,3.89195
5,2025-08-24,2025-08-24T00:00:00+00:00,UTC,BTCUSDT,115400.01,-1.30709
6,2025-08-25,2025-08-25T00:00:00+00:00,UTC,BTCUSDT,113459.47,-1.68158
7,2025-08-26,2025-08-26T00:00:00+00:00,UTC,BTCUSDT,109973.76,-3.07221
8,2025-08-27,2025-08-27T00:00:00+00:00,UTC,BTCUSDT,111731.96,1.59875
9,2025-08-28,2025-08-28T00:00:00+00:00,UTC,BTCUSDT,111240.0,-0.4403


- eth

In [5]:
# Daily ETH prices (path is relative to the working directory set above).
# Same layout as the BTC file, except the change column is named `performance` here.
eth_prices = pd.read_csv('price_datasets/eth_parsed.csv')
eth_prices

Unnamed: 0,date,datetime_iso,timezone,symbol,price,performance
0,2025-08-19,2025-08-19T00:00:00+00:00,UTC,ETHUSDT,4309.58,
1,2025-08-20,2025-08-20T00:00:00+00:00,UTC,ETHUSDT,4082.59,-5.2671
2,2025-08-21,2025-08-21T00:00:00+00:00,UTC,ETHUSDT,4331.4,6.09442
3,2025-08-22,2025-08-22T00:00:00+00:00,UTC,ETHUSDT,4229.08,-2.36228
4,2025-08-23,2025-08-23T00:00:00+00:00,UTC,ETHUSDT,4829.52,14.19789
5,2025-08-24,2025-08-24T00:00:00+00:00,UTC,ETHUSDT,4781.57,-0.99285
6,2025-08-25,2025-08-25T00:00:00+00:00,UTC,ETHUSDT,4781.05,-0.01088
7,2025-08-26,2025-08-26T00:00:00+00:00,UTC,ETHUSDT,4368.97,-8.61903
8,2025-08-27,2025-08-27T00:00:00+00:00,UTC,ETHUSDT,4596.62,5.21061
9,2025-08-28,2025-08-28T00:00:00+00:00,UTC,ETHUSDT,4500.53,-2.09045


rename `performance` to `performance by day` for consistency

In [6]:
# Align the ETH column name with btc_prices, which already uses `performance by day`;
# that is also the default column name merge_signals_with_prices looks for.
eth_prices = eth_prices.rename(columns={"performance": "performance by day"})


**read all the `daily_signal` dfs we have generated**

In [7]:
# folder where the CSVs are stored
folder_path = "predicted_signals"

# read every CSV in the folder into a dict of DataFrames, keyed by file name without extension
dfs = {}
for file in os.listdir(folder_path):
    if not file.endswith(".csv"):
        continue

    df = pd.read_csv(os.path.join(folder_path, file))

    # the files were saved without index=False, so the old index can come back
    # as an "Unnamed: 0" column; errors="ignore" makes this a no-op when it is absent
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # splitext removes only the trailing extension (str.replace would also
    # strip any ".csv" that happens to appear earlier in the file name)
    dfs[os.path.splitext(file)[0]] = df


# aggregated daily signals, one frame per (asset, model); derived from x_signal_df.
# The matching "<asset>_<model>_signal_df" files are the full frames with ALL the
# comments/posts (individual post labels) and are not bound to names here.
btc_bart_daily = dfs["btc_bart_daily_signals"]
btc_distilbert_daily = dfs["btc_distilbert_daily_signals"]

eth_bart_daily = dfs["eth_bart_daily_signals"]
eth_distilbert_daily = dfs["eth_distilbert_daily_signals"]


a preview of what the `daily_signal` df looks like

In [8]:
# preview: the aggregated daily BTC signals loaded from btc_bart_daily_signals.csv
btc_bart_daily

Unnamed: 0,date,daily_signal,signal_strength,buy_ratio,sell_ratio,hold_ratio,total_posts,avg_confidence,total_score,model_used
0,2025-08-09,NEUTRAL,-0.0,-0.0,-0.0,,166,0.806705,1086.0,bart
1,2025-08-07,NEUTRAL,-0.0,-0.0,-0.0,,155,0.807233,859.0,bart
2,2025-08-12,NEUTRAL,-0.0,-0.0,-0.0,,234,0.794486,883.0,bart
3,2025-08-10,NEUTRAL,-0.0,-0.0,-0.0,,216,0.802003,1278.0,bart
4,2025-08-06,NEUTRAL,-0.0,,,,1336,0.806013,7760.0,bart
5,2025-08-13,NEUTRAL,-0.0,-0.0,-0.0,,232,0.789588,832.0,bart
6,2025-08-11,NEUTRAL,-0.0,-0.0,-0.0,,236,0.797907,1401.0,bart
7,2025-08-08,NEUTRAL,-0.0,-0.0,-0.0,,162,0.806438,970.0,bart
8,2025-08-24,NEUTRAL,-0.0,,-0.0,-0.0,143,0.80399,545.0,bart
9,2025-08-29,NEUTRAL,-0.0,-0.0,,,139,0.786081,608.0,bart


**merging actual daily `prices` df with our `daily_signal` df to compare**

- helper function

In [9]:

def merge_signals_with_prices(prices_df, signals_df,
                              price_comparison_col="performance by day",
                              signal_col="daily_signal"):
    """
    Put each day's price performance next to that day's signal.

    Every row of `signals_df` is kept (left join on 'date'); a signal whose
    date has no row in `prices_df` gets a missing value for the performance.
    Neither input frame is modified.

    Args:
        prices_df (pd.DataFrame): Price data; must contain 'date' and
            `price_comparison_col`.
        signals_df (pd.DataFrame): Signal data; must contain 'date' and
            `signal_col`.
        price_comparison_col (str): Column in `prices_df` holding the
            performance figure to compare against.
        signal_col (str): Column in `signals_df` holding the daily signal.

    Returns:
        pd.DataFrame: Columns ['date', signal_col, 'performance_by_day'],
        one row per signal row (given unique dates in `prices_df`).
    """
    # performance per date, exposed under a fixed output column name
    daily_performance = prices_df[['date', price_comparison_col]].rename(
        columns={price_comparison_col: "performance_by_day"}
    )

    # signal per date
    daily_signals = signals_df[['date', signal_col]]

    # signals on the left so that no signal row is dropped
    return daily_signals.merge(daily_performance, on="date", how="left")


In [10]:
# One merged frame per (asset, model): each daily signal next to that date's price performance.
# Uses the frames already bound by name when the signal CSVs were loaded, rather than
# looking them up in `dfs` again by string key.
btc_bart_merged = merge_signals_with_prices(btc_prices, btc_bart_daily)
btc_distilbert_merged = merge_signals_with_prices(btc_prices, btc_distilbert_daily)

eth_bart_merged = merge_signals_with_prices(eth_prices, eth_bart_daily)
eth_distilbert_merged = merge_signals_with_prices(eth_prices, eth_distilbert_daily)


**btc**

- `btc_bart`

In [11]:
# BTC / bart: daily signal next to the price performance for the same date
# (performance is missing wherever the price data has no row for that date)
btc_bart_merged

Unnamed: 0,date,daily_signal,performance_by_day
0,2025-08-09,NEUTRAL,
1,2025-08-07,NEUTRAL,
2,2025-08-12,NEUTRAL,
3,2025-08-10,NEUTRAL,
4,2025-08-06,NEUTRAL,
5,2025-08-13,NEUTRAL,
6,2025-08-11,NEUTRAL,
7,2025-08-08,NEUTRAL,
8,2025-08-24,NEUTRAL,-1.30709
9,2025-08-29,NEUTRAL,1.10389


- `btc_distilbert`

In [12]:
# BTC / distilbert: daily signal next to the price performance for the same date
# (performance is missing wherever the price data has no row for that date)
btc_distilbert_merged

Unnamed: 0,date,daily_signal,performance_by_day
0,2025-08-09,NEUTRAL,
1,2025-08-07,NEUTRAL,
2,2025-08-12,NEUTRAL,
3,2025-08-10,NEUTRAL,
4,2025-08-06,NEUTRAL,
5,2025-08-13,NEUTRAL,
6,2025-08-11,NEUTRAL,
7,2025-08-08,NEUTRAL,
8,2025-08-24,NEUTRAL,-1.30709
9,2025-08-29,NEUTRAL,1.10389


**eth**

- `eth_bart`

In [13]:
# ETH / bart: daily signal next to the price performance for the same date
# (performance is missing wherever the price data has no row for that date)
eth_bart_merged

Unnamed: 0,date,daily_signal,performance_by_day
0,2025-08-13,HOLD,
1,2025-08-06,HOLD,
2,2025-08-07,HOLD,
3,2025-08-11,HOLD,
4,2025-08-09,HOLD,
5,2025-08-08,HOLD,
6,2025-08-12,HOLD,
7,2025-08-10,NEUTRAL,
8,2025-08-30,HOLD,-3.38419
9,2025-09-02,HOLD,-1.79569


In [14]:
# ETH / distilbert: daily signal next to the price performance for the same date
# (performance is missing wherever the price data has no row for that date)
eth_distilbert_merged

Unnamed: 0,date,daily_signal,performance_by_day
0,2025-08-13,BUY,
1,2025-08-06,SELL,
2,2025-08-07,SELL,
3,2025-08-11,BUY,
4,2025-08-09,HOLD,
5,2025-08-08,SELL,
6,2025-08-12,BUY,
7,2025-08-10,NEUTRAL,
8,2025-08-30,BUY,-3.38419
9,2025-09-02,BUY,-1.79569


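^ I think distilbert is worse: in the ETH rows shown above that have price data (2025-08-30 and 2025-09-02), distilbert signalled BUY while the price fell (-3.38% and -1.80%), whereas bart signalled HOLD. Caveat: most signal dates shown have no matching price row, so this rests on very few comparable days.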