In [1]:
!pip install finnhub-python
!pip install datasets #IMO ignore pyarrow conflicts on Colab

Collecting finnhub-python
  Downloading finnhub_python-2.4.20-py3-none-any.whl.metadata (9.0 kB)
Downloading finnhub_python-2.4.20-py3-none-any.whl (11 kB)
Installing collected packages: finnhub-python
Successfully installed finnhub-python-2.4.20
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadi

In [2]:
#get some data

import finnhub
import pandas as pd

#collect
client = finnhub.Client(api_key='cqnr07hr01qo8864qbpgcqnr07hr01qo8864qbq0')
df = pd.DataFrame()
for ticker in ['AAPL', 'MSFT']:
    data = client.company_news(ticker, _from='2024-01-01', to='2024-01-02') #YYYY-MM-DD
    df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

#clean
df = df[['related', 'headline', 'summary']].rename(columns={'related': 'ticker', 'summary': 'preview'})
df['sentiment'] = 'Positive' #just for debugging
df

Unnamed: 0,ticker,headline,preview,sentiment
0,AAPL,Apple Downgrades Are Piling Up. The Latest One...,Apple stock registered one of its biggest dai...,Positive
1,AAPL,Apple stock down 3.6% as Barclays says iPhone ...,Barclays analyst Tim Long says Apple's iPhone ...,Positive
2,AAPL,"Apple, Chevron, Citigroup: Trending Tickers",Apple (AAPL) shares slid after Barclays downgr...,Positive
3,AAPL,Map: The 22 states where the minimum wage rose...,The federal minimum wage in the US hasn’t chan...,Positive
4,AAPL,Apple shares slide following Barclays downgrad...,Apple stock fell on the first trading day of t...,Positive
...,...,...,...,...
166,MSFT,Top 5 Stocks To Consider Avoiding As We Begin ...,S&P 500 outperformed Wall Street strategists' ...,Positive
167,MSFT,"Wedbush's Dan Ives Says, 'Tech Stocks Will Be ...",Looking for stock market analysis and research...,Positive
168,MSFT,Reflections On 2023: Blended 98.4% Total Return,"2023 was an amazing year for stock pickers, wi...",Positive
169,MSFT,The Zen Ten - My Top Picks For 2024,Iâve been publishing the Zen Ten list each D...,Positive


In [3]:
#preprocess data

from huggingface_hub import login
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset

def format(row, eval=False): #as instruct for simplicity (https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1)
    return {'text': f"""<|start_header_id|>user<|end_header_id|>
Given the headline and preview of a financial news article, classify the sentiment toward the provided ticker symbol. Respond only with "Positive", "Negative" or "Neutral".
* Ticker: {row['ticker']}
* Headline: {row['headline']}
* Preview: {row['preview']}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
{'' if eval else row['sentiment']+'<|eot_id|>'}"""} #no indentation

def encode(batch, tokenizer):
    return tokenizer(batch['text'], return_tensors='np') #https://huggingface.co/docs/datasets/nlp_process#map

login('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') #https://huggingface.co/docs/hub/en/models-gated#download-files

#tokenizer 1 (for fine-tuning)
model_name_1 = 'meta-llama/Meta-Llama-3.1-8B'
tokenizer_1 = AutoTokenizer.from_pretrained(
    model_name_1,
    pad_token='<|finetune_right_pad_id|>',
    padding_side='left') #https://huggingface.co/docs/transformers/llm_tutorial#wrong-padding-side)

#tokenizer 2 (for comparison)
model_name_2 = 'meta-llama/Meta-Llama-3.1-8B-instruct'
tokenizer_2 = AutoTokenizer.from_pretrained(
    model_name_2,
    pad_token='<|finetune_right_pad_id|>',
    padding_side='left')

#train set
train_set, test_set = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42).values()
train_set = train_set.map(format).remove_columns(['ticker', 'headline', 'preview', 'sentiment'])
train_set = train_set.map(lambda batch: encode(batch, tokenizer_1), batched=True).remove_columns(['text'])
train_set, val_set = train_set.train_test_split(test_size=0.1, seed=42).values()

#test sets
test_set = test_set.map(lambda row: format(row, eval=True)).remove_columns(['ticker', 'headline', 'preview'])
test_set_1 = test_set.map(lambda batch: encode(batch, tokenizer_1), batched=True).remove_columns(['text'])
test_set_2 = test_set.map(lambda batch: encode(batch, tokenizer_2), batched=True).remove_columns(['text'])

#show
data_collator_1 = DataCollatorWithPadding(tokenizer_1)
data_collator_2 = DataCollatorWithPadding(tokenizer_2)
print(data_collator_1(train_set[:2])) #dynamic padding applying tokenizer.pad
print(data_collator_2(test_set_2.remove_columns('sentiment')[:2]))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

{'input_ids': tensor([[128000, 128006,    882, 128007,    198,  22818,    279,  32263,    323,
          17562,    315,    264,   6020,   3754,   4652,     11,  49229,    279,
          27065,   9017,    279,   3984,  48087,   7891,     13,  40633,   1193,
            449,    330,  36590,    498,    330,  39589,      1,    477,    330,
          88007,  23811,      9,    350,   5327,     25,  75852,     43,    198,
              9,  11452,   1074,     25,   7054,    220,     20,  80336,   2057,
          21829,  35106,    287,   1666,   1226,  19110,    220,   2366,     19,
            198,      9,  32341,     25,    328,  43945,    220,   2636,    704,
            716,  10365,   9935,   6825,   5388,   1705,      6,  11984,    819,
          63903,    537, 124085,     11,  25270,    279,   1060,    709,    220,
            914,  14697,   7531,    704,   1948,    220,     20,  23301,    311,
           5766,  12096,    520,   1510,   5990,     13, 128009,    198, 128006,
          7819

In [6]:
#download model to fine-tune

from transformers import AutoModelForCausalLM

model_1 = AutoModelForCausalLM.from_pretrained(model_name_1, device_map='auto')
model_1.hf_device_map #https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]



{'model.embed_tokens': 'cpu',
 'model.layers.0': 'cpu',
 'model.layers.1': 'cpu',
 'model.layers.2': 'cpu',
 'model.layers.3': 'cpu',
 'model.layers.4': 'cpu',
 'model.layers.5': 'cpu',
 'model.layers.6': 'cpu',
 'model.layers.7': 'cpu',
 'model.layers.8': 'cpu',
 'model.layers.9': 'cpu',
 'model.layers.10': 'cpu',
 'model.layers.11': 'cpu',
 'model.layers.12': 'cpu',
 'model.layers.13': 'cpu',
 'model.layers.14': 'cpu',
 'model.layers.15': 'cpu',
 'model.layers.16': 'cpu',
 'model.layers.17': 'cpu',
 'model.layers.18': 'cpu',
 'model.layers.19': 'cpu',
 'model.layers.20': 'disk',
 'model.layers.21': 'disk',
 'model.layers.22': 'disk',
 'model.layers.23': 'disk',
 'model.layers.24': 'disk',
 'model.layers.25': 'disk',
 'model.layers.26': 'disk',
 'model.layers.27': 'disk',
 'model.layers.28': 'disk',
 'model.layers.29': 'disk',
 'model.layers.30': 'disk',
 'model.layers.31': 'disk',
 'model.norm': 'disk',
 'model.rotary_emb': 'disk',
 'lm_head': 'disk'}

In [8]:
#fine-tune

#TODO

from transformers import TrainingArguments, Trainer
#from trl import SFTTrainer

train_set = train_set.select(range(2)) #just for debugging faster
val_set = val_set.select(range(2)) #just for debugging faster

args = TrainingArguments(
    output_dir='hf_checkpoints/')

trainer = Trainer(
    model=model_1,
    args=args,
    data_collator=data_collator_1,
    train_dataset=train_set,
    eval_dataset=val_set)

trainer.train()

#TODO SFTT (https://huggingface.co/docs/trl/v0.9.6/en/sft_trainer#trl.SFTTrainer) instead? Ne parlano qua (https://huggingface.co/docs/transformers/trainer#trainer)

#TODO model doesn't fit (offloading with device_map='auto' only works for inference) so cannot debug on Colab (TPU runtime should have enough RAM by they crash dunno why)



RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [6]:
#evaluate on test set

from math import ceil
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay

#TODO

In [5]:
#download model for comparison

model_2 = AutoModelForCausalLM.from_pretrained(model_name_2, device_map='auto')
model_2.hf_device_map

NameError: name 'AutoModelForCausalLM' is not defined

In [8]:
#compare on test set

test_set_2 = test_set_2.select(range(5)) #just for debugging faster
batch_size = 2

#generate
prompts = test_set_2.remove_columns('sentiment')
generated_2 = []
for i in range(ceil(len(test_set_2)/batch_size)):
    batch = data_collator_2(prompts[i*batch_size:(i+1)*batch_size]) #.to('cuda')
    generated_ids = model_2.generate( #https://huggingface.co/docs/transformers/en/main_classes/text_generation
        **batch,
        pad_token_id=tokenizer_2.pad_token_id, #avoids warning
        max_new_tokens=1)
    generated_2 = generated_2 + tokenizer_2.batch_decode(generated_ids[:,-1])

#scores
print('Accuracy:', accuracy_score(test_set_2['sentiment'], generated_2))
print('F1_score:', accuracy_score(test_set_2['sentiment'], generated_2))
print('Confusion matrix:')
_ = ConfusionMatrixDisplay.from_predictions(test_set_2['sentiment'], generated_2)

['Neutral', 'Neutral', 'Negative', 'Negative', 'Neutral']


## TODO random notes

* [Trainer è astrazione di training loop con feature aggiuntive tipo distributed training](https://huggingface.co/docs/transformers/quicktour#trainer---a-pytorch-optimized-training-loop)
* [concat train and val set after finetuning](https://huggingface.co/docs/datasets/process#concatenate)
* su colab con runtime tpu ci sarebbe tanta ram per fittare l'8B ma boh crasha sempre. E gpu fa male calcoli e funziona una volta e poi diventa full e da errore
* `map` non fa return 'pt' e servirebbe fare [così](https://discuss.huggingface.co/t/map-with-a-tokenizer-does-not-return-pytorch-tensors/51723) ma tanto mi fa return 'pt' data_collector
* [dynamic padding](https://huggingface.co/learn/nlp-course/en/chapter3/2#dynamic-padding)
* su trainer passare direttamente DataCollatorWithPadding(tokenizer) è equivalente a passare solo tokenizer. imo meglio passare data_collator così più esplicito e in linea con [quick tour doc](https://huggingface.co/docs/transformers/quicktour)
* [tutorial da repo fingpt](https://ai4finance-foundation.medium.com/beginners-guide-to-fingpt-training-with-lora-chatglm2-6b-9eb5ace7fe99)
* check how a model will be distributed with `device_map='auto'`:
  ```python
  from accelerate import init_empty_weights, infer_auto_device_map
  from transformers import AutoModelForCausalLM
  with init_empty_weights():
      empty_model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B-instruct')
  print(infer_auto_device_map(empty_model))
  ```
* device_map='auto' distribuisce modello come meglio può (anche su disco se non c'è alternativa migliore) ma solo su singolo nodo
  * se non possiamo usare dgx serve far fittare 405 su singolo epyc ma scarica parte su disco (magari ssd di fast) ma quindi molto lento e a quel punto se sta troppo usa llama 70b
  * con nodi multipli sembra più difficile e non abbiamo molto tempo. O chiedi a gabriel come aveva usato DDP? O prova [accelerate](https://huggingface.co/docs/transformers/accelerate)
* Se vengono troppi neutral con labeling o non vediamo migliorie dopo finetuning rispetto instruct, possiamo labellare con 5 sentiment (aggiungi leggermente positivo e leggermente negativo), ma poi nel dataset metti tutti i leggermente come positivo o negativo e lo usi così per fine tuning.
* device_map='auto' may [understimate space needed in CPU](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#limits-and-further-development)
* loading distributed model with device_map='auto' only works for [inference](https://huggingface.co/docs/accelerate/quicktour#big-model-inference). You should use accelerate for training (it's generalization of a lot of methods: ddp, fsdp, deepspeed, ecc.). But since our model fit in single GPU there is no need for it. Maybe just ddp to train faster copying the model over multiple GPUs)
* Maybe you can integrate accelerate with Trainer by subclassing as [here](https://huggingface.co/docs/transformers/trainer#customize-the-trainer) but actually not needed because Trainer has already it's own integration (you just need to provide correct config file, see Transformers doc for each acceleration method)
* Examples of accelerate with [slurm](https://github.com/huggingface/accelerate/tree/main/examples/slurm)
* accelerate should be a wrapper around launchers, (eg. torchrun, mpirun, etc. but probably not srun). [Here](https://github.com/huggingface/accelerate/tree/main/examples/slurm) example of using MPI directly instead (for multi CPU, not CUDA in this case)
