In [1]:
# import required libraries
import logging
import os
import time
from dataclasses import asdict, dataclass
from pathlib import Path

import polars as pl
import torch
from datasets import Dataset, load_from_disk
from polars import col as c
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, TextStreamer, TrainingArguments)
from trl import SFTTrainer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only

# set working directory
os.chdir('/home/yuzhu/synology/projects/Call/call/code/v4/reproduce-finetune')

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Hyperparameters

In [2]:
finetune_or_inference="finetune"
unsloth=True
model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"  # "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
max_seq_length=15000
max_new_tokens=1000
chat_template="llama-3.1"  # "mistral" or "llama-3.1"
train_data_path="data/train_data.parquet"
test_data_path="data/test_data.parquet"
saved_model_name="saved_model/mistral"

## Prepare Data

Import data for training

In [3]:
# read the huggingface dataset
train_data = pl.read_parquet(train_data_path)
test_data = pl.read_parquet(test_data_path)
logging.info(f"Read data from {train_data_path} and {test_data_path}")

Next we print the first row of the training and testing dataset. Note that the `rank` column is the target variable. It's a number between 1 and 5.

Also note that we create `rank` by classifying the PEAD (measured as cumulative abnormal returns, which is the `car_c5_call_0_21` column) into **equal-sized** quintiles.

The `docid` column is the unique identifier for each earnings call.

In [4]:
# print first row of train_data
train_data.head(1)

# print first row of test_data
# test_data.head(1)

docid,transcriptid,gvkey,rdq,is_sp500,is_r2k,car_c5_call_0_21,car_c5_call_0_0,car_c5_call_0_3,fund_0_90,inst_tr2_0_90,revision_scaled_by_price_90,demand_retail_3,sue3,vol_call_m21_m1,mcap,bm,roa,debt_assets,medest,numest,stdest,turnover_ma21,volume_ma21,rank,text
str,i64,str,date,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,cat,str
"""001004-2020-12-17""",2164018,"""001004""",2020-12-17,False,True,-0.027868,0.023304,0.039343,294.885986,566.546095,0.011817,0.612524,0.004582,0.036532,6.908989,1.250186,0.071822,0.505965,0.18,5.0,0.046693,0.009553,337234.428571,"""2""","""[Management Discussion]: """"""Be…"


Construct prompts using the training and testing data. The output consists of two huggingface datasets: `train_dataset` and `test_dataset`.

In [5]:
# get tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = get_chat_template(
    tokenizer,
    chat_template=chat_template,
    map_eos_token=True,  # e.g., maps <|im_end|> to </s> instead
)


# construct the prompt for training data
train_dataset = []
for row in train_data.iter_rows(named=True):
    # define system
    user_content = f"""You are a financial analyst. You will be given an earnings call transcript of a company and a few financial ratios. Your task is to predict whether the earnings call will have positive or negative impact on the future stock return. Please answer by typing a score between 1 and 5, where 1 is the least positive and 5 is the most positive. Your answer must start with "Score:". Please do not concentrate your predictions on the same score, i.e., the number of stocks falling to each score should be balanced. 
The earnings call may contain three parts: Management Discussion, Questions from Analysts, and Answers from Management. The "Management Discussion" section is a statement from the management (usually CEOs and CFOs) about the past performance and future prospects of the company. The "Questions from Analysts part" is a question from financial analysts and the "Answers from Management part" is the response from the management. There may be multiple rounds of questions and answers in a call. Please also note that the management usually uses very positive language when discussing their company, but you should not take it as granted. Pay attention to the questions from the analysts. 

Now the earnings call transcript begins:
{row["text"]}

Now the financial ratios begin:
- Earnings surprise (normalized by stock price): {row["sue3"]}
- Return volatility in the past month: {row["vol_call_m21_m1"]}
- Market capitalization (log-transformed): {row["mcap"]}
- Book-to-market ratio: {row["bm"]}
- Return-on-assets: {row["roa"]}
- Debt-to-assets: {row["debt_assets"]}
- Median earnings forecast: {row["medest"]}
- Number analysts forecast: {row["numest"]}
- Standard deviation of earnings forecast: {row["stdest"]}
- Turnover in the past month: {row["turnover_ma21"]}
- Trading volume in the past month: {row["volume_ma21"]}"""

    # define user and assistant messages
    user = {
        "docid": row["docid"],
        "role": "user",
        "content": user_content.strip(),
    }
    assistant = {
        "docid": row["docid"],
        "role": "assistant",
        "content": f'Score:{row["rank"].strip()}',
    }

    # create prompt based on finetune_or_inference
    if finetune_or_inference == "finetune":
        train_dataset.append([user, assistant])
    elif finetune_or_inference == "inference":
        train_dataset.append([user])

# convert to HF Dataset
train_dataset = Dataset.from_dict({"chat": train_dataset})

# apply chat template
train_dataset = train_dataset.map(
    lambda x: {
        "prompt": tokenizer.apply_chat_template(
            x["chat"], tokenize=False, add_generation_prompt=False
        )
    },
    batched=True,
)
train_dataset_for_inference = train_dataset.map(
    lambda x: {
        "input_ids": tokenizer.apply_chat_template(
            x["chat"], tokenize=True, add_generation_prompt=True, return_tensors="np", max_length=max_seq_length
        )
    },
    batched=True,
)
train_dataset_for_inference.set_format(type="torch", columns=["input_ids"])

# Repeat the same process for test data
test_dataset = []
for row in test_data.iter_rows(named=True):
    # Same code as above for test data
    user_content = f"""You are a financial analyst. You will be given an earnings call transcript of a company and a few financial ratios. Your task is to predict whether the earnings call will have positive or negative impact on the future stock return. Please answer by typing a score between 1 and 5, where 1 is the least positive and 5 is the most positive. Your answer must start with "Score:". Please do not concentrate your predictions on the same score, i.e., the number of stocks falling to each score should be balanced. 
The earnings call may contain three parts: Management Discussion, Questions from Analysts, and Answers from Management. The "Management Discussion" section is a statement from the management (usually CEOs and CFOs) about the past performance and future prospects of the company. The "Questions from Analysts part" is a question from financial analysts and the "Answers from Management part" is the response from the management. There may be multiple rounds of questions and answers in a call. Please also note that the management usually uses very positive language when discussing their company, but you should not take it as granted. Pay attention to the questions from the analysts. 

Now the earnings call transcript begins:
{row["text"]}

Now the financial ratios begin:
- Earnings surprise (normalized by stock price): {row["sue3"]}
- Return volatility in the past month: {row["vol_call_m21_m1"]}
- Market capitalization (log-transformed): {row["mcap"]}
- Book-to-market ratio: {row["bm"]}
- Return-on-assets: {row["roa"]}
- Debt-to-assets: {row["debt_assets"]}
- Median earnings forecast: {row["medest"]}
- Number analysts forecast: {row["numest"]}
- Standard deviation of earnings forecast: {row["stdest"]}
- Turnover in the past month: {row["turnover_ma21"]}
- Trading volume in the past month: {row["volume_ma21"]}"""

    user = {
        "docid": row["docid"],
        "role": "user",
        "content": user_content,
    }
    assistant = {
        "docid": row["docid"],
        "role": "assistant",
        "content": f'Score:{row["rank"]}',
    }

    if finetune_or_inference == "finetune":
        test_dataset.append([user, assistant])
    elif finetune_or_inference == "inference":
        test_dataset.append([user])

test_dataset = Dataset.from_dict({"chat": test_dataset})

test_dataset = test_dataset.map(
    lambda x: {
        "prompt": tokenizer.apply_chat_template(
            x["chat"], tokenize=False, add_generation_prompt=False
        )
    },
    batched=True,
)

test_dataset_for_inference = test_dataset.map(
    lambda x: {
        "input_ids": tokenizer.apply_chat_template(x["chat"], tokenize=True, add_generation_prompt=True, return_tensors="np", max_length=max_seq_length)
    },
    batched=True,
)
test_dataset_for_inference.set_format(type="torch", columns=["input_ids"])


Map:   0%|          | 0/6673 [00:00<?, ? examples/s]

Map:   0%|          | 0/6673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1198 [00:00<?, ? examples/s]

Map:   0%|          | 0/1198 [00:00<?, ? examples/s]

## Prepare Model and Tokenizer

Initial model and tokenizer

In [6]:
# Get model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=torch.bfloat16,  # None for autodetect
    load_in_4bit=True,
)

# add LoRA adapter
model = FastLanguageModel.get_peft_model(
    model,
    r=8,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        # "k_proj",
        "v_proj",
        # "o_proj",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=985,
    use_rslora=True,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)


==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA RTX 6000 Ada Generation. Max memory: 47.493 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.11.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Train

In [11]:
# init trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    dataset_num_proc=16,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        # warmup_steps=10,
        warmup_ratio=0.1,
        max_steps=25,  # set to -1 for full training
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=False,
        bf16=True,
        logging_steps=1,
        eval_steps=50,
        optim="adamw_8bit",  # adamw_8bit
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=985,
        output_dir=f"saved_model/{chat_template}",
        save_strategy="no",
        overwrite_output_dir=True,
    ),
)

Map (num_proc=16):   0%|          | 0/6673 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/1198 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [12]:
# (important) only train on assistant responses!
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/6673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1198 [00:00<?, ? examples/s]

In [9]:
# # only for debugging
# tokenizer.decode(trainer.train_dataset[1]["input_ids"])
# space = tokenizer(" ", add_special_tokens = False).input_ids[0]
# masked = tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[1]["labels"]])
# masked

In [13]:
# train the model
trainer_stats = trainer.train()

# save the model
model.save_pretrained_merged(
    saved_model_name,
    tokenizer,
    save_method="lora",
)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,673 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 25
 "-____-"     Number of trainable parameters = 3,407,872


Step,Training Loss
1,2.2745
2,2.3649
3,2.2873
4,2.2518
5,2.2868
6,2.2676
7,2.2945
8,2.2816
9,2.2499
10,2.2178


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.


## Inference

In [49]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# if not using streamer
results = []
for call, input_ids in tqdm(
    zip(test_dataset_for_inference["chat"], test_dataset_for_inference["input_ids"]), total=len(test_dataset_for_inference)
):

    docid = call[0]["docid"]
    prompt = call[0]["content"]
    input_length = input_ids.shape[0]  # input_ids is a 1D tensor

    # debug only
    # if not (input_length >= 5000):
    #     logging.info(f"Skipping \"{docid}\" ({input_length} tokens)")
    #     continue

    generated_ids = model.generate(
        input_ids=input_ids.unsqueeze(0).to(model.device),
        max_new_tokens=max_new_tokens,
        use_cache=True,
        do_sample=True,
    )
    generated_text = tokenizer.batch_decode(
        generated_ids[:, input_length:], skip_special_tokens=True
    )[0]

    # every output saved as a single file
    out = {"docid": docid, "prompt": prompt, "response": generated_text}
    out = pl.DataFrame(out)
    results.append(out)


# at the end, read all the files and save as a feather file
results = pl.concat(results, how='vertical')

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linea

  0%|          | 0/1198 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  3%|▎         | 34/1198 [07:11<5:10:45, 16.02s/it]

In [97]:
# extrat predictions
preds = (results
    .select(c.docid, pred=c.response.str.extract(r"(?i)Score:?\s*(\d)").cast(pl.Int64))
    .filter(c.pred.is_not_null()))

# merge preds with ground truth
merged = (test_data.join(preds, on="docid", how="inner")
    .select(c.docid, c.pred, target=c.rank.cast(pl.Int64)))

t = merged['target'].to_list()
y = merged['pred'].to_list()

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Calculate overall accuracy
accuracy = accuracy_score(t, y)

# Calculate per-class precision, recall, and f1
# average=None means it will return scores for each class
precision, recall, f1, support = precision_recall_fscore_support(t, y, average=None, labels=[1, 2, 3, 4, 5])

# Print results
print(f"Overall accuracy: {accuracy:.3f}")
print("\nPer-class metrics:")
for i in range(5):
    print(f"\nClass {i+1}:")
    print(f"Precision: {precision[i]:.3f}")
    print(f"Recall: {recall[i]:.3f}")
    print(f"F1-score: {f1[i]:.3f}")
    print(f"Support: {support[i]}")

Overall accuracy: 0.000

Per-class metrics:

Class 1:
Precision: 0.000
Recall: 0.000
F1-score: 0.000
Support: 2.0

Class 2:
Precision: 0.000
Recall: 0.000
F1-score: 0.000
Support: 0.0

Class 3:
Precision: 0.000
Recall: 0.000
F1-score: 0.000
Support: 0.0

Class 4:
Precision: 0.000
Recall: 0.000
F1-score: 0.000
Support: 0.0

Class 5:
Precision: 0.000
Recall: 0.000
F1-score: 0.000
Support: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
