In [None]:
# !pip freeze

In [None]:
# !pip freeze > requirements.txt

In [1]:
!pip install datasets peft

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os

import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel



  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "ChanceFocus/finma-7b-nlp"

# The slow (non-"fast") tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)

# The causal-LM weights are loaded in half precision (float16).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [26]:
# LoRA adapter settings: rank-8 adapters (alpha 32, dropout 0.1) on the
# attention query/key/value projections of a causal language model.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj"],
)

# Attach the adapters exactly once. Re-running this cell used to pass an
# already wrapped model back into get_peft_model(), stacking a second wrapper
# on top of the first one.
if isinstance(model, PeftModel):
    print(
        "Model already carries PEFT adapters; reload the base model "
        "to apply a different LoRA configuration."
    )
else:
    model = get_peft_model(model, peft_config)

# The base weights are float16 and training uses fp16 mixed precision.
# Depending on the installed PEFT version the adapter weights may inherit
# float16, in which case the gradient scaler fails with
# "Attempting to unscale FP16 gradients". Keeping every trainable parameter
# in float32 avoids that and is a no-op when they are float32 already.
for param in model.parameters():
    if param.requires_grad and param.dtype != torch.float32:
        param.data = param.data.to(torch.float32)

# Sanity check: only the LoRA weights should be reported as trainable.
model.print_trainable_parameters()

# Location of the prompt/output CSV. The default is the original local path;
# set the NEWS_CSV_PATH environment variable to run the notebook elsewhere
# without editing this cell.
csv_path = os.environ.get(
    "NEWS_CSV_PATH",
    "/home/it/Desktop/RUN/A/Formatted_news_input_LLM.csv",
)
df = pd.read_csv(csv_path)

# The preprocessing step reads exactly these two columns; fail early with a
# clear message instead of a KeyError deep inside Dataset.map().
required_columns = ["prompt", "output"]
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"{csv_path} is missing required column(s): {missing_columns}")

# Empty CSV cells are read as NaN and would be formatted as the literal text
# "nan" in the training prompt, so rows lacking either field are left out.
# `df` itself is kept untouched; preserve_index=False stops the filtered
# (non-contiguous) index from being added as an extra dataset column.
dataset = Dataset.from_pandas(
    df.dropna(subset=required_columns),
    preserve_index=False,
)



In [55]:
# Batched preprocessing for Dataset.map(): builds the training text and labels.
def preprocess_function(examples, tok=None, max_length=512):
    """Tokenize a batch of prompt/output pairs for causal-LM fine-tuning.

    Args:
        examples: Batch mapping with parallel "prompt" and "output" sequences.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with ``input_ids`` clamped to the tokenizer's
        base vocabulary and a ``labels`` entry added. Labels equal the input
        IDs on real tokens and -100 on padding positions.
    """
    tok = tokenizer if tok is None else tok

    texts = [
        f"Prompt: {prompt}\nOutput: {output}"
        for prompt, output in zip(examples["prompt"], examples["output"])
    ]

    # Tokenize with truncation and fixed-length padding.
    model_inputs = tok(texts, max_length=max_length, truncation=True, padding="max_length")

    # Any ID at or above the tokenizer's vocab_size is replaced by the
    # unknown-token ID.
    vocab_limit = tok.vocab_size
    unk_id = tok.unk_token_id
    model_inputs["input_ids"] = [
        [token_id if token_id < vocab_limit else unk_id for token_id in ids]
        for ids in model_inputs["input_ids"]
    ]

    # Labels mirror the inputs, except on padding (attention_mask == 0) where
    # they are set to -100, the index the loss ignores. Copying input_ids
    # verbatim made the model train on predicting padding tokens.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Tokenize every row (dropping the raw text columns) and hold out 10% for
# evaluation. The split is seeded so that a fresh "Restart & Run All" yields
# the same train/test partition, and the name is bound once to the final
# train/test dictionary, so the cell can be re-run safely.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
).train_test_split(test_size=0.1, seed=42)


Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map: 100%|██████████| 132/132 [00:00<00:00, 301.87 examples/s]


In [56]:
# Training configuration.
# The mapped dataset has only 132 rows (see the Map progress output above).
# With batch size 4 and 4 accumulation steps that is a couple of dozen
# optimizer steps over 3 epochs on one device, so the previous step intervals
# (logging every 100, eval/save every 500) never triggered: nothing was
# logged, evaluated or checkpointed, and load_best_model_at_end had no
# checkpoint to restore. Evaluation and saving now happen once per epoch
# (the two strategies must match for load_best_model_at_end) and the loss is
# logged every step.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the original spelling is kept to match the installed version.
training_args = TrainingArguments(
    output_dir="./finma-7b-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [57]:
# Build the Trainer from the configured arguments, the model, and the
# train/test partitions of the tokenized dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)



In [None]:
# Run fine-tuning.
trainer.train()

# Save the fine-tuned model and the tokenizer side by side. The directory is
# read from the Trainer's arguments rather than repeated as a literal, so it
# cannot drift away from the directory the checkpoints are written to.
output_dir = trainer.args.output_dir
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
