<a href="https://colab.research.google.com/github/SamiAdnanAhmed/FinGPT/blob/master/1_fine_tunning_lamma_model_yfinance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine Tunning Llama Model using Yfinance Data.

In [1]:
# mouting the drive
# connecting to drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# downloading all necessary pckg
!pip install huggingface_hub
!pip install datasets
!pip install transformers
!pip install loguru -qU
!pip install tokenizers
!pip install langchain -qU
!pip install bitsandbytes -qU
!pip install accelerate==0.30.0
!pip install peft==0.4.0
!pip install trl==0.4.7
!pip install guardrail-ml==0.0.12
!pip install flash-attn --no-build-isolation
!pip install -U FlagEmbedding
!pip install bert-score
!pip install duckduckgo_search
!pip install scikit-learn

Collecting trl==0.4.7
  Using cached trl-0.4.7-py3-none-any.whl (77 kB)
Installing collected packages: trl
  Attempting uninstall: trl
    Found existing installation: trl 0.8.6
    Uninstalling trl-0.8.6:
      Successfully uninstalled trl-0.8.6
Successfully installed trl-0.4.7


In [3]:
# importing the necessary library
import os
from glob import glob
import pandas as pd
import json
import time
import requests
import random
from loguru import logger
import re
import numpy as np
#from huggingface_hub import HfApi, HfFolder

In [4]:
from transformers import(AutoTokenizer,
                         AutoModelForMultipleChoice,
                         AutoModelForCausalLM,
                         AutoTokenizer,

                         GenerationConfig,
                         BitsAndBytesConfig,

                         pipeline,
                         Conversation,
                         logging,
                         )
from datasets import load_dataset
from tokenizers import Tokenizer

import warnings
warnings.filterwarnings("ignore")

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
# importing the library
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from duckduckgo_search import DDGS
import math
import re

In [6]:
# Step 1: Get the data from get_return() and format_news() function and store it in CSV format.

In [7]:
# For fine-tunning purposes we are going to extract stock info for TESLA, FACEBOOK, APPLE and MICROSOFT and stored it into CSV format for further usage.

In [8]:
def bin_mapping(ret):
    up_down = 'U' if ret >= 0 else 'D'
    integer = math.ceil(abs(100 * ret))
    return up_down + (str(integer) if integer <= 5 else '5+')

def map_bin_label(bin_lb):
    lb = bin_lb.replace('U', 'Up by ')
    lb = lb.replace('D', 'Down by ')
    lb = lb.replace('1', '0-1%')
    lb = lb.replace('2', '1-2%')
    lb = lb.replace('3', '2-3%')
    lb = lb.replace('4', '3-4%')
    if '+' in lb:
        lb = lb.replace('5+', 'More than 5%')
    else:
        lb = lb.replace('5', '4-5%')
    return lb

def get_returns(stock_symbol, start_date, end_date):
    stock_data = yf.download(stock_symbol, start_date, end_date)
    weekly_data = stock_data['Adj Close'].resample('W').ffill()
    weekly_returns = weekly_data.pct_change()[1:]
    weekly_start_prices = weekly_data[:-1]
    weekly_end_prices = weekly_data[1:]

    weekly_data = pd.DataFrame({
        'Start Date': weekly_start_prices.index,
        'Start Price': weekly_start_prices.values,
        'End Date': weekly_end_prices.index,
        'End Price': weekly_end_prices.values,
        'Weekly Returns': weekly_returns.values
    })

    weekly_data['Bin Label'] = weekly_data['Weekly Returns'].map(bin_mapping)
    weekly_data['Mapped Label'] = weekly_data['Bin Label'].apply(map_bin_label)

    return weekly_data

def get_ddg_news_urls(keywords):
    with DDGS() as ddgs:
        ddgs_news_gen = ddgs.news(keywords, region="wt-wt", safesearch="off", timelimit="m", max_results=10)
        ddgs_news = list(ddgs_news_gen)
    return ddgs_news

def format_news(keywords):
    stock_news = get_ddg_news_urls(keywords)
    formatted_news = []
    for article in stock_news:
        formatted_article = f"[Headline]: {article['title']}\n[Summary]: {article['body']}\n"
        formatted_news.append(formatted_article)
    return formatted_news

def get_news(symbol, data):
    news = format_news(symbol)
    data['News'] = [news] * len(data)
    return data

def prepare_data_for_symbol(symbol, data_dir, start_date, end_date):
    data = get_returns(symbol, start_date, end_date)

    data = get_news(symbol, data)

    data.to_csv(f"{data_dir}/{symbol}_{start_date}_{end_date}.csv", index=False)

    return data

In [9]:
# calling the function
stock_symb = ["AAPL", "MSFT", "META", "TSLA", "IBM", "AXP", "AMGN", "BA", "CAT", "CSCO", "CVX", "GS", "HD", "HON", "INTC" ]
for i in stock_symb:
  prepare_data_for_symbol(i, '/content/drive/My Drive/Llama_2_file', '2022-12-31', '2023-05-31')

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [10]:
# concate all the files
def load_data(data_dir, symbols, start_date, end_date):
    data_frames = []
    for symbol in symbols:
        file_path = f"{data_dir}/{symbol}_{start_date}_{end_date}.csv"
        df = pd.read_csv(file_path)
        data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

In [11]:
# Load your dataset
data_dir = '/content/drive/My Drive/Llama_2_file'
symbols = ["AAPL", "MSFT", "META", "TSLA", "IBM", "AXP", "AMGN", "BA", "CAT", "CSCO", "CVX", "GS", "HD", "HON", "INTC" ]
start_date = '2022-12-31'
end_date = '2023-05-31'

data = load_data(data_dir, symbols, start_date, end_date)

In [12]:
data

Unnamed: 0,Start Date,Start Price,End Date,End Price,Weekly Returns,Bin Label,Mapped Label,News
0,2023-01-08,128.560867,2023-01-15,133.658875,0.039654,U4,Up by 3-4%,"[""[Headline]: Is Apple Inc. (NASDAQ:AAPL) Warr..."
1,2023-01-15,133.658875,2023-01-22,136.743454,0.023078,U3,Up by 2-3%,"[""[Headline]: Is Apple Inc. (NASDAQ:AAPL) Warr..."
2,2023-01-22,136.743454,2023-01-29,144.737579,0.058461,U5+,Up by More than 5%,"[""[Headline]: Is Apple Inc. (NASDAQ:AAPL) Warr..."
3,2023-01-29,144.737579,2023-02-05,153.237564,0.058727,U5+,Up by More than 5%,"[""[Headline]: Is Apple Inc. (NASDAQ:AAPL) Warr..."
4,2023-02-05,153.237564,2023-02-12,150.004776,-0.021097,D3,Down by 2-3%,"[""[Headline]: Is Apple Inc. (NASDAQ:AAPL) Warr..."
...,...,...,...,...,...,...,...,...
310,2023-04-30,30.507719,2023-05-07,30.563614,0.001832,U1,Up by 0-1%,['[Headline]: European labs led by imec to rec...
311,2023-05-07,30.563614,2023-05-14,28.551682,-0.065828,D5+,Down by More than 5%,['[Headline]: European labs led by imec to rec...
312,2023-05-14,28.551682,2023-05-21,29.518198,0.033851,U4,Up by 3-4%,['[Headline]: European labs led by imec to rec...
313,2023-05-21,29.518198,2023-05-28,28.600996,-0.031072,D4,Down by 3-4%,['[Headline]: European labs led by imec to rec...


In [13]:
data["News"][0]

'["[Headline]: Is Apple Inc. (NASDAQ:AAPL) Warren Buffett\'s Favorite Dow Stock?\\n[Summary]: Apple Inc. (NASDAQ:AAPL) is indeed Warren Buffett\'s favorite Dow stock as his firm, Berkshire Hathaway owns 789,368 million of the company shares worth $135.361 billion as of the first quarter of 2024.\\n", "[Headline]: Could An iPhone Supercycle Drive Apple\'s Stock Price to $275 Per Share?\\n[Summary]: Apple\'s (NASDAQ: AAPL) iPhone sales have shown weakness in recent years, leading to speculation about a potential new iPhone supercycle driven by AI features. Is this new supercycle just hopes and dreams or is there legitimacy behind the idea?\\n", \'[Headline]: Apple seeing iPhone stabilization ahead of WWDC: Wedbush\\n[Summary]: Amid rising competition in the Chinese smartphone market, Apple (NASDAQ:AAPL) has tried to defend its market share, via price cuts. That move appears to be working, Wedbush Securities said. Recent supply chain checks indicated signs of iPhone stabilization ...\\n\'

In [14]:
def format_news_text(text):
    # Clean the news text, replacing "\\n" with "\n" and removing unnecessary characters
    cleaned_news = text.strip('[]"').replace("\\n", "\n").replace("\\'", "'")

    # Split the cleaned news into headline and summary using "\\n" as the delimiter
    parts = cleaned_news.split("\\n")

    # Check if there are at least two parts (headline and summary)
    if len(parts) >= 2:
        headline, summary = parts
        return f"<s><INST>[Headlines]: {headline}\n[Summary]: {summary}</INST></s>"
    else:
        # Handle the case where there is only one part (either headline or summary)
        return f"<s><INST>[Headlines]: {parts[0]}\n[Summary]: No summary available</INST></s>"

format_news_fine_tune = data['News'].apply(format_news_text)

In [15]:
format_news_fine_tune.count()

315

In [16]:
format_news_fine_tune[1]

'<s><INST>[Headlines]: Headline]: Is Apple Inc. (NASDAQ:AAPL) Warren Buffett\'s Favorite Dow Stock?\n[Summary]: Apple Inc. (NASDAQ:AAPL) is indeed Warren Buffett\'s favorite Dow stock as his firm, Berkshire Hathaway owns 789,368 million of the company shares worth $135.361 billion as of the first quarter of 2024.\n", "[Headline]: Could An iPhone Supercycle Drive Apple\'s Stock Price to $275 Per Share?\n[Summary]: Apple\'s (NASDAQ: AAPL) iPhone sales have shown weakness in recent years, leading to speculation about a potential new iPhone supercycle driven by AI features. Is this new supercycle just hopes and dreams or is there legitimacy behind the idea?\n", \'[Headline]: Apple seeing iPhone stabilization ahead of WWDC: Wedbush\n[Summary]: Amid rising competition in the Chinese smartphone market, Apple (NASDAQ:AAPL) has tried to defend its market share, via price cuts. That move appears to be working, Wedbush Securities said. Recent supply chain checks indicated signs of iPhone stabiliz

In [17]:
# loading the base_model and tokenizer
base_model = 'NousResearch/Llama-2-7b-chat-hf'
tokenizer = 'NousResearch/Llama-2-7b-chat-hf'

In [18]:
###############
# LORA Parameters
###############

lora_r = 64

# Alpha parameter for Lora scaling
lora_alpha = 16

# Dropout probability for Lora
lora_dropout = 0.1

#########################
# bitsandbytes parameters
#########################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type fp16
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base model
use_nested_quant = False

##############################
# TrainingArguments parameters
##############################

# output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (setting bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Enable gradient checkpointing
gradient_checkpointing = False

# Maximum gradient normal
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 1e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

#optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

evaluation_strategy= "steps"

eval_steps= 25

load_best_model_at_end= True

metric_for_best_model= "loss"

#################
# SFT Parameters
#################

# Maximum sequence length to use
max_seq_length = 512

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [19]:
# Load tokeniser and model with LoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

In [20]:
# Set environment variable to manage memory fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [21]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [22]:
# Checking GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --fp16")
        print("=" * 80)

Your GPU supports bfloat16, you can accelerate training with the argument --fp16


In [23]:
# Loading the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
# Loading the tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [25]:
# fucntion for calculating evalution matrix
from datasets import load_metric
import numpy as np

accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = labels.reshape(-1)
    predictions = predictions.reshape(-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average = 'weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average = 'weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average = 'weighted')
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }

In [26]:
# Load LORA Config
from peft import LoraConfig

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

In [27]:
# set training parameters

from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    report_to = "tensorboard",
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model
)

In [28]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
# Convert the Pandas Series to a DataFrame
df = pd.DataFrame({"text": format_news_fine_tune})

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df)

val_dataset = Dataset.from_pandas(val_df)

In [29]:
train_dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 283
})

In [30]:
val_dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 32
})

In [31]:
# eval function testing
# Set supervised fine-tunning parameters
!pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_arguments,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    packing=packing,
)

Collecting git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Cloning https://github.com/huggingface/trl.git (to revision 7630f877f91c556d9e5a3baa4b6e2894d90ff84c) to /tmp/pip-req-build-vfecxywx
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl.git /tmp/pip-req-build-vfecxywx
  Running command git rev-parse -q --verify 'sha^7630f877f91c556d9e5a3baa4b6e2894d90ff84c'
  Running command git fetch -q https://github.com/huggingface/trl.git 7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Running command git checkout -q 7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Resolved https://github.com/huggingface/trl.git to commit 7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: trl
  Building wheel for trl (pyproject.toml) ... [?25l[?

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [32]:
# Set supervised fine-tunning parameters

from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_arguments,
    tokenizer=tokenizer,
    packing=packing,
)

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [None]:
# to start training
trainer.train()

Step,Training Loss,Validation Loss
25,2.1724,1.959954
50,1.7827,1.63492


In [None]:
# save the model
trainer.model.save_pretrained("/content/drive/My Drive/Llama_2_file/fingpt_yf_30_llama2_lora")