In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Model from Hugging Face hub
base_model = "meta-llama/Meta-Llama-3-8B"

# Fine-tuned model
new_model = "llama-3-8b-chat"

In [4]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.38s/it]


In [6]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [8]:
training_params = TrainingArguments(
    output_dir="./llama3/results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [10]:
import pandas as pd
import numpy as np
import json

file_path = '../OpenAI/LNG_unsupervised_full.jsonl'
df_input = pd.DataFrame()

with open(file_path, 'r') as infile:
        for line in infile:
            data = json.loads(line)
            user_message = json.loads(data['messages'][1]['content'])  # Parse the user message content as JSON
            user_message_content = data['messages'][1]['content']
            user_message_json = json.loads(user_message_content)
            df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)

df_output = pd.read_csv('../OpenAI/output/unsupervised_outputs_gp4turbo_05010120.csv')

input_json = ""
output_json = ""

system_message_content = "This model is trained to analyze the sentiment of news articles concerning the LNG market and predict the impact on the LNG index's price. Predictions are made based on the news content, current price, and current volatility on the provided date. The model outputs include the direction of the price movement (true for higher, false for lower), the magnitude of this change (ranging from 0.0 to 10.0), and a comment explaining the rationale behind these predictions. The results are provided in the following JSON format: {\"direction\": \"boolean\", \"magnitude\": \"float\",\"comment\": \"string\"}"

df = pd.DataFrame()

for i in range(0, len(df_input)):
    input_json = "{'Date': "+ str(df_input.iloc[i]['date']) + ", 'News': " + str(df_input.iloc[i]['title']) + ", 'Summary': " + str(df_input.iloc[i]['summary']) + ", 'Current Price': " + str(df_input.iloc[i]['price']) + ", 'Annual Volatitlity': " + str(df_input.iloc[i]['vol_annual']) + "}"
    output_json = "{'Direction': "+ str(df_output.iloc[i]['Direction']) + ", 'Magnitude': " + str(df_output.iloc[i]['Magnitude']) + ", 'Comment': " + str(df_output.iloc[i]['Comment']) + "}"
    message = "<s>[INST] <<SYS>>\n" + system_message_content + "\n<</SYS>>\n" + input_json + "[/INST]>"
    reply = output_json
    df = df.append({'message': message, 'reply': reply}, ignore_index=True)

  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_message_json), ignore_index=True)
  df_input = df_input.append(pd.json_normalize(user_me

In [12]:
df_training = df.iloc[0:106]

In [14]:
df_training.iloc[0]['reply']

"{'Direction': False, 'Magnitude': 4.5, 'Comment': The news article indicates a significant shift in the energy market in East Asia due to falling natural gas prices, mirroring earlier trends in the U.S. This could lead to increased volatility and potentially lower prices as the market adjusts to the new energy dynamics. The historical context of similar situations suggests a downward pressure on prices, especially given the current low price and high annual volatility. Therefore, it is likely that the LNG index will experience a decrease in price.}"

In [15]:
from datasets import Dataset
# guanaco_dataset = "mlabonne/guanaco-llama2-1k"
# dataset = load_dataset(guanaco_dataset, split="train")
# dataset = load_dataset("pandas", data_files="./LNG_unsupervised_full_training.csv")
dataset = Dataset.from_pandas(df_training)

In [26]:
def format_chat_template(row):
    row["message"] = tokenizer.apply_chat_template(row["message"], tokenize=False)
    row["reply"] = tokenizer.apply_chat_template(row["reply"], tokenize=False)
    return row

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

dataset = dataset.map(
    format_chat_template,
    num_proc= os.cpu_count(),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map (num_proc=16):   0%|          | 0/106 [00:01<?, ? examples/s]


NameError: name 'tokenizer' is not defined

In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="message",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

Map: 100%|██████████| 106/106 [00:00<00:00, 2864.81 examples/s]


In [20]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('llama-3-8b-chat\\tokenizer_config.json',
 'llama-3-8b-chat\\special_tokens_map.json',
 'llama-3-8b-chat\\tokenizer.json')

In [21]:
from tensorboard import notebook
log_dir = "results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

Reusing TensorBoard on port 4000 (pid 24336), started 2 days, 4:44:55 ago. (Use '!kill 24336' to kill it.)

In [12]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Who is Leonardo Da Vinci?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2000)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

  attn_output = torch.nn.functional.scaled_dot_product_attention(


<s>[INST] Who is Leonardo Da Vinci? [/INST] Leonardo Da Vinci was an Italian Renaissance artist, scientist, engineer, and inventor. He was born in 1452 and died in 1519. He was the first person to use the scientific method to study the world around him. He was also a great painter and sculptor. He is best known for his painting of the Mona Lisa and his invention of the flying machine.
