In [2]:
# VesterAI - Notebook 07: Dataset Generation for LLM Fine-Tuning

"""
Goal:
Convert structured sentiment + stock data into instruction-tuning format suitable
for LLM fine-tuning (LLaMA 2, Mistral, etc.).

Format:
- input: Context (structured features)
- instruction: Task description (e.g., Summarize today's market)
- output: Human-annotated or templated summary

Output File: /data/finetune_instructions.jsonl
"""

"\nGoal:\nConvert structured sentiment + stock data into instruction-tuning format suitable\nfor LLM fine-tuning (LLaMA 2, Mistral, etc.).\n\nFormat:\n- input: Context (structured features)\n- instruction: Task description (e.g., Summarize today's market)\n- output: Human-annotated or templated summary\n\nOutput File: /data/finetune_instructions.jsonl\n"

In [3]:
import pandas as pd
import os

# Load processed dataset
df = pd.read_csv("../data/processed/AAPL_model_data.csv", parse_dates=["Date"])

# Drop NA rows just in case
df = df.dropna().reset_index(drop=True)
df = df.sort_values("Date").reset_index(drop=True)

In [4]:
def generate_instruction_record(row):
    # Context: structured data
    context = f"""Date: {row['Date'].strftime('%Y-%m-%d')}
Stock Close: {row['Close']:.2f}
Return: {row['return']:.4f}
Twitter Sentiment: {row['twitter_sentiment']:.2f}
News Sentiment: {row['news_sentiment']:.2f}
Reddit Sentiment: {row.get('reddit_sentiment', 0):.2f}
RSI: {row['rsi_14']:.2f}
MACD: {row['macd']:.4f}
OBV: {row['obv']:.2f}"""

    instruction = "Summarize today's market sentiment and give an investor outlook."

    # Basic templated output for now (you can replace with GPT-generated later)
    sentiment = "bullish" if row["twitter_sentiment"] + row["news_sentiment"] > 0 else "bearish"
    rsi_status = "overbought" if row["rsi_14"] > 70 else "oversold" if row["rsi_14"] < 30 else "neutral"
    trend = "positive" if row["return"] > 0 else "negative"

    output = (
        f"Market sentiment on {row['Date'].strftime('%Y-%m-%d')} is {sentiment}. "
        f"Technical indicators like RSI are {rsi_status}. "
        f"The return was {trend}, suggesting {'a potential upward move' if trend == 'positive' else 'a cautious stance'}."
    )

    return {
        "input": context,
        "instruction": instruction,
        "output": output
    }

In [5]:
import json

output_path = "../data/finetune_instructions.jsonl"
os.makedirs("../data", exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        record = generate_instruction_record(row)
        f.write(json.dumps(record) + "\n")

print(f"Fine-tuning dataset saved to: {output_path}")

Fine-tuning dataset saved to: ../data/finetune_instructions.jsonl


In [6]:
with open(output_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 3: break
        print(json.loads(line))

{'input': 'Date: 2020-02-25\nStock Close: 69.91\nReturn: -0.0339\nTwitter Sentiment: 0.00\nNews Sentiment: 0.00\nReddit Sentiment: 0.00\nRSI: 30.77\nMACD: -0.3722\nOBV: -82898400.00', 'instruction': "Summarize today's market sentiment and give an investor outlook.", 'output': 'Market sentiment on 2020-02-25 is bearish. Technical indicators like RSI are neutral. The return was negative, suggesting a cautious stance.'}
{'input': 'Date: 2020-02-26\nStock Close: 71.02\nReturn: 0.0159\nTwitter Sentiment: 0.00\nNews Sentiment: 0.00\nReddit Sentiment: 0.00\nRSI: 35.23\nMACD: -0.7218\nOBV: 115156400.00', 'instruction': "Summarize today's market sentiment and give an investor outlook.", 'output': 'Market sentiment on 2020-02-26 is bearish. Technical indicators like RSI are neutral. The return was positive, suggesting a potential upward move.'}
{'input': 'Date: 2020-02-27\nStock Close: 66.38\nReturn: -0.0654\nTwitter Sentiment: 0.00\nNews Sentiment: 0.00\nReddit Sentiment: 0.00\nRSI: 27.30\nMACD

In [7]:
# VesterAI - Notebook 07: Fine-Tuning LLM (LLaMA 2) on Financial Sentiment

"""
This notebook fine-tunes a LLaMA 2 model using your custom instruction dataset:
- Format: Instruction, Input, Output
- Method: PEFT with LoRA for memory-efficient tuning
- Target: Finetune LLaMA 2 (7B or smaller) to generate better financial summaries
"""

'\nThis notebook fine-tunes a LLaMA 2 model using your custom instruction dataset:\n- Format: Instruction, Input, Output\n- Method: PEFT with LoRA for memory-efficient tuning\n- Target: Finetune LLaMA 2 (7B or smaller) to generate better financial summaries\n'

In [8]:
!pip install -q transformers datasets peft accelerate bitsandbytes

In [9]:
from datasets import load_dataset

# Load dataset from JSONL file
dataset = load_dataset("json", data_files="../data/finetune_instructions.jsonl", split="train")

# Preview
dataset[0]

Generating train split: 0 examples [00:00, ? examples/s]

{'input': 'Date: 2020-02-25\nStock Close: 69.91\nReturn: -0.0339\nTwitter Sentiment: 0.00\nNews Sentiment: 0.00\nReddit Sentiment: 0.00\nRSI: 30.77\nMACD: -0.3722\nOBV: -82898400.00',
 'instruction': "Summarize today's market sentiment and give an investor outlook.",
 'output': 'Market sentiment on 2020-02-25 is bearish. Technical indicators like RSI are neutral. The return was negative, suggesting a cautious stance.'}

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"  # Or use your local path if already downloaded

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Fix: Assign pad_token if it's missing (common in LLaMA/Mistral)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model with 4-bit quantization and auto GPU mapping
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True
)

Using pad_token, but it is not set yet.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [11]:
def format_instruction(example):
    return {
        "text": f"""### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"""
    }

dataset = dataset.map(format_instruction)

Map:   0%|          | 0/1277 [00:00<?, ? examples/s]

In [12]:
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_dataset = dataset.map(tokenize, remove_columns=["text", "instruction", "input", "output"])

Map:   0%|          | 0/1277 [00:00<?, ? examples/s]

In [13]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 3,504,607,232 || trainable%: 0.11967971650867153


In [23]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="../models/llama2_finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=30,
    fp16=True,
    logging_dir="./logs",
    save_strategy="epoch",
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

Step,Training Loss
500,0.3558
1000,0.3049
1500,0.2962
2000,0.2894


TrainOutput(global_step=2400, training_loss=0.307043784459432, metrics={'train_runtime': 5450.8309, 'train_samples_per_second': 7.028, 'train_steps_per_second': 0.44, 'total_flos': 3.9702569391489024e+17, 'train_loss': 0.307043784459432, 'epoch': 30.0})

In [14]:
model.save_pretrained("../models/llama2_finetuned")
tokenizer.save_pretrained("../models/llama2_finetuned")
print("Fine-tuned model saved.")

Fine-tuned model saved.


In [25]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = """### Instruction:
Summarize today's market sentiment and give an investor outlook.

### Input:
Date: 2025-03-20
Stock Close: 182.12
Return: 0.0032
Twitter Sentiment: 0.42
News Sentiment: 0.36
Reddit Sentiment: 0.11
RSI: 58.32
MACD: 0.0012
OBV: 512321000

### Response:
"""

out = pipe(prompt, max_new_tokens=150)[0]["generated_text"]
print(out)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'O

### Instruction:
Summarize today's market sentiment and give an investor outlook.

### Input:
Date: 2025-03-20
Stock Close: 182.12
Return: 0.0032
Twitter Sentiment: 0.42
News Sentiment: 0.36
Reddit Sentiment: 0.11
RSI: 58.32
MACD: 0.0012
OBV: 512321000

### Response:
Stock Close: 182.12
Return: 0.0032
Twitter Sentiment: 0.00
News Sentiment: 0.00
Reddit Sentiment: 0.00
Sentiment: 0.42
TE Technical Signal: 0.00
TW Technical Signal: 0.00
SUM Technical Signal: 0.00
RSI: 58.32
MACD: 0.0012
OBV: 512321000
RSI Stoch Change: 0.00
OBOR Sto
