In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

    PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.2.1+cu121)
    Python  3.10.14 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
import pandas as pd
from datasets import load_dataset
from pprint import pprint

def extract_unique_companies():
    # Load the dataset from the Hugging Face Hub
    dataset = load_dataset("Shadow-Blade/financialNews")

    # Convert the loaded dataset to a pandas DataFrame
    df = pd.DataFrame(dataset['train'])

    # Drop duplicates to find unique companies based on the 'company' column
    unique_companies = df.drop_duplicates(subset=['company'])

    # Create a dictionary from the DataFrame with company names as keys and tickers as values
    company_ticker_dict = dict(zip(unique_companies['company'], unique_companies['ticker']))

    return company_ticker_dict

# Call the function and print the result
company_ticker_dict = extract_unique_companies()
# pprint(company_ticker_dict)


In [5]:
from datasets import load_dataset, DatasetDict

# Define the prompt template and the required format for the response
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{response}
"""

instruction = """A news story about {company} in the {industry} industry, of the {sector} sector, with stock ticker {ticker} was posted on {datetime}, titled : {title}"""

input = """Here's the description provided: {description}. Based on this information, predict whether the stock price will increase, decrease, or remain the same."""

response = """{predicted_change_label}"""


EOS_TOKEN = tokenizer.eos_token
# Function to format each example in the dataset
def formatting_financial_news_func(examples, is_test=False):
    texts = []
    for idx in range(len(examples['company'])):
        if is_test:
            predicted_change_label = ""
        else:
            change_pct = examples['change_pct'][idx]
            if change_pct < -0.5:
                predicted_change_label = "decreases"
            elif change_pct > 0.5:
                predicted_change_label = "increases"
            else:
                predicted_change_label = "remains the same"

        instruction_set = instruction.format(
            company=examples['company'][idx],
            ticker=examples['ticker'][idx],
            sector=examples['sector'][idx],
            industry=examples['industry'][idx],
            datetime=examples['datetime'][idx],
            title=examples['title'][idx]
        )

        input_set = input.format(description=examples['description'][idx])

        response_set = response.format(predicted_change_label=predicted_change_label)

        # Prepare the text with the EOS_TOKEN
        text = alpaca_prompt.format(
            instruction=instruction_set,
            input=input_set,
            response=response_set
        ) + EOS_TOKEN # EOS_TOKEN is assumed to be added in the tokenizer later
        texts.append(text)
    return {"text": texts}

# Load the dataset and split it
dataset = load_dataset("Shadow-Blade/financialNews", split="train")
train_test_split = dataset.train_test_split(test_size=0.2)

# Create a DatasetDict to manage the splits easily
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# Apply the formatting function to the dataset in a batched fashion
train_dataset = dataset_dict['train'].map(lambda x: formatting_financial_news_func(x, is_test=False), batched=True)
test_dataset = dataset_dict['test'].map(lambda x: formatting_financial_news_func(x, is_test=True), batched=True)


In [6]:
# prompt: print an example of formatted dataset
from pprint import pprint
pprint(train_dataset[0])


{'change_pct': -0.8772224734513856,
 'company': "McDonald's Corporation",
 'datetime': 'Wed 04 May 2022, 05:08PM',
 'description': '(Bloomberg) -- Activist investor Carl Icahn argues his two '
                'nominees for the board of McDonald’s Corp. will help wean the '
                'fast-food chain off its dependence on meat products and '
                'address other concerns he has with the company and its supply '
                'chain.Most Read from BloombergFed Hikes Rates Half-Point as '
                'Powell Signals Similar Moves AheadRussia Seeks to Annex '
                'Occupied Ukraine as Invasion Goals ShiftAs Putin Gets '
                'Desperate, U.S. Should Remember Pearl HarborBiden’s Team Eyes '
                '$125,000 Income Cutoff for St',
 'industry': 'Restaurants',
 'sector': 'Consumer Cyclical',
 'text': 'Below is an instruction that describes a task, paired with an input '
         'that provides further context. Write a response that appropriat

In [7]:
# args = TrainingArguments(
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=4,
#     warmup_steps=5,
#     max_steps=10,
#     learning_rate=2e-4,
#     fp16=not torch.cuda.is_bf16_supported(),
#     bf16=torch.cuda.is_bf16_supported(),
#     logging_steps=1,
#     optim="adamw_8bit",
#     weight_decay=0.01,
#     lr_scheduler_type="linear",
#     seed=3407,
#     output_dir="outputs",
# )

# # Now use CustomSFTTrainer instead of the default SFTTrainer
# trainer = CustomSFTTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     train_dataset=dataset,
#     dataset_text_field="text",
#     max_seq_length=128,
#     dataset_num_proc=2,
#     packing=False,
#     args=args,
# )

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/76367 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [9]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L4. Max memory = 22.168 GB.
5.605 GB of memory reserved.


In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 76,367 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.9374
2,2.7996
3,2.7193
4,2.6217
5,2.4802
6,2.0849
7,1.9087
8,1.6465
9,1.3489
10,1.2489


In [11]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

192.9218 seconds used for training.
3.22 minutes used for training.
Peak reserved memory = 8.381 GB.
Peak reserved memory for training = 2.776 GB.
Peak reserved memory % of max memory = 37.807 %.
Peak reserved memory for training % of max memory = 12.523 %.


In [12]:
import random
company_ticker_dict_10 = {k: v for k, v in company_ticker_dict.items() if random.random() > 0.98}

print(company_ticker_dict_10)


{'Comcast Corporation': 'CMCSA', 'MetLife, Inc.': 'MET', 'NextEra Energy, Inc.': 'NEE', 'Colgate-Palmolive Company': 'CL'}


In [19]:
def parse_prediction(prediction):
    """
    Extracts the actual predicted response from the generated text output.
    Assumes the prediction is the last line of the generated text.
    """
    # Assuming prediction is a list containing a single string
    prediction_text = prediction[0]
    response_start = prediction_text.find("### Response:\n\n") + len("### Response:\n\n")
    response_end = prediction_text.find("\n", response_start)
    response_end = response_end if response_end != -1 else None
    return prediction_text[response_start:response_end].strip()


def getPortfolio(datetime, title, description, sector, industry):
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  results = []
  for company, ticker in company_ticker_dict.items():
    instruction_inference = instruction.format(
        company=company,
        ticker=ticker,
        sector=sector,
        industry=industry,
        datetime=datetime,
        title=title
        )

    input_inference = input.format(
        description=description
        )

    response_inference = response.format(
        predicted_change_label=""
        )

    inputs = tokenizer(
    [
            alpaca_prompt.format(
                instruction=instruction_inference,
                input=input_inference,
                response=response_inference,
        )
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 5, use_cache = True)
    prediction = tokenizer.batch_decode(outputs)
    results.append({
        "company_name": company,
        "ticker": ticker,
        "predicted_change_label": parse_prediction(prediction),
        # "predicted_change_label": (prediction)
    })
  return results



In [22]:
testData = {
    'datetime': 'Mon, Apr 29, 2024, 1:59 PM PDT',
    'title': "Tesla bounces back, SoFi down on Q2 forecast: Market check",
    'description': "US Equities (^GSPC, ^DJI, ^IXIC) recovered toward the end of the day as news broke concerning the US Treasury's larger-than-expected borrowing plans. Shares of Tesla (TSLA) continue to trade higher, with a 15% gain for the day as the market priced in China's approval of the company's 'Full Self-Driving' technology.Yahoo Finance's Jared Blikre joins Market Domination Overtime to analyze the market's movements as the closing bell rings",    'sector': 'Technology',
    'industry': 'Consumer Cyclical,Auto Manufacturers'
}

portfolio = getPortfolio(
    datetime=testData['datetime'],
    title=testData['title'],
    description=testData['description'],
    sector=testData['sector'],
    industry=testData['industry']
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [24]:
top_companies = [result for result in portfolio if result["predicted_change_label"] == "increases"]

print(len(top_companies))
print(len(portfolio))

97
97


In [26]:
portfolio[:10]

[{'company_name': 'Microsoft Corporation',
  'ticker': 'MSFT',
  'predicted_change_label': 'increases'},
 {'company_name': 'Costco Wholesale Corporation',
  'ticker': 'COST',
  'predicted_change_label': 'increases'},
 {'company_name': 'Amazon.com, Inc.',
  'ticker': 'AMZN',
  'predicted_change_label': 'increases'},
 {'company_name': 'Bristol-Myers Squibb Company',
  'ticker': 'BMY',
  'predicted_change_label': 'increases'},
 {'company_name': 'Alphabet Inc.',
  'ticker': 'GOOGL',
  'predicted_change_label': 'increases'},
 {'company_name': 'NVIDIA Corporation',
  'ticker': 'NVDA',
  'predicted_change_label': 'increases'},
 {'company_name': 'Advanced Micro Devices, Inc.',
  'ticker': 'AMD',
  'predicted_change_label': 'increases'},
 {'company_name': 'Tesla, Inc.',
  'ticker': 'TSLA',
  'predicted_change_label': 'increases'},
 {'company_name': 'Dow Inc.',
  'ticker': 'DOW',
  'predicted_change_label': 'increases'},
 {'company_name': 'Danaher Corporation',
  'ticker': 'DHR',
  'predicted_ch

In [17]:
# prompt: save the model weights
torch.save(model.state_dict(), "model.pt")