# GR5398 26Spring: FinGPT Large Language Model Track
## Assignment 1

### 1. Data Preparation

In this part, you can build your own dataset for later training. Here is the example of how we generate dataset from Dow Jones 30's component stocks. 

We used **Finnhub** to get raw data, and **GPT-4** to generate benchmark responses (you can change these to get a better result, maybe).

For detailed information of this part, please refer to [`prepare_data.ipynb`](https://github.com/AI4Finance-Foundation/FinGPT/blob/master/fingpt/FinGPT_Forecaster/prepare_data.ipynb)

In [3]:
!pip install finnhub-python

Collecting finnhub-python
  Downloading finnhub_python-2.4.27-py3-none-any.whl.metadata (9.2 kB)
Downloading finnhub_python-2.4.27-py3-none-any.whl (12 kB)
Installing collected packages: finnhub-python
Successfully installed finnhub-python-2.4.27


In [1]:
import torch
print(torch.cuda.get_device_name())

: 

In [4]:
import os
import re
import csv
import math
import time
import json
import random
import finnhub
import datasets
import pandas as pd
import yfinance as yf
from datetime import datetime
from collections import defaultdict
from datasets import Dataset
from openai import OpenAI

: 

In [None]:
START_DATE = "2024-12-31" ### Change to desired start date
END_DATE = "2025-05-31" ### Change to desired end date

DATA_DIR = f"./{START_DATE}_{END_DATE}"
os.makedirs(DATA_DIR, exist_ok=True)

finnhub_client = finnhub.Client(api_key="d60aqq9r01qgk0vihtj0d60aqq9r01qgk0vihtjg")
client = OpenAI(api_key = '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')

: 

In [21]:
import os

print("cwd =", os.getcwd())
print("DATA_DIR exists?", os.path.exists(DATA_DIR))
print("files in DATA_DIR:", os.listdir(DATA_DIR))

cwd = /content
DATA_DIR exists? True
files in DATA_DIR: ['GS_2024-12-31_2025-05-31.csv', 'HD_2024-12-31_2025-05-31.csv', 'CSCO_2024-12-31_2025-05-31.csv', 'CAT_2024-12-31_2025-05-31.csv', 'BA_2024-12-31_2025-05-31.csv', 'AAPL_2024-12-31_2025-05-31.csv', 'AXP_2024-12-31_2025-05-31.csv', 'CVX_2024-12-31_2025-05-31.csv', 'AMGN_2024-12-31_2025-05-31.csv']


: 

+ Raw Financial Data Acquisition (News and Returns)

In [12]:
def bin_mapping(ret):
    
    up_down = 'U' if ret >= 0 else 'D'
    integer = math.ceil(abs(100 * ret))
    
    return up_down + (str(integer) if integer <= 5 else '5+')


# def get_returns(stock_symbol):
    
#     # Download historical stock data
#     stock_data = yf.download(stock_symbol, start=START_DATE, end=END_DATE)

#     print(type(stock_data.columns), stock_data.columns)
#     print(stock_data.head())

#     price_col = 'Adj Close' if 'Adj Close' in stock_data.columns else 'Close'
#     weekly_data = stock_data[price_col].resample('W').ffill()
    
#     weekly_returns = weekly_data.pct_change()[1:]
#     weekly_start_prices = weekly_data[:-1]
#     weekly_end_prices = weekly_data[1:]

#     weekly_data = pd.DataFrame({
#         'Start Date': weekly_start_prices.index,
#         'Start Price': weekly_start_prices.values,
#         'End Date': weekly_end_prices.index,
#         'End Price': weekly_end_prices.values,
#         'Weekly Returns': weekly_returns.values
#     })
    
#     weekly_data['Bin Label'] = weekly_data['Weekly Returns'].map(bin_mapping)

#     return weekly_data

def get_returns(stock_symbol):
    stock_data = yf.download(stock_symbol, start=START_DATE, end=END_DATE)

    # 兼容 MultiIndex columns: ('Close','AXP') 这种
    if isinstance(stock_data.columns, pd.MultiIndex):
        price = stock_data[('Close', stock_symbol)]
    else:
        price = stock_data['Adj Close'] if 'Adj Close' in stock_data.columns else stock_data['Close']

    price = price.squeeze()  # 确保是一维 Series

    weekly_prices = price.resample('W').ffill()
    weekly_returns = weekly_prices.pct_change().iloc[1:]

    weekly_start_prices = weekly_prices.shift(1).iloc[1:]
    weekly_end_prices = weekly_prices.iloc[1:]

    weekly_data = pd.DataFrame({
        'Start Date': weekly_start_prices.index,
        'Start Price': weekly_start_prices.to_numpy(),
        'End Date': weekly_end_prices.index,
        'End Price': weekly_end_prices.to_numpy(),
        'Weekly Returns': weekly_returns.to_numpy()
    })

    weekly_data['Bin Label'] = weekly_data['Weekly Returns'].map(bin_mapping)
    return weekly_data


def get_news(symbol, data):
    
    news_list = []
    
    for end_date, row in data.iterrows():
        start_date = row['Start Date'].strftime('%Y-%m-%d')
        end_date = row['End Date'].strftime('%Y-%m-%d')
        print(symbol, ': ', start_date, ' - ', end_date)
        time.sleep(1) # control qpm
        weekly_news = finnhub_client.company_news(symbol, _from=start_date, to=end_date)
        weekly_news = [
            {
                "date": datetime.fromtimestamp(n['datetime']).strftime('%Y%m%d%H%M%S'),
                "headline": n['headline'],
                "summary": n['summary'],
            } for n in weekly_news
        ]
        weekly_news.sort(key=lambda x: x['date'])
        news_list.append(json.dumps(weekly_news))
    
    data['News'] = news_list
    
    return data


def get_basics(symbol, data, always=False):
    
    basic_financials = finnhub_client.company_basic_financials(symbol, 'all')
    
    final_basics, basic_list, basic_dict = [], [], defaultdict(dict)
    
    for metric, value_list in basic_financials['series']['quarterly'].items():
        for value in value_list:
            basic_dict[value['period']].update({metric: value['v']})

    for k, v in basic_dict.items():
        v.update({'period': k})
        basic_list.append(v)
        
    basic_list.sort(key=lambda x: x['period'])
            
    for i, row in data.iterrows():
        
        start_date = row['End Date'].strftime('%Y-%m-%d')
        last_start_date = START_DATE if i < 2 else data.loc[i-2, 'Start Date'].strftime('%Y-%m-%d')
        
        used_basic = {}
        for basic in basic_list[::-1]:
            if (always and basic['period'] < start_date) or (last_start_date <= basic['period'] < start_date):
                used_basic = basic
                break
        final_basics.append(json.dumps(used_basic))
        
    data['Basics'] = final_basics
    
    return data
    

def prepare_data_for_company(symbol, with_basics=True):
    
    data = get_returns(symbol)
    data = get_news(symbol, data)
    
    if with_basics:
        data = get_basics(symbol, data)
        data.to_csv(f"{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}.csv")
    else:
        data['Basics'] = [json.dumps({})] * len(data)
        data.to_csv(f"{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics.csv")
    
    return data

: 

In [14]:
# DOW_30 = [
#     "AXP", "AMGN", "AAPL", "BA", "CAT", "CSCO", "CVX", "GS", "HD", "HON",
#     "IBM", "INTC", "JNJ", "KO", "JPM", "MCD", "MMM", "MRK", "MSFT", "NKE",
#     "PG", "TRV", "UNH", "CRM", "VZ", "V", "WBA", "WMT", "DIS", "DOW"
# ] ### Or you can define your own list of symbols
DOW_30 = [
    "AXP", "AMGN", "AAPL"#, "BA", "CAT", "CSCO", "CVX", "GS", "HD", "HON",
    # "IBM", "INTC", "JNJ", "KO", "JPM", "MCD", "MMM", "MRK", "MSFT", "NKE",
    # "PG", "TRV", "UNH", "CRM", "VZ", "V", "WBA", "WMT", "DIS", "DOW"
] ### Or you can define your own list of symbols

for symbol in DOW_30:
    prepare_data_for_company(symbol)

  stock_data = yf.download(stock_symbol, start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed


AXP :  2025-01-12  -  2025-01-12
AXP :  2025-01-19  -  2025-01-19
AXP :  2025-01-26  -  2025-01-26
AXP :  2025-02-02  -  2025-02-02
AXP :  2025-02-09  -  2025-02-09
AXP :  2025-02-16  -  2025-02-16
AXP :  2025-02-23  -  2025-02-23
AXP :  2025-03-02  -  2025-03-02
AXP :  2025-03-09  -  2025-03-09
AXP :  2025-03-16  -  2025-03-16
AXP :  2025-03-23  -  2025-03-23
AXP :  2025-03-30  -  2025-03-30
AXP :  2025-04-06  -  2025-04-06
AXP :  2025-04-13  -  2025-04-13
AXP :  2025-04-20  -  2025-04-20
AXP :  2025-04-27  -  2025-04-27
AXP :  2025-05-04  -  2025-05-04
AXP :  2025-05-11  -  2025-05-11
AXP :  2025-05-18  -  2025-05-18
AXP :  2025-05-25  -  2025-05-25
AXP :  2025-06-01  -  2025-06-01


  stock_data = yf.download(stock_symbol, start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed


AMGN :  2025-01-12  -  2025-01-12
AMGN :  2025-01-19  -  2025-01-19
AMGN :  2025-01-26  -  2025-01-26
AMGN :  2025-02-02  -  2025-02-02
AMGN :  2025-02-09  -  2025-02-09
AMGN :  2025-02-16  -  2025-02-16
AMGN :  2025-02-23  -  2025-02-23
AMGN :  2025-03-02  -  2025-03-02
AMGN :  2025-03-09  -  2025-03-09
AMGN :  2025-03-16  -  2025-03-16
AMGN :  2025-03-23  -  2025-03-23
AMGN :  2025-03-30  -  2025-03-30
AMGN :  2025-04-06  -  2025-04-06
AMGN :  2025-04-13  -  2025-04-13
AMGN :  2025-04-20  -  2025-04-20
AMGN :  2025-04-27  -  2025-04-27
AMGN :  2025-05-04  -  2025-05-04
AMGN :  2025-05-11  -  2025-05-11
AMGN :  2025-05-18  -  2025-05-18
AMGN :  2025-05-25  -  2025-05-25
AMGN :  2025-06-01  -  2025-06-01


  stock_data = yf.download(stock_symbol, start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed


AAPL :  2025-01-12  -  2025-01-12
AAPL :  2025-01-19  -  2025-01-19
AAPL :  2025-01-26  -  2025-01-26
AAPL :  2025-02-02  -  2025-02-02
AAPL :  2025-02-09  -  2025-02-09
AAPL :  2025-02-16  -  2025-02-16
AAPL :  2025-02-23  -  2025-02-23
AAPL :  2025-03-02  -  2025-03-02
AAPL :  2025-03-09  -  2025-03-09
AAPL :  2025-03-16  -  2025-03-16
AAPL :  2025-03-23  -  2025-03-23
AAPL :  2025-03-30  -  2025-03-30
AAPL :  2025-04-06  -  2025-04-06
AAPL :  2025-04-13  -  2025-04-13
AAPL :  2025-04-20  -  2025-04-20
AAPL :  2025-04-27  -  2025-04-27
AAPL :  2025-05-04  -  2025-05-04
AAPL :  2025-05-11  -  2025-05-11
AAPL :  2025-05-18  -  2025-05-18
AAPL :  2025-05-25  -  2025-05-25
AAPL :  2025-06-01  -  2025-06-01


: 

+ Generate Prompt from Financial Data

In [15]:
def get_company_prompt(symbol):
    
    profile = finnhub_client.company_profile2(symbol=symbol)

    company_template = "[Company Introduction]:\n\n{name} is a leading entity in the {finnhubIndustry} sector. Incorporated and publicly traded since {ipo}, the company has established its reputation as one of the key players in the market. As of today, {name} has a market capitalization of {marketCapitalization:.2f} in {currency}, with {shareOutstanding:.2f} shares outstanding." \
        "\n\n{name} operates primarily in the {country}, trading under the ticker {ticker} on the {exchange}. As a dominant force in the {finnhubIndustry} space, the company continues to innovate and drive progress within the industry."

    formatted_str = company_template.format(**profile)
    
    return formatted_str


def get_prompt_by_row(symbol, row):

    start_date = row['Start Date'] if isinstance(row['Start Date'], str) else row['Start Date'].strftime('%Y-%m-%d')
    end_date = row['End Date'] if isinstance(row['End Date'], str) else row['End Date'].strftime('%Y-%m-%d')
    term = 'increased' if row['End Price'] > row['Start Price'] else 'decreased'
    head = "From {} to {}, {}'s stock price {} from {:.2f} to {:.2f}. Company news during this period are listed below:\n\n".format(
        start_date, end_date, symbol, term, row['Start Price'], row['End Price'])
    
    news = json.loads(row["News"])
    news = ["[Headline]: {}\n[Summary]: {}\n".format(
        n['headline'], n['summary']) for n in news if n['date'][:8] <= end_date.replace('-', '') and \
        not n['summary'].startswith("Looking for stock market analysis and research with proves results?")]

    basics = json.loads(row['Basics'])
    if basics:
        basics = "Some recent basic financials of {}, reported at {}, are presented below:\n\n[Basic Financials]:\n\n".format(
            symbol, basics['period']) + "\n".join(f"{k}: {v}" for k, v in basics.items() if k != 'period')
    else:
        basics = "[Basic Financials]:\n\nNo basic financial reported."
    
    return head, news, basics


def sample_news(news, k=5):
    
    return [news[i] for i in sorted(random.sample(range(len(news)), k))]


def map_bin_label(bin_lb):
    
    lb = bin_lb.replace('U', 'up by ')
    lb = lb.replace('D', 'down by ')
    lb = lb.replace('1', '0-1%')
    lb = lb.replace('2', '1-2%')
    lb = lb.replace('3', '2-3%')
    lb = lb.replace('4', '3-4%')
    if lb.endswith('+'):
        lb = lb.replace('5+', 'more than 5%')
        # lb = lb.replace('5+', '5+%')
    else:
        lb = lb.replace('5', '4-5%')
    
    return lb


def get_all_prompts(symbol, min_past_weeks=1, max_past_weeks=3, with_basics=True):

    
    if with_basics:
        df = pd.read_csv(f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}.csv')
    else:
        df = pd.read_csv(f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics.csv')
    
    company_prompt = get_company_prompt(symbol)

    prev_rows = []
    all_prompts = []

    for row_idx, row in df.iterrows():

        prompt = ""
        if len(prev_rows) >= min_past_weeks:
            idx = min(random.choice(range(min_past_weeks, max_past_weeks+1)), len(prev_rows))
            for i in range(-idx, 0):
                # Add Price Movement (Head)
                prompt += "\n" + prev_rows[i][0]
                # Add News of previous weeks
                sampled_news = sample_news(
                    prev_rows[i][1],
                    min(5, len(prev_rows[i][1]))
                )
                if sampled_news:
                    prompt += "\n".join(sampled_news)
                else:
                    prompt += "No relative news reported."

        head, news, basics = get_prompt_by_row(symbol, row)

        prev_rows.append((head, news, basics))
        if len(prev_rows) > max_past_weeks:
            prev_rows.pop(0)  

        if not prompt:
            continue

        prediction = map_bin_label(row['Bin Label'])
        
        prompt = company_prompt + '\n' + prompt + '\n' + basics
        prompt += f"\n\nBased on all the information before {row['Start Date']}, let's first analyze the positive developments and potential concerns for {symbol}. Come up with 2-4 most important factors respectively and keep them concise. Most factors should be inferred from company related news. " \
            f"Then let's assume your prediction for next week ({row['Start Date']} to {row['End Date']}) is {prediction}. Provide a summary analysis to support your prediction. The prediction result need to be inferred from your analysis at the end, and thus not appearing as a foundational factor of your analysis."

        all_prompts.append(prompt.strip())
    
    return all_prompts

: 

In [22]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


SYSTEM_PROMPT = "You are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies' stock price movement for the upcoming week. " \
    "Your answer format should be as follows:\n\n[Positive Developments]:\n1. ...\n\n[Potential Concerns]:\n1. ...\n\n[Prediction & Analysis]:\n...\n"

print(SYSTEM_PROMPT)

# prompts = get_all_prompts("AAPL", 1, 3)
# prompts = get_all_prompts("MSFT", 1, 3, False)
prompts = get_all_prompts("AAPL", min_past_weeks=1, max_past_weeks=4)

print(prompts[0])

You are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies' stock price movement for the upcoming week. Your answer format should be as follows:

[Positive Developments]:
1. ...

[Potential Concerns]:
1. ...

[Prediction & Analysis]:
...

[Company Introduction]:

Apple Inc is a leading entity in the Technology sector. Incorporated and publicly traded since 1980-12-12, the company has established its reputation as one of the key players in the market. As of today, Apple Inc has a market capitalization of 3813817.94 in USD, with 14776.35 shares outstanding.

Apple Inc operates primarily in the US, trading under the ticker AAPL on the NASDAQ NMS - GLOBAL MARKET. As a dominant force in the Technology space, the company continues to innovate and drive progress within the industry.

From 2025-01-12 to 2025

: 

In [23]:
def append_to_csv(filename, input_data, output_data):
    
    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_data, output_data])

        
def initialize_csv(filename):
    
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["prompt", "answer"])


def query_gpt4(symbol_list, min_past_weeks=1, max_past_weeks=3, with_basics=True):

    for symbol in symbol_list:
        
        csv_file = f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_gpt-4.csv' if with_basics else \
                   f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics_gpt-4.csv'
        
        if not os.path.exists(csv_file):
            initialize_csv(csv_file)
            pre_done = 0
        else:
            df = pd.read_csv(csv_file)
            pre_done = len(df)

        prompts = get_all_prompts(symbol, min_past_weeks, max_past_weeks, with_basics)

        for i, prompt in enumerate(prompts):
            
            if i < pre_done:
                continue

            print(f"{symbol} - {i}")
            
            cnt = 0
            while cnt < 5:
                try:
                    completion = client.chat.completions.create(
                        model="gpt-4",
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                          ]
                    )
                    break    
                except Exception:
                    cnt += 1
                    print(f'retry cnt {cnt}')
            
            answer = completion.choices[0].message.content if cnt < 5 else ""
            append_to_csv(csv_file, prompt, answer)

: 

In [24]:
query_gpt4(DOW_30, min_past_weeks=1, max_past_weeks=4)

AXP - 0
AXP - 1
AXP - 2
AXP - 3
AXP - 4
AXP - 5
AXP - 6
AXP - 7
AXP - 8
AXP - 9
AXP - 10
AXP - 11
AXP - 12
AXP - 13
AXP - 14
AXP - 15
AXP - 16
AXP - 17
AXP - 18
AXP - 19
AMGN - 0
AMGN - 1
AMGN - 2
AMGN - 3
AMGN - 4
AMGN - 5
AMGN - 6
AMGN - 7
AMGN - 8
AMGN - 9
AMGN - 10
AMGN - 11
AMGN - 12
AMGN - 13
AMGN - 14
AMGN - 15
AMGN - 16
AMGN - 17
AMGN - 18
AMGN - 19
AAPL - 0
AAPL - 1
AAPL - 2
AAPL - 3
AAPL - 4
AAPL - 5
AAPL - 6
AAPL - 7
AAPL - 8
AAPL - 9
AAPL - 10
AAPL - 11
AAPL - 12
AAPL - 13
AAPL - 14
AAPL - 15
AAPL - 16
AAPL - 17
AAPL - 18
AAPL - 19


: 

+ Transform to Llama Training Format

In [None]:
def gpt4_to_llama(symbol, with_basics=True):
    
    csv_file = f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_gpt-4.csv' if with_basics else \
                   f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics_gpt-4.csv'
    
    df = pd.read_csv(csv_file)
    
    prompts, answers, periods, labels = [], [], [], []
    
    for i, row in df.iterrows():
        
        prompt, answer = row['prompt'], row['answer']
        
        res = re.search(r"Then let's assume your prediction for next week \((.*)\) is ((:?up|down) by .*%).", prompt)
        
        period, label = res.group(1), res.group(2)
#         label = label.replace('more than 5', '5+')
        
        prompt = re.sub(
            r"Then let's assume your prediction for next week \((.*)\) is (up|down) by ((:?.*)%). Provide a summary analysis to support your prediction. The prediction result need to be inferred from your analysis at the end, and thus not appearing as a foundational factor of your analysis.", 
            f"Then make your prediction of the {symbol} stock price movement for next week ({period}). Provide a summary analysis to support your prediction.",
            prompt
        )
        try:
            answer = re.sub(
                r"\[Prediction & Analysis\]:\s*",
                f"[Prediction & Analysis]:\nPrediction: {label.capitalize()}\nAnalysis: ",
                answer
            )
        except Exception:
            print(symbol, i)
            print(label)
            print(answer)
            continue
            
        new_system_prompt = SYSTEM_PROMPT.replace(':\n...', '\nPrediction: ...\nAnalysis: ...')
#         new_system_prompt = SYSTEM_PROMPT.replace(':\n...', '\nPrediction: {Up|Down} by {1-2|2-3|3-4|4-5|5+}%\nAnalysis: ...')
        
        prompt = B_INST + B_SYS + new_system_prompt + E_SYS + prompt + E_INST
        
        prompts.append(prompt)
        answers.append(answer)
        periods.append(period)
        labels.append(label)
        
    return {
        "prompt": prompts,
        "answer": answers,
        "period": periods,
        "label": labels,
    }


def create_dataset(symbol_list, train_ratio=0.8, with_basics=True):

    train_dataset_list = []
    test_dataset_list = []

    for symbol in symbol_list:

        data_dict = gpt4_to_llama(symbol, with_basics)
#         print(data_dict['prompt'][-1])
#         print(data_dict['answer'][-1])
        symbols = [symbol] * len(data_dict['label'])
        data_dict.update({"symbol": symbols})

        dataset = Dataset.from_dict(data_dict)
        train_size = round(train_ratio * len(dataset))

        train_dataset_list.append(dataset.select(range(train_size)))
        test_dataset_list.append(dataset.select(range(train_size, len(dataset))))

    train_dataset = datasets.concatenate_datasets(train_dataset_list)
    test_dataset = datasets.concatenate_datasets(test_dataset_list)

    dataset = datasets.DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
    
    return dataset

: 

In [None]:
dow30_v3_dataset = create_dataset(DOW_30, train_ratio=0.9)

: 

+ Test-time Information Fetching

In [None]:
import yfinance as yf
import pandas as pd
from datetime import date, datetime, timedelta


def get_curday():
    
    return date.today().strftime("%Y-%m-%d")


def n_weeks_before(date_string, n):
    
    date = datetime.strptime(date_string, "%Y-%m-%d") - timedelta(days=7*n)
    
    return date.strftime("%Y-%m-%d")


def get_stock_data(stock_symbol, steps):

    stock_data = yf.download(stock_symbol, steps[0], steps[-1])
    
#     print(stock_data)
    
    dates, prices = [], []
    available_dates = stock_data.index.format()
    
    for date in steps[:-1]:
        for i in range(len(stock_data)):
            if available_dates[i] >= date:
                prices.append(stock_data['Close'][i])
                dates.append(datetime.strptime(available_dates[i], "%Y-%m-%d"))
                break

    dates.append(datetime.strptime(available_dates[-1], "%Y-%m-%d"))
    prices.append(stock_data['Close'][-1])
    
    return pd.DataFrame({
        "Start Date": dates[:-1], "End Date": dates[1:],
        "Start Price": prices[:-1], "End Price": prices[1:]
    })


def get_current_basics(symbol, curday):

    basic_financials = finnhub_client.company_basic_financials(symbol, 'all')
    
    final_basics, basic_list, basic_dict = [], [], defaultdict(dict)
    
    for metric, value_list in basic_financials['series']['quarterly'].items():
        for value in value_list:
            basic_dict[value['period']].update({metric: value['v']})

    for k, v in basic_dict.items():
        v.update({'period': k})
        basic_list.append(v)
        
    basic_list.sort(key=lambda x: x['period'])
    
    for basic in basic_list[::-1]:
        if basic['period'] <= curday:
            break
            
    return basic
    

def get_all_prompts_online(symbol, data, curday, with_basics=True):

    company_prompt = get_company_prompt(symbol)

    prev_rows = []

    for row_idx, row in data.iterrows():
        head, news, _ = get_prompt_by_row(symbol, row)
        prev_rows.append((head, news, None))
        
    prompt = ""
    for i in range(-len(prev_rows), 0):
        prompt += "\n" + prev_rows[i][0]
        sampled_news = sample_news(
            prev_rows[i][1],
            min(5, len(prev_rows[i][1]))
        )
        if sampled_news:
            prompt += "\n".join(sampled_news)
        else:
            prompt += "No relative news reported."
        
    period = "{} to {}".format(curday, n_weeks_before(curday, -1))
    
    if with_basics:
        basics = get_current_basics(symbol, curday)
        basics = "Some recent basic financials of {}, reported at {}, are presented below:\n\n[Basic Financials]:\n\n".format(
            symbol, basics['period']) + "\n".join(f"{k}: {v}" for k, v in basics.items() if k != 'period')
    else:
        basics = "[Basic Financials]:\n\nNo basic financial reported."

    info = company_prompt + '\n' + prompt + '\n' + basics
    prompt = info + f"\n\nBased on all the information before {curday}, let's first analyze the positive developments and potential concerns for {symbol}. Come up with 2-4 most important factors respectively and keep them concise. Most factors should be inferred from company related news. " \
        f"Then make your prediction of the {symbol} stock price movement for next week ({period}). Provide a summary analysis to support your prediction."
        
    return info, prompt

: 

In [17]:
ticker = "AAPL"
n_weeks = 2
curday = get_curday()
steps = [n_weeks_before(curday, n) for n in range(n_weeks + 1)][::-1]

data = get_stock_data(ticker, steps)

data = get_news(ticker, data)

data['Basics'] = [json.dumps({})] * len(data)

: 

In [None]:
info, prompt = get_all_prompts_online(ticker, data, curday, False)

### Have a look at the prompt
print(prompt) 

: 

In [18]:
# # fetch data from hugging face

# from datasets import load_dataset

# ds = load_dataset("FinGPT/fingpt-forecaster-dow30-202305-202405")

# ds

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/581 [00:00<?, ?B/s]

data/train-00000-of-00001-7c4c80aa07272d(…):   0%|          | 0.00/3.57M [00:00<?, ?B/s]

(…)-00000-of-00001-28531804b005ddc6.parquet:   0%|          | 0.00/925k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1230 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'answer', 'period', 'label', 'symbol'],
        num_rows: 1230
    })
    test: Dataset({
        features: ['prompt', 'answer', 'period', 'label', 'symbol'],
        num_rows: 300
    })
})

: 

### 2. Fine-tune LLM

This is the core part of fine-tuning, which needs **DeepSpeed** to help you manage your VRAM while training on GPU(s). Since you need a brand new subprocess to launch DeepSpeed, here we don't provide you with the code for fine-tuning. Instead, we highly suggest you to run `train.sh` on your own terminal.

In [None]:
import subprocess
import textwrap

### Adjust based on your setup
cmd = textwrap.dedent("""
export NCCL_IGNORE_DISABLED_P2P=1
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1
export TOKENIZERS_PARALLELISM=0

ds \
    --include localhost:0 \
    train_lora.py \
    --run_name nasdaq-100-20231231-20241231 \
    --base_model llama2 \
    --dataset fingpt-forecaster-nasdaq-100-20231231-20241231-1-4-06 \
    --max_length 4096 \
    --batch_size 1 \
    --gradient_accumulation_steps 16 \
    --learning_rate 5e-5 \
    --num_epochs 1 \
    --log_interval 10 \
    --warmup_ratio 0.03 \
    --scheduler constant \
    --evaluation_strategy steps \
    --ds_config config.json
""")

subprocess.run(cmd, shell=True, executable="/bin/bash")

: 

### 3. Have a try on your own fine-tuned LLMs

In this part, you can have a try on your fine-tuned models by providing it with some inputs and see their responses. 

If you made it, congratulations!

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import re

: 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cache_dir = "./pretrained-models" ### Adjust based on your setup

: 

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    "your_base_model", ### Change to your base model
    trust_remote_code=True,
    device_map="auto",
    cache_dir=cache_dir,
    torch_dtype=torch.float16,   # optional if you have enough VRAM
)

tokenizer = AutoTokenizer.from_pretrained(
    'your_base_model', ### Change to your base model
    cache_dir=cache_dir,
)

model = PeftModel.from_pretrained(
    base_model, 
    'your_finetuned_model', ### Change to your fine-tuned model
    cache_dir=cache_dir, 
    # offload_folder="./offload2/",
    torch_dtype=torch.float16,
    # offload_buffers=True
)
model = model.eval()

: 

In [None]:
prompt = """ 
    Your prompt here
"""

: 

In [None]:
inputs = tokenizer(
    prompt, 
    return_tensors='pt', 
    max_length=4096, 
    padding=False, 
    truncation=True
)
inputs = {key: value.to(model.device) for key, value in inputs.items()}
        
res = model.generate(
    **inputs, max_length=4096, do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True
)
output = tokenizer.decode(res[0], skip_special_tokens=True)
answer = re.sub(r'.*\[/INST\]\s*', '', output, flags=re.DOTALL) # don't forget to import re
print(answer)

: 

### 4. Comparison

Now since you get both fine-tuned LLMs based on DeepSeek, Llama3 and Qwen, here we would like you to do some comparison on selected metrics, to see which of these 2 fine-tuned models performs best.

In [None]:
import os
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from datasets import load_from_disk
from datasets import Dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# from peft import PeftModel
from utils import *
import time
import json, pickle

os.environ["HUGGINGFACE_TOKEN"] = "your huggingface token"

: 

In [None]:
# Llama3
llama3_base_model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-3.1-8B',
    trust_remote_code=True,
    device_map="auto",
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
)

llama3_model = PeftModel.from_pretrained(
    llama3_base_model, 
    'your_finetuned_model', ### Change to your fine-tuned model
    cache_dir=cache_dir, 
    torch_dtype=torch.float16,
)
llama3_model = llama3_model.eval()

llama3_tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Llama-3.1-8B',
    cache_dir=cache_dir,
)
llama3_tokenizer.padding_side = "right"
llama3_tokenizer.pad_token_id = llama3_tokenizer.eos_token_id

: 

In [None]:
# DeepSeek
deepseek_base_model = AutoModelForCausalLM.from_pretrained(
    'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
    trust_remote_code=True,
    device_map="auto",
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
)

deepseek_model = PeftModel.from_pretrained(
    deepseek_base_model, 
    'your_finetuned_model', ### Change to your fine-tuned model
    cache_dir=cache_dir, 
    torch_dtype=torch.float16,
)
deepseek_model = deepseek_model.eval()

deepseek_tokenizer = AutoTokenizer.from_pretrained(
    'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
    cache_dir=cache_dir,
)
deepseek_tokenizer.padding_side = "right"
deepseek_tokenizer.pad_token_id = deepseek_tokenizer.eos_token_id

: 

In [None]:
# test_dataset = load_dataset("your_dataset_name")[0]["test"]

# 直接从 Hugging Face 加载 Dow Jones 30 数据集
from datasets import load_dataset

# 加载数据集（2023年5月 - 2024年5月的数据）
dataset = load_dataset("FinGPT/fingpt-forecaster-dow30-202305-202405")

# 获取测试集
test_dataset = dataset["test"]

print(f"测试集大小: {len(test_dataset)}")
print(f"数据集列名: {test_dataset.column_names}")

: 

In [None]:
def filter_by_ticker(test_dataset, ticker_code):

    filtered_data = []

    for row in test_dataset:
        prompt_content = row['prompt']

        ticker_symbol = re.search(r"ticker\s([A-Z]+)", prompt_content)

        if ticker_symbol and ticker_symbol.group(1) == ticker_code:
            filtered_data.append(row)

    filtered_dataset = Dataset.from_dict({key: [row[key] for row in filtered_data] for key in test_dataset.column_names})

    return filtered_dataset

def get_unique_ticker_symbols(test_dataset):

    ticker_symbols = set()

    for i in range(len(test_dataset)):
        prompt_content = test_dataset[i]['prompt']

        ticker_symbol = re.search(r"ticker\s([A-Z]+)", prompt_content)

        if ticker_symbol:
            ticker_symbols.add(ticker_symbol.group(1))

    return list(ticker_symbols)

def insert_guidance_after_intro(prompt):

    intro_marker = (
        "[INST]<<SYS>>\n"
        "You are a seasoned stock market analyst. Your task is to list the positive developments and "
        "potential concerns for companies based on relevant news and basic financials from the past weeks, "
        "then provide an analysis and prediction for the companies' stock price movement for the upcoming week."
    )
    guidance_start_marker = "Based on all the information before"
    guidance_end_marker = "Following these instructions, please come up with 2-4 most important positive factors"

    intro_pos = prompt.find(intro_marker)
    guidance_start_pos = prompt.find(guidance_start_marker)
    guidance_end_pos = prompt.find(guidance_end_marker)

    if intro_pos == -1 or guidance_start_pos == -1 or guidance_end_pos == -1:
        return prompt

    guidance_section = prompt[guidance_start_pos:guidance_end_pos].strip()

    new_prompt = (
        f"{prompt[:intro_pos + len(intro_marker)]}\n\n"
        f"{guidance_section}\n\n"
        f"{prompt[intro_pos + len(intro_marker):guidance_start_pos]}"
        f"{prompt[guidance_end_pos:]}"
    )

    return new_prompt


def apply_to_all_prompts_in_dataset(test_dataset):

    updated_dataset = test_dataset.map(lambda x: {"prompt": insert_guidance_after_intro(x["prompt"])})

    return updated_dataset

test_dataset = apply_to_all_prompts_in_dataset(test_dataset)

unique_symbols = set(test_dataset['symbol'])

def test_demo(model, tokenizer, prompt):

    inputs = tokenizer(
        prompt, return_tensors='pt',
        padding=False, max_length=8000
    )
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    start_time = time.time()
    res = model.generate(
        **inputs, max_length=4096, do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True
    )
    end_time = time.time()
    output = tokenizer.decode(res[0], skip_special_tokens=True)
    return output, end_time - start_time

def test_acc(test_dataset, modelname):
    answers_base, answers_fine_tuned, gts, times_base, times_fine_tuned = [], [], [], [], []
    if modelname == "llama3":
        base_model = llama3_base_model
        model = llama3_model
        tokenizer = llama3_tokenizer
    elif modelname == "deepseek":
        base_model = deepseek_base_model
        model = deepseek_model
        tokenizer = deepseek_tokenizer
    ### Add other models here
    elif modelname == "your_model_name":  # Add other models as needed
        base_model = your_base_model  # Define your base model
        model = your_finetuned_model  # Define your fine-tuned model
        tokenizer = your_tokenizer     # Define your tokenizer

    for i in tqdm(range(len(test_dataset)), desc="Processing test samples"):
        try:
            prompt = test_dataset[i]['prompt']
            gt = test_dataset[i]['answer']

            output_base, time_base = test_demo(base_model, tokenizer, prompt)
            answer_base = re.sub(r'.*\[/INST\]\s*', '', output_base, flags=re.DOTALL)

            output_fine_tuned, time_fine_tuned = test_demo(model, tokenizer, prompt)
            answer_fine_tuned = re.sub(r'.*\[/INST\]\s*', '', output_fine_tuned, flags=re.DOTALL)

            answers_base.append(answer_base)
            answers_fine_tuned.append(answer_fine_tuned)
            gts.append(gt)
            times_base.append(time_base)
            times_fine_tuned.append(time_fine_tuned)

        except Exception as e:
            print(f"Error processing sample {i}: {e}")
    return answers_base, answers_fine_tuned, gts, times_base, times_fine_tuned

: 

In [None]:
### Llama3 Result Evaluating

llama3_answers_base, llama3_answers_fine_tuned, llama3_gts, llama3_base_times, llama3_fine_tuned_times = test_acc(test_dataset, "llama3")
llama3_base_metrics = calc_metrics(llama3_answers_base, llama3_gts)
llama3_fine_tuned_metrics = calc_metrics(llama3_answers_fine_tuned, llama3_gts)

with open("./comparison_results/llama3_base_metrics.pkl", "wb") as f:
    pickle.dump(llama3_base_metrics, f)

with open("./comparison_results/llama3_fine_tuned_metrics.pkl", "wb") as f:
    pickle.dump(llama3_fine_tuned_metrics, f)

with open("./comparison_results/llama3_base_times.pkl", "wb") as f:
    pickle.dump(llama3_base_times, f)

with open("./comparison_results/llama3_fine_tuned_times.pkl", "wb") as f:
    pickle.dump(llama3_fine_tuned_times, f)

: 

In [None]:
### DeepSeek Result Evaluating

deepseek_answers_base, deepseek_answers_fine_tuned, deepseek_gts, deepseek_base_times, deepseek_fine_tuned_times = test_acc(test_dataset, "deepseek")
deepseek_base_metrics = calc_metrics(deepseek_answers_base, deepseek_gts)
deepseek_fine_tuned_metrics = calc_metrics(deepseek_answers_fine_tuned, deepseek_gts)

with open("./comparison_results/deepseek_base_metrics.pkl", "wb") as f:
    pickle.dump(deepseek_base_metrics, f)

with open("./comparison_results/deepseek_fine_tuned_metrics.pkl", "wb") as f:
    pickle.dump(deepseek_fine_tuned_metrics, f)

with open("./comparison_results/deepseek_base_times.pkl", "wb") as f:
    pickle.dump(deepseek_base_times, f)

with open("./comparison_results/deepseek_fine_tuned_times.pkl", "wb") as f:
    pickle.dump(deepseek_fine_tuned_times, f)

: 

In [None]:
### Comparing Llama3 and DeepSeek Results

comparison_matrics = calc_metrics(llama3_answers_fine_tuned, deepseek_answers_fine_tuned) ### Change based on your models

with open("./comparison_results/comparison_matrics.pkl", "wb") as f:
    pickle.dump(comparison_matrics, f)

: 