# Problem Statement: **LLM for financial text understanding.**

Application of the project: This project is intended to understand financial text, answer finance-related questions, and recommend stocks.



Project Description: The project combines short-term and long-term financial data, news articles, and stock prices to build a short-term and long-term financial understanding. The methodology diagram is given below.

![Methodology Diagram](meth_diag.jpg)


# 1: Import Packages and Install Required Libraries

```python
# Install necessary libraries
!pip install transformers torch pandas numpy scikit-learn

# Import statements
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.model_selection import train_test_split
```

## 2: Data Collection Short term - Sample SBI Bank 


### 2.1 preliminary Preprocessing

In [None]:
import pandas as pd

# Load the news and stock prices data
news_file_path = 'Nifty50_news_data.csv'
stock_prices_file_path = 'SBI_BANK.csv'

df = pd.read_csv(news_file_path)

# Regular expressions for the ways an article may refer to SBI
patterns = [
    r'\bSBI Bank\b',
    r'\bState Bank of India\b',
    r'\bSBI\b',
    r'\bSBI\s+Bank\b'
]
combined_pattern = '|'.join(patterns)

# Keep only the articles that mention SBI in the headline, description or body.
# .copy() makes the filtered frame independent of the original, so the
# 'Formatted Date' column added below is not written to a slice of another
# DataFrame (which triggers pandas' SettingWithCopyWarning).
df = df[
    df['headline'].str.contains(combined_pattern, case=False, regex=True, na=False) |
    df['description'].str.contains(combined_pattern, case=False, regex=True, na=False) |
    df['articleBody'].str.contains(combined_pattern, case=False, regex=True, na=False)
].copy()

# Load stock prices data
stock_prices_df = pd.read_csv(stock_prices_file_path)

# Normalise 'datePublished' to a 'YYYY-MM-DD' string (unparseable values become missing)
df['Formatted Date'] = pd.to_datetime(df['datePublished'], errors='coerce').dt.strftime('%Y-%m-%d')

# Parse the stock price dates (stored as DD-MM-YYYY) and put the rows in
# chronological order: the .head(8) window below only means "the article day
# and the next 7 trading days" when the prices are sorted by ascending date.
stock_prices_df['Date'] = pd.to_datetime(stock_prices_df['Date'], format='%d-%m-%Y')
stock_prices_df = stock_prices_df.sort_values('Date', kind='stable')

# Initialize lists to hold the final dataset and volatilities
final_data = []
all_volatilities = []

# Iterate over each filtered article
for index, article in df.iterrows():
    article_date = pd.to_datetime(article['Formatted Date'], errors='coerce')

    # An article without a usable publication date cannot be aligned with prices
    if pd.isna(article_date):
        continue

    # Get the stock prices for the day of the article and the next 7 trading days
    stock_subset = stock_prices_df[stock_prices_df['Date'] >= article_date].head(8)

    # If there are not enough days in the stock data, skip this article
    if len(stock_subset) < 8:
        continue

    # Extract the closing prices (day t .. day t+7, i.e. 8 values)
    closing_prices = stock_subset['Close'].values

    # Volatility = standard deviation of the 8 closing prices in the window
    volatility = closing_prices.std()
    all_volatilities.append(volatility)

    # Calculate the overall change from day t to day t+7
    overall_change = closing_prices[-1] - closing_prices[0]

    # Generate day-by-day comparison text for the 7 consecutive price pairs
    comparison_text = []
    for i in range(7):
        if closing_prices[i + 1] > closing_prices[i]:
            comparison_text.append(f"increases from day {i+1} to day {i+2}")
        elif closing_prices[i + 1] < closing_prices[i]:
            comparison_text.append(f"decreases from day {i+1} to day {i+2}")
        else:
            comparison_text.append(f"remains almost the same from day {i+1} to day {i+2}")

    # Create a new row for the final dataset
    row = {
        'Article Date': article_date.strftime('%Y-%m-%d'),
        'headline': article['headline'],
        'Description': article['description'],
        'articleBody': article['articleBody'],
        'Comparison Text': ', '.join(comparison_text) + '.',
        'Overall Change (Day t to Day t+7)': overall_change,
        'Volatility': volatility
    }

    # Add the closing price of each day in the window as 'Stock Day 0' .. 'Stock Day 7'
    for i, price in enumerate(closing_prices):
        row[f'Stock Day {i}'] = price

    final_data.append(row)

# Convert the final dataset list to a DataFrame
final_df = pd.DataFrame(final_data)

# Mean volatility across all kept articles; used as the reference level for classification
mean_volatility = pd.Series(all_volatilities).mean()

# Map a volatility value onto a five-level label relative to the mean volatility
def classify_volatility(volatility, mean_volatility):
    """
    Label a volatility value by comparing it with multiples of the mean.

    Parameters:
    - volatility: The volatility value to classify.
    - mean_volatility: The reference (mean) volatility.

    Output:
    - 'Very Low' below 0.5x the mean, 'Low' below 0.75x, 'Medium' below 1.25x,
      'High' below 1.5x, and 'Very High' otherwise.
    """
    # Upper bound of each band (as a multiple of the mean), in ascending order
    upper_bounds = (
        (0.5, 'Very Low'),
        (0.75, 'Low'),
        (1.25, 'Medium'),
        (1.5, 'High'),
    )
    for factor, label in upper_bounds:
        if volatility < factor * mean_volatility:
            return label
    return 'Very High'

# Attach the volatility sentence, the overall-change sentence and their
# combination ('LLM Text') to every row, then write the dataset to disk
change_column = 'Overall Change (Day t to Day t+7)'

for index, row in final_df.iterrows():
    # Sentence describing the volatility band of this row
    volatility_classification = classify_volatility(row['Volatility'], mean_volatility)
    volatility_text = f"Volatility is {volatility_classification}."
    final_df.loc[index, 'Volatility Text'] = volatility_text

    # Sentence describing the direction and size of the price move
    delta = row[change_column]
    if delta > 0:
        overall_change_text = f"There is an increase in stock value by {delta:.2f} units from day t to day t+7."
    elif delta < 0:
        overall_change_text = f"There is a decrease in stock value by {abs(delta):.2f} units from day t to day t+7."
    else:
        overall_change_text = "There is no change in stock value from day t to day t+7."

    final_df.loc[index, 'Overall Change Text'] = overall_change_text
    final_df.loc[index, 'LLM Text'] = f"{volatility_text} {overall_change_text}"

# Persist the enriched dataset for the later notebook sections
final_df.to_csv('Articles_with_Stock_Prices_and_LLM_Text.csv', index=False)

print("Final dataset saved successfully.")


### 2.2 Sentiment Analysis

In [None]:
import pandas as pd  
from transformers import pipeline  

# Load the dataset written by the preprocessing step (section 2.1). That step
# saves the file to the working directory, so it is read from there; the
# previous leading '/' pointed at the filesystem root instead.
df = pd.read_csv('Articles_with_Stock_Prices_and_LLM_Text.csv')

# Initialize a sentiment analysis pipeline from the Hugging Face transformers library.
# No model is named here, so the library's default sentiment model is used.
# NOTE(review): the code below assumes its labels are 'POSITIVE' / 'NEGATIVE' - confirm for the installed version.
sentiment_analyzer = pipeline("sentiment-analysis")

# Translate a (score, label) pair from the sentiment analyzer into a descriptive category
def categorize_sentiment(score, label):
    """
    Turn the analyzer's label and confidence score into a finer-grained category.

    Parameters:
    - score: The confidence score attached to the label.
    - label: The sentiment label, e.g. 'POSITIVE', 'NEGATIVE' or the placeholder 'NA'.

    Output:
    - 'NA' for the 'NA' placeholder label.
    - 'highly positive' / 'highly negative' when the score is above 0.85,
      otherwise plain 'positive' / 'negative'.
    - 'neutral' for every other label.
    """
    # Placeholder produced when no analysis could be performed
    if label == 'NA':
        return "NA"

    if label == 'POSITIVE':
        polarity = "positive"
    elif label == 'NEGATIVE':
        polarity = "negative"
    else:
        # Any label that is neither positive nor negative is treated as neutral
        return "neutral"

    # A confidence above 0.85 upgrades the polarity to its 'highly' variant
    return "highly " + polarity if score > 0.85 else polarity


In [None]:
# Extract the 'Description' column from the DataFrame as a list of texts to analyze
articles = df['Description'].tolist()

# One entry per article, in the same order as 'articles'
sentiment_results = []

for article in articles:
    try:
        # Only the analyzer call is guarded, so unrelated errors are not hidden
        result = sentiment_analyzer(article)
    except Exception as e:
        # Report which article failed and why, then continue with the next one
        print(f"Error analyzing article: {article}")
        print(f"Reason: {type(e).__name__}: {e}")

        # Placeholder result so that 'sentiment_results' stays aligned with 'articles'
        result = [{'label': 'NA', 'score': 1}]

    sentiment_results.append(result)


In [None]:
# Create the 'Sentiment' column: one descriptive sentence per article,
# or 'NA' where no sentiment could be determined, then save the result
def _sentiment_sentence(res):
    """Return the sentence for one analyzer result (a list whose first item has 'label' and 'score')."""
    top = res[0]
    if top['label'] == 'NA':
        return 'NA'
    return "The overall sentiment of the news is " + categorize_sentiment(top['score'], top['label'])


df['Sentiment'] = [_sentiment_sentence(res) for res in sentiment_results]

df.to_csv('Articles_with_Sentiment_Analysis.csv', index=False)

### 2.3 preprocessing for LLM input 

In [None]:
import pandas as pd

def process_csv(input_file_path, output_file_path):
    """
    Process a CSV file to create a formatted dataset with prompt and output columns
    for fine-tuning an LLM model.

    The input file must contain the columns 'headline', 'Description',
    'articleBody' and 'LLM Text'.

    Args:
        input_file_path (str): Path to the input CSV file.
        output_file_path (str): Path to save the processed CSV file.

    Returns:
        DataFrame: Processed DataFrame with 'prompt' and 'output' columns.
    """
    # Read the CSV file
    df = pd.read_csv(input_file_path)

    # Build the 'prompt' column by wrapping headline, description and body in an
    # instruction. The column is named 'prompt' because that is the name selected
    # below and the name the fine-tuning code reads.
    df['prompt'] = (
        "This is a news article related to a firm I'm considering for investment.\n\n" +
        df['headline'].fillna('') + " " +
        df['Description'].fillna('') + " " +
        df['articleBody'].fillna('') + "\n\n" +
        "Can you please analyze it, predict the behavior of the stock, and suggest whether to buy or sell?"
    )

    # Define a function to process the LLM Text column into the 'output' format
    def process_llm_text(text):
        """
        Modify the LLM Text to create the 'output' column, adding context and recommendations.

        Args:
            text (str): Original LLM Text.

        Returns:
            str: Processed text with recommendations for buy/sell.
        """
        # Start with a standard prefix to provide context
        processed_text = "After carefully analyzing the provided information, my prediction is as follows:\n\n"

        # Replace time reference "from day t to day t+7" with "in the next 7 days"
        text = text.replace('from day t to day t+7', 'in the next 7 days')

        # Add recommendation based on the direction mentioned in the text
        if 'decrease' in text.lower():
            text += " Based on this prediction, I would suggest selling the stock as its price is expected to decrease."
        elif 'increase' in text.lower():
            text += " Based on this prediction, I would suggest buying the stock as its price is expected to increase."
        else:
            text += " The analysis does not indicate a strong buy or sell recommendation."

        return processed_text + text

    # Apply the function to the LLM Text column to generate the 'output' column
    df['output'] = df['LLM Text'].apply(process_llm_text)

    # Select only the required columns ('prompt' and 'output')
    result_df = df[['prompt', 'output']]

    # Save the result to a new CSV file
    result_df.to_csv(output_file_path, index=False)
    print(f"Processed dataset saved to {output_file_path}")

    return result_df

# Build the short-term LLM dataset from the article/price CSV and save it to 'Short_term_LLM_input.csv'
df = process_csv('Articles_with_Stock_Prices_and_LLM_Text.csv', 'Short_term_LLM_input.csv')



## 3: Data Collection Long Term - Sample ASIANPAINT




### 3.1 code for data scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_quarterly_results(url):
    """
    Download a page and extract every alternate column of its results table.

    The first table with class 'data-table responsive-text-nowrap' is used
    (presumably the quarterly results table of the company page - verify
    against the site layout).

    Args:
        url (str): Address of the page to scrape.

    Returns:
        DataFrame: One row per table row; the first column ('Metric') holds the
        row label and the remaining columns hold every second value column.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the expected table is not present in the page.
    """
    # A timeout keeps the call from hanging forever on an unresponsive server
    response = requests.get(url, timeout=30)
    # Fail early on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the results table
    table = soup.find('table', class_='data-table responsive-text-nowrap')
    if table is None:
        raise ValueError(f"No results table found at {url}")

    # Headers: skip the first (label) cell, then keep every alternate column
    header_row = table.find('tr')
    headers = ['Metric'] + [th.text.strip() for th in header_row.find_all('th')[1::2]]

    # Data rows: same column selection as the headers so values stay aligned
    data = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        if cols:
            metric = cols[0].text.strip()
            data.append([metric] + [col.text.strip() for col in cols[1::2]])

    # Create a DataFrame
    df = pd.DataFrame(data, columns=headers)
    return df

# Company identifier used both in the page URL and in the output file name
name = 'ASIANPAINT'
# URL of the target page: the company's page on screener.in
url = 'https://www.screener.in/company/' + name + '/'

# Scrape the table into a DataFrame
results_df = scrape_quarterly_results(url)

# Display the results
print(results_df)

# Save to '<name>_quarterly_results.csv' in the working directory
results_df.to_csv(name + '_quarterly_results.csv', index=False)

### 3.2 Preprocessing

Combine the outputs for all the companies to generate long_term_stock_data.csv.

In [None]:
import pandas as pd
import yfinance as yf
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time

def get_stock_symbol(company_name):
    """Build the NSE stock symbol ('<NAME>.NS') from a company name or results-file stem."""
    # Drop surrounding whitespace, then the '_quarterly_results' file-name suffix
    base_name = company_name.strip()
    base_name = base_name.replace('_quarterly_results', '')
    return f"{base_name}.NS"

def get_next_6_months_prices(symbol, start_date):
    """
    Get one closing price for each of the 6 months following the given date.

    For every month, the price history of a short window starting on the first
    day of that month is requested and the first available close is taken.

    Args:
        symbol (str): Ticker symbol passed to yfinance.
        start_date: Reference date; months 1..6 after it are queried.

    Returns:
        list: Six entries (month 1 .. month 6). An entry is None when no price
        was returned for that month's window; all six are None if any request
        raised an exception.
    """
    try:
        # The ticker handle does not depend on the month, so create it once
        stock = yf.Ticker(symbol)
        prices = []

        for i in range(1, 7):  # Get next 6 months
            # First day of the i-th month after start_date, plus a 5-day window
            # so that weekends/holidays at the start of the month are covered
            window_start = (start_date + relativedelta(months=i)).replace(day=1)
            window_end = window_start + relativedelta(days=5)

            history = stock.history(start=window_start, end=window_end)
            prices.append(None if history.empty else history['Close'].iloc[0])

            time.sleep(0.5)  # Pause to respect API rate limits

        return prices

    except Exception as e:
        print(f"Error fetching data for {symbol}: {str(e)}")
        return [None] * 6

def process_csv_data(input_file, output_file):
    """
    Process CSV input and create CSV output with stock prices.

    Reads the unique ('Company', 'Date') pairs from the input file, fetches the
    prices of the following six months for each pair and writes one row per
    pair with the columns 'Company', 'Date', 'Price_1' .. 'Price_6'.

    Args:
        input_file (str): Path of the input CSV (needs 'Company' and 'Date' columns).
        output_file (str): Path of the CSV to write.

    Returns:
        DataFrame with the written rows, or None if processing failed
        (the error is printed rather than raised).
    """
    try:
        # Read input CSV
        df = pd.read_csv(input_file)
        print("Available columns in the CSV:", df.columns.tolist())

        # Get unique company-date combinations
        unique_combinations = df[['Company', 'Date']].drop_duplicates()
        total_combinations = len(unique_combinations)

        rows = []
        pairs = zip(unique_combinations['Company'], unique_combinations['Date'])

        # enumerate() gives a true 1..N progress counter; the DataFrame index
        # keeps its original labels after drop_duplicates() and is not sequential
        for position, (company, date_str) in enumerate(pairs, start=1):
            print(f"Processing {position}/{total_combinations}: {company} for date {date_str}")

            try:
                base_date = pd.to_datetime(date_str)
            except Exception as e:
                # Skip pairs whose date cannot be parsed
                print(f"Error parsing date '{date_str}' for company '{company}': {str(e)}")
                continue

            symbol = get_stock_symbol(company)
            prices = get_next_6_months_prices(symbol, base_date)

            new_row = {
                'Company': company,
                'Date': date_str
            }

            # Add only prices to the output
            for i, price in enumerate(prices, 1):
                new_row[f'Price_{i}'] = price

            rows.append(new_row)

        if not rows:
            raise ValueError("No data was processed successfully")

        output_df = pd.DataFrame(rows)
        output_df.to_csv(output_file, index=False)

        # Report how many pairs were actually written (some may have been skipped)
        print(f"Successfully processed {len(rows)} of {total_combinations} combinations and saved to {output_file}")
        return output_df

    except Exception as e:
        print(f"Error processing CSV: {str(e)}")
        return None

# Example usage
if __name__ == "__main__":
    input_file = "consolidated_nifty50_data.csv"  # Your input CSV file path
    output_file = "output_all_prices.csv"  # Your output CSV file path
    
    results = process_csv_data(input_file, output_file)
    
    if results is not None:
        print("\nFirst few rows of processed data:")
        print(results.head())

In [None]:
import pandas as pd
import numpy as np

def clean_numeric_string(value):
    """
    Convert a number or a formatted numeric string to float.

    Handles thousands separators (',') and the currency markers '₹', 'Rs.' and
    'Rs'. Python and NumPy integer/float scalars are converted directly.

    Args:
        value: A number, a string such as '1,234.5' or 'Rs. 1,200', or anything else.

    Returns:
        float: The parsed value, or NaN when the input is not a number and
        cannot be parsed as one.
    """
    # np.integer / np.floating cover NumPy scalars that are not Python int/float subclasses
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if isinstance(value, str):
        # Remove thousands separators and currency markers, then trim whitespace.
        # 'Rs.' must be removed before 'Rs': otherwise 'Rs.5' would leave '.5'
        # behind and be parsed as 0.5.
        cleaned = value.replace(',', '').replace('₹', '')
        cleaned = cleaned.replace('Rs.', '').replace('Rs', '').strip()
        try:
            return float(cleaned)
        except ValueError:
            return np.nan
    return np.nan

def parse_quarter_date(date_str):
    """
    Convert dates like 'Dec-21' to datetime.

    The strict '%b-%y' format is tried first. If that fails, a fallback accepts
    longer month names ('December-21', 'Sept-21') and four-digit years
    ('Dec-2021'). Two-digit years are read as 20xx.

    Args:
        date_str: Quarter label of the form '<month>-<year>'.

    Returns:
        The first day of that month as a pandas Timestamp, or pd.NaT when the
        value cannot be interpreted.
    """
    try:
        return pd.to_datetime(date_str, format='%b-%y')
    except Exception:
        # Not in the strict short format; try the more lenient fallback below
        pass

    month_dict = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    try:
        parts = date_str.split('-')
        # Reduce any month spelling to its three-letter abbreviation
        month = parts[0].strip().title()[:3]
        year = parts[1].strip()
        if len(year) == 2 and year.isdigit():
            year = '20' + year  # Convert '21' to '2021'
        elif not (len(year) == 4 and year.isdigit()):
            # Anything other than a 2- or 4-digit year is not a quarter label
            return pd.NaT
        month_num = month_dict[month]
        return pd.to_datetime(f"{year}-{month_num}-01", format='%Y-%m-%d')
    except Exception:
        return pd.NaT

def generate_comparison_text(current_row, prev_row):
    """
    Generate comparison text for financial metrics between two specific quarters.

    For each known metric present in both rows, one sentence is produced stating
    whether the metric increased, decreased or stayed virtually unchanged.

    Args:
        current_row: Mapping (dict or Series) of metric name -> value for the current quarter.
        prev_row: Same for the previous quarter, or None.

    Returns:
        str: The sentences joined by spaces; "No previous quarter data available"
        when prev_row is None; "No comparison available" when no metric could
        be compared.
    """
    if prev_row is None:
        return "No previous quarter data available"

    # Dictionary of factors and their display names
    factors = {
        'Depreciation': 'Depreciation',
        'EPS in Rs': 'EPS',
        'Expenses +': 'Expenses',
        'Interest': 'Interest',
        'Net Profit +': 'Net Profit',
        'Profit before tax': 'Profit Before Tax'
    }

    changes = []

    for column, display_name in factors.items():
        if column not in current_row or column not in prev_row:
            continue

        current_value = clean_numeric_string(current_row[column])
        prev_value = clean_numeric_string(prev_row[column])

        # A relative change needs both values and a non-zero base
        if pd.isna(current_value) or pd.isna(prev_value) or prev_value == 0:
            continue

        # Divide by the magnitude of the previous value so the sign of the
        # result always follows the actual direction of the move, even when
        # the previous value is negative (e.g. a loss turning into a profit).
        pct_change = (current_value - prev_value) / abs(prev_value)

        if abs(pct_change) < 0.001:
            description = f"{display_name} remained virtually unchanged."
        else:
            direction = "increased" if pct_change > 0 else "decreased"
            times = 1 + pct_change

            # The "N times" wording is only meaningful as a ratio of two
            # positive values; everything else is reported as a percentage.
            if prev_value > 0 and times >= 2:
                description = f"{display_name} {direction} by {times:.1f} times."
            else:
                pct_change_formatted = abs(pct_change) * 100
                description = f"{display_name} {direction} by {pct_change_formatted:.1f}%."

        changes.append(description)

    return " ".join(changes) if changes else "No comparison available"

def analyze_quarterly_changes(df):
    """
    Describe, for every company, how each quarter compares with the one before it.

    Parameters:
    df: DataFrame with 'Company' and 'Date' columns plus the financial metric columns

    Returns:
    DataFrame with one row per (company, quarter) that has a preceding quarter,
    holding 'Company', 'Date' and the generated 'quarterly_comparison' text.
    """
    # Work on a private copy so the caller's frame is left untouched
    frame = df.copy()

    # Metric columns that may arrive as formatted strings and need to be numeric
    metric_columns = (
        'Depreciation', 'EPS in Rs', 'Expenses +', 'Interest',
        'Net Profit +', 'Profit before tax',
    )
    for metric in metric_columns:
        if metric in frame.columns:
            frame[metric] = frame[metric].apply(clean_numeric_string)

    # Parse the dates and order the rows by company, then chronologically
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame = frame.sort_values(['Company', 'Date'])
    print(frame.head())

    records = []

    # Handle one company at a time
    for company in frame['Company'].unique():
        quarters = frame[frame['Company'] == company].copy().sort_values('Date')
        quarter_rows = [quarter for _, quarter in quarters.iterrows()]

        # Pair every quarter with its immediate predecessor
        for previous, current in zip(quarter_rows, quarter_rows[1:]):
            records.append({
                'Company': company,
                'Date': current['Date'],
                'quarterly_comparison': generate_comparison_text(current, previous)
            })

    return pd.DataFrame(records)


### 3.3 preprocessing for LLM input 

In [None]:
import pandas as pd

def process_long_term_data(input_file_path, output_file_path):
    """
    Process long-term stock data to create a formatted dataset for LLM fine-tuning.
    Includes volatility, overall change, and long-term investment recommendation.

    The input file must contain the columns 'Company', 'Date',
    'quarterly_comparison' and 'Price_1' .. 'Price_6'.

    Args:
        input_file_path (str): Path to the input CSV file.
        output_file_path (str): Path to save the processed CSV file.

    Returns:
        DataFrame: Processed DataFrame with 'input' and 'output' columns.
    """
    # Read the CSV file
    df = pd.read_csv(input_file_path)

    price_columns = ['Price_1', 'Price_2', 'Price_3', 'Price_4', 'Price_5', 'Price_6']
    prices = df[price_columns]

    # Calculate volatility (standard deviation of the available prices) for each row
    df['Volatility'] = prices.std(axis=1, skipna=True)

    # Overall change: percentage change from Price_1 to the last available price.
    # Forward-filling along the row carries the most recent non-missing price into
    # the later columns, so the final column holds the last available price.
    # (A backward fill cannot do this: a missing Price_6 has nothing after it to
    # fill from and would stay missing.)
    last_available_price = prices.ffill(axis=1).iloc[:, -1]
    df['Overall Change'] = (last_available_price - df['Price_1']) / df['Price_1'] * 100

    # Determine volatility level based on overall dataset mean volatility
    mean_volatility = df['Volatility'].mean()

    def classify_volatility(volatility):
        """Label a volatility value relative to the dataset's mean volatility."""
        if volatility < 0.5 * mean_volatility:
            return 'Very Low'
        if volatility < 0.75 * mean_volatility:
            return 'Low'
        if volatility < 1.25 * mean_volatility:
            return 'Medium'
        if volatility < 1.5 * mean_volatility:
            return 'High'
        return 'Very High'

    df['Volatility Level'] = df['Volatility'].apply(classify_volatility)

    # Create the 'input' column for LLM fine-tuning
    df['input'] = (
        "This is a quarterly financial report of the company, I'm considering for a long-term investment.\n\n" +
        df['Company'] + "\n" +
        "Date of Report: " + df['Date'] + "\n" +
        "Summary of Quarterly Changes:\n" + df['quarterly_comparison'] + "\n\n" +
        "Please analyze this report and predict the behavior of the stock over the next six months, "
        "providing a recommendation to buy or sell."
    )

    # Generate the 'output' column for LLM based on the price trend and volatility
    def generate_output(row):
        """Compose the target answer (trend, volatility, recommendation) for one row."""
        # Starting text for the LLM output
        output_text = "After analyzing the quarterly report and six-month stock data, my observations are as follows:\n\n"

        # Determine the overall trend
        if row['Overall Change'] > 0:
            trend_text = f"The stock has shown an overall increase of {row['Overall Change']:.2f}% over the last six months."
            recommendation = "Based on this, I would suggest buying the stock as it shows potential for growth."
        elif row['Overall Change'] < 0:
            trend_text = f"The stock has shown an overall decrease of {abs(row['Overall Change']):.2f}% over the last six months."
            recommendation = "Based on this, I would suggest selling the stock as it shows signs of decline."
        else:
            # NOTE(review): a missing 'Overall Change' (e.g. Price_1 absent) also lands here - confirm this is intended
            trend_text = "The stock has remained relatively stable over the last six months."
            recommendation = "It may be advisable to hold off on buying or selling based on the stability of the stock."

        # Volatility description
        volatility_text = f"The stock's volatility over this period is categorized as {row['Volatility Level']}."

        # Combine everything into the output
        return output_text + trend_text + "\n" + volatility_text + "\n\n" + recommendation

    # Apply the function to generate the 'output' column
    df['output'] = df.apply(generate_output, axis=1)

    # Select only the required columns
    result_df = df[['input', 'output']]

    # Save the result to a new CSV file
    result_df.to_csv(output_file_path, index=False)
    print(f"Processed long-term dataset saved to {output_file_path}")

    return result_df

# Build the long-term LLM dataset from 'long_term_stock_data.csv' and save it to 'long_term_LLM_input.csv'
df = process_long_term_data('long_term_stock_data.csv', 'long_term_LLM_input.csv')


## 4 Final input for Fine-Tuning
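Combine both LLM input files (short-term and long-term) into a single file named Formatted_news_input_LLM.csv. The fine-tuning code below reads the `prompt` and `output` columns of this file.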

## 5 LLM Fine-Tuning Code

In [None]:
#  Run on colab first to get the requirements.txt
# !pip freeze > requirements.txt

In [None]:
!pip install datasets peft

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType



In [None]:
# Base checkpoint to fine-tune, identified by its Hugging Face Hub name
model_name = "ChanceFocus/finma-7b-nlp"
# use_fast=False requests the slow (Python) tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Weights are loaded in half precision (float16)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

In [None]:
# Configure LoRA: rank-8 adapters on the attention query/key/value projections
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj"]
)


# Apply LoRA to the model (rebinds 'model' to the PEFT-wrapped model)
model = get_peft_model(model, peft_config)

# Load the combined fine-tuning CSV; the preprocessing step reads its 'prompt' and 'output' columns
df = pd.read_csv("Formatted_news_input_LLM.csv")

# Create a dataset from the DataFrame
dataset = Dataset.from_pandas(df)



In [None]:
# Preprocess function without `legacy=False`
def preprocess_function(examples):
    """
    Tokenize a batch of prompt/output pairs into fixed-length training examples.

    Args:
        examples: Batch from the dataset with 'prompt' and 'output' columns
            (each a list of strings).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        'labels' entry for causal-LM training.
    """
    inputs = [f"Prompt: {prompt}\nOutput: {output}" for prompt, output in zip(examples["prompt"], examples["output"])]

    # Tokenize with truncation and padding
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Replace out-of-vocabulary tokens with the unknown token ID (for any IDs exceeding vocab size)
    max_vocab_size = tokenizer.vocab_size
    model_inputs["input_ids"] = [
        [token_id if token_id < max_vocab_size else tokenizer.unk_token_id for token_id in ids]
        for ids in model_inputs["input_ids"]
    ]

    # Labels are the input IDs, except at padding positions (attention mask 0),
    # which are set to -100 so the loss ignores them; otherwise the model would
    # also be trained to predict the padding that fills every short example.
    model_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Apply preprocessing in batches and drop the original text columns, leaving only the tokenizer outputs and labels
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# Hold out 10% of the examples as the evaluation split ('train' / 'test')
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)


In [None]:
# Set up training arguments
# Effective batch size per device is 4 x 4 = 16 (batch size x gradient accumulation steps).
# Evaluation and checkpointing both happen every 500 steps, so load_best_model_at_end
# has an evaluated checkpoint to restore.
# NOTE(review): the model is loaded in float16 and fp16=True is also set here - confirm
# this combination trains without gradient-scaling errors on the installed versions.
# NOTE(review): 'evaluation_strategy' may be named differently in newer transformers releases - verify.
training_args = TrainingArguments(
    output_dir="./finma-7b-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
)



In [None]:
# Initialize Trainer with the LoRA-wrapped model and the train/test splits created above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)



In [None]:
# Start training
trainer.train()

# Save the fine-tuned model and the tokenizer to the same directory used as the training output_dir
model.save_pretrained("./finma-7b-finetuned")
tokenizer.save_pretrained("./finma-7b-finetuned")


## 6 Inferencing and Testing

## 7 Metrics

In [None]:
import pandas as pd

# Derive the expected recommendation letter from the price move relative to the day-0 price
def calculate_expected_value(overall_change, stock_day_0):
    """
    Return "B", "S" or "H" for a given price change.

    Args:
        overall_change: Price change over the window.
        stock_day_0: Price on day 0, used to express the change as a fraction.

    Returns:
        "H" unless the absolute relative change exceeds 0.001 (0.1%);
        otherwise "B" for a positive change and "S" for a negative one.
    """
    relative_move = abs(overall_change / stock_day_0)

    # Moves that do not exceed 0.1% of the starting price count as "no move"
    if not relative_move > 0.001:
        return "H"

    if overall_change > 0:
        return "B"
    return "S"

# Function to calculate accuracy
def calculate_accuracy(file_path):
    """
    Compare expected and actual recommendation letters and report the accuracy.

    The CSV must contain 'Overall Change (Day t to Day t+7)', 'Stock Day 0' and
    'actual_output'. Only the first character of 'actual_output' is compared
    with the expected letter. A review file with the expected value, the
    correctness flag and the actual output of every row is written to
    'output_with_expected_actual.csv'.

    Args:
        file_path (str): Path to the CSV file to evaluate.

    Returns:
        float: Accuracy as a percentage (0.0 when the file has no rows).
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    expected_values = []
    is_correct = []
    for _, row in df.iterrows():
        expected_value = calculate_expected_value(
            row['Overall Change (Day t to Day t+7)'],
            row['Stock Day 0'],
        )
        expected_values.append(expected_value)

        # An empty CSV cell is read as NaN (a float); treat it, like an empty
        # string, as "no answer" and count it as wrong instead of crashing.
        actual = row['actual_output']
        actual_letter = actual[0] if isinstance(actual, str) and actual else None
        is_correct.append(expected_value == actual_letter)

    # Collect expected value, correctness flag and actual output for review
    new_df = pd.DataFrame()
    new_df['expected_output'] = expected_values
    new_df['is_correct'] = is_correct
    new_df['actual_output'] = df['actual_output']

    total = len(is_correct)
    correct = sum(is_correct)
    # Guard against an empty file, which would otherwise divide by zero
    accuracy = (correct / total) * 100 if total else 0.0  # Convert to percentage

    # Print accuracy
    print(f"Accuracy: {accuracy:.2f}%")

    # Save the DataFrame with expected and actual outputs for review
    new_df.to_csv("output_with_expected_actual.csv", index=False)

    return accuracy

# Usage example: evaluate the predictions stored in 'output_for_metric.csv'
file_path = 'output_for_metric.csv'  # Path to the file with actual_output and other columns
accuracy = calculate_accuracy(file_path)
