# Preview

This notebook contains a challenge that uses the financebench data set (https://huggingface.co/datasets/PatronusAI/financebench).<br>
**The general goal** is to get the best score on the benchmark evaluation (code below) :)

This is a challenging and realistic Financial Analyst test set where a question (can be complex!) and a context (a financial document, can be quite long!) are provided, and your goal is to engineer an LLM flow, using Jamba, that solves as many of the questions in the test set as possible. 

# Set up

## 1. Installs + imports

In [29]:
# %pip install pandas
# %pip install ai21
# %pip install ai21_tokenizer
# %pip install OpenAI
# %pip install python-docx
# %pip install -U "huggingface_hub[cli]"
# %pip install huggingface_hub


# Import necessary modules
from ai21 import AI21Client  # For AI21 client
from ai21.models.chat.chat_message import SystemMessage, UserMessage, AssistantMessage  # For chat message models
from ai21 import tokenizers
from ai21_tokenizer import Jamba1_5Tokenizer
from concurrent.futures import ThreadPoolExecutor, as_completed  # For concurrent execution
import pandas as pd  # For data manipulation
import time  # For measuring latency
import os  # For file operations
from docx import Document  # For reading docx content

# For NVIDIA NIM using API key
from openai import OpenAI 


In [30]:
from huggingface_hub import login

# You must log in to Hugging Face in order to use the tokenizer in this notebook. Please follow these instructions:
# 1. Create a Hugging Face account (or log in to an existing one)
# 2. Request access to Jamba-1.5 models at https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini & https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large
# 3. Create a token with read access - https://huggingface.co/settings/tokens
# 4. Export the token as the HF_TOKEN environment variable (preferred: keeps the secret out of the notebook),
#    or replace the 'YOUR_HUGGINGFACE_TOKEN' placeholder below with your actual token

# The environment variable wins when set; otherwise the literal placeholder is used.
login(token=os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_TOKEN"))

### More info?
**AI21 SDK documentation:** https://github.com/AI21Labs/ai21-python?tab=readme-ov-file

## 2. Helper Functions and Variables

In [41]:
#Available models
MODEL_JAMBA_LARGE = "jamba-1.5-large"
MODEL_JAMBA_MINI = "jamba-1.5-mini"
# Default token budget used when truncating document text
MAX_INPUT_TOKENS = 150000
# Default tokenizer model path used when truncating document text
TOKENIZER_MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini"


#Get a response from AI21 models, measure latency (Clue: heavily impacted by prompt size)
def generate_response(messages,
                      client,
                      model:str = MODEL_JAMBA_MINI,
                      max_tokens:int=1000,
                      temperature=0.7,
                      max_retries:int = 5,
                      retry_delay:int = 1):
    """Request a chat completion, retrying on failure, and time the successful call.

    Args:
        messages: Chat messages passed to ``client.chat.completions.create``.
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the client.
        max_tokens: Completion token limit forwarded to the client.
        temperature: Sampling temperature forwarded to the client.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        A ``(message, latency)`` tuple: the first choice's message and the
        wall-clock duration in seconds of the successful request.

    Raises:
        RuntimeError: If every attempt failed. The last underlying error is
            attached as ``__cause__``.
    """
    last_error = None

    for attempt in range(max_retries):
        try:
            start_time = time.time()
            response = client.chat.completions.create(
                messages=messages,
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            latency = time.time() - start_time

            return response.choices[0].message, latency
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt} Failed , Error generating response: {e}")
            # No point in waiting once there are no attempts left.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)

    # Previously this fell through to an unbound `response` (UnboundLocalError);
    # fail with an explicit error that keeps the root cause attached.
    raise RuntimeError(
        f"Failed to generate a response after {max_retries} attempts"
    ) from last_error

#Generate messages - YOU CAN USE THIS FUNCTION
def generate_messages(system_msg:str, user_msg:str, context:str, question:str) -> list:
    """Build the chat prompt: one system message followed by one user message.

    Args:
        system_msg: Text used verbatim as the system message content.
        user_msg: Template for the user message. It is rendered with
            ``str.format(context=..., question=...)``, so ``{context}`` and
            ``{question}`` placeholders are substituted and any other literal
            braces in the template must be escaped as ``{{`` / ``}}``.
        context: Value substituted for ``{context}``.
        question: Value substituted for ``{question}``.

    Returns:
        A two-element list ``[SystemMessage, UserMessage]``.
    """
    # create default messages
    messages = [
        SystemMessage(content=system_msg, role="system"),# Only use this if you want to set the system message, MUST BE FIRST
        UserMessage(content=user_msg.format(context=context, question=question), role="user")
    ]
    return messages

# Tokenizers already built, keyed by model path, so each one is constructed only once.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_path: str):
    """Return the tokenizer for ``model_path``, constructing it on first use only."""
    if model_path not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_path] = Jamba1_5Tokenizer(model_path=model_path)
    return _TOKENIZER_CACHE[model_path]


def tokenize_and_truncate(text:str,
                          max_tokens:int = MAX_INPUT_TOKENS,
                          model_path:str = TOKENIZER_MODEL_PATH):
    """Cut ``text`` down to at most ``max_tokens`` tokens.

    Args:
        text: The text to truncate.
        max_tokens: Maximum number of tokens to keep.
        model_path: Model path handed to ``Jamba1_5Tokenizer``.

    Returns:
        ``text`` unchanged when it encodes to ``max_tokens`` tokens or fewer;
        otherwise the decoded form of its first ``max_tokens`` tokens.
    """
    # Reuse one tokenizer per model path instead of rebuilding it for every document.
    tokenizer = _get_tokenizer(model_path)
    encoded = tokenizer.encode(text)
    if max_tokens < len(encoded):
        return tokenizer.decode(encoded[:max_tokens])
    return text

def read_file(file_path):
    """Read a UTF-8 text file and return its truncated content.

    The text is passed through ``tokenize_and_truncate`` with that function's
    default arguments.

    Args:
        file_path: Path of the file to read.

    Returns:
        The (possibly truncated) file text, or ``None`` when a
        ``FileNotFoundError`` is raised while reading or truncating.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            raw_text = handle.read()
        # NOTE: the content is truncated because the answer is expected to
        #       reside within the leading portion of the document
        truncated_text = tokenize_and_truncate(raw_text)
    except FileNotFoundError:
        truncated_text = None
    return truncated_text

def append_md_file_contents(df: pd.DataFrame, folder_path: str) -> pd.DataFrame:
    """Fill an ``md_format`` column with the text of each row's markdown file.

    For every row, the file ``<folder_path>/<source_document>.md`` is loaded
    through ``read_file`` and its result stored in ``md_format``.

    Args:
        df: Frame with a ``source_document`` column. It is modified in place.
        folder_path: Directory that holds the ``.md`` files.

    Returns:
        The same ``df`` object, now carrying the ``md_format`` column.
    """
    # Start with an empty column; each row is filled in below
    df['md_format'] = None

    for row_label, record in df.iterrows():
        doc_name = record['source_document']
        markdown_path = os.path.join(folder_path, f"{doc_name}.md")
        df.at[row_label, 'md_format'] = read_file(markdown_path)

    return df

## 3. Load the data

In [32]:
#load the data
# Paths can be overridden with the FINANCEBENCH_CSV / FINANCEBENCH_DATA_DIR environment variables;
# the fallbacks are the original hard-coded local paths.
fin_bench_csv_path = os.environ.get(
    "FINANCEBENCH_CSV",
    "/Users/orishapira/Desktop/NVIDIA-AI21-dev-day-challenge/financebench_question_answer_doc_dataset.csv",
)
fin_bench_df = pd.read_csv(fin_bench_csv_path)

#Add financial docs content from files to the DataFrame - **NOTE:in Markdown format only**
folder_path = os.environ.get("FINANCEBENCH_DATA_DIR", '/Users/orishapira/Desktop/nvidia-dev-challenge/data')
fin_bench_df = append_md_file_contents(fin_bench_df, folder_path)

# Missing .md files are stored as None in md_format; surface that instead of silently prompting with no context.
missing_docs_count = int(fin_bench_df['md_format'].isna().sum())
if missing_docs_count:
    print(f"WARNING: {missing_docs_count} of {len(fin_bench_df)} rows have no document content (file not found under {folder_path})")

### 3.1. Data columns are
**1. question:** contains the question to be answered<br>
**2. gold_answer:** the true and correct answer to the question<br>
**3. source_document:** the document containing the answer<br>
**4. md_format:** the parsed relevant data from the .md version of the source_document<br>


In [33]:
# Take a look at the data: preview the first two rows of the loaded frame
fin_bench_df.head(2)


Unnamed: 0,question,gold_answer,source_document,md_format
0,What is the FY2018 capital expenditure amount ...,$1577.00,3M_2018_10K,low\n\n**UNITED STATES**\n\n**SECURITIES AND E...
1,Assume that you are a public equities analyst....,$8.70,3M_2018_10K,low\n\n**UNITED STATES**\n\n**SECURITIES AND E...


### 3.2. Create clients and naive prompts

**You can choose to work with one of 3 options,<br>**
Pass the relevant client to generate_response method according to your preference:<br><br>
**1 AI21 Studio** - If you are using AI21 studio, pass the ai21_client<br><br>
**2. Jamba NIM** -  pass the nvidia client. An API key can be obtained at https://build.nvidia.com/ai21labs/jamba-1_5-large-instruct?api_key=true by pressing "Get API key" in the top right corner of the Python snippet. An email address is required to register for the NVIDIA Developer Program.<br><br>
**3 Jamba NIM  via Langchain** - in Langchain ChatNVIDIA module. https://python.langchain.com/docs/integrations/chat/nvidia_ai_endpoints/



In [37]:

#create the clients and default messages
# API keys are read from the environment when available so secrets stay out of the notebook;
# otherwise replace the placeholder literals below with your keys.
AI21_API_KEY = os.environ.get("AI21_API_KEY", 'YOUR_AI21_API_KEY')
NVIDIA_NIM_API_KEY = os.environ.get("NVIDIA_API_KEY", "$<YOUR_NVIDIA_API_KEY>")

#NOTE: you need the ai21_client for the evaluation so don't delete it
ai21_client = AI21Client(api_key=AI21_API_KEY)
nvidia_client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=NVIDIA_NIM_API_KEY,
)

# Naive baseline prompts; the user template is rendered with {context} and {question}
default_system_msg = "You are a financial assistant"
default_user_msg   = """based on the following context: {context}\nanswer the following question: {question}"""

# The Challenge

Your task is to modify the code below to achieve the best results possible!<br>
Think outside the box! You can change the default prompts, use langchain, add custom steps to the flow, or whatever else you can think of.<br>
<br>You may:<br>
* Change the helper functions (except *generate_messages()* )
* Add helper functions
* Use the non .md files provided under data folder
* Use any external library you see fit.
* (pre/post) Process the data.
* Use chaining.
* Do what ever makes it work better... 


<br>You may NOT:<br>
* Modify *generate_messages()*

**IMPORTANT Suggestion**<br>
* Use a sub-set( a small sample of the data) to test yourself when making changes. As running and evaluating the full data set may take a while...<br>
(Approx. 10 min for getting the answers, Y min for evaluation)


In [44]:
# Replace/Add your code here
def generate_answers(df: pd.DataFrame, 
                     client = None,
                     model: str = MODEL_JAMBA_MINI,
                     max_tokens: int = 1000,
                     temperature: float = 0.7,
                     format: str = "md_format") -> pd.DataFrame:
    """Query the model once per row and return a copy of ``df`` with the answers.

    Args:
        df: Frame with a ``question`` column and the context column named by ``format``.
        client: Client passed to ``generate_response``. Required, despite the
            ``None`` default.
        model: Base model name. When ``client`` is an ``OpenAI`` client the name
            is rewritten to ``ai21labs/<model>-instruct``.
        max_tokens: Completion token limit forwarded to ``generate_response``.
        temperature: Sampling temperature forwarded to ``generate_response``.
        format: Name of the column holding the context text. The name shadows
            the builtin but is kept for interface compatibility.

    Returns:
        A copy of ``df`` with two added columns: ``model_answer`` and ``latency``.

    Raises:
        ValueError: If ``client`` is ``None``.
    """
    # Local import: tqdm.pandas() registers DataFrame.progress_apply. Doing it here
    # keeps this function working on a fresh kernel, before the evaluation cell runs.
    from tqdm import tqdm
    tqdm.pandas()

    if client is None:
        raise ValueError("client must be provided (e.g. ai21_client or nvidia_client)")

    if isinstance(client, OpenAI): # change model name if using nvidia NIM (OpenAI client)
        model = "ai21labs/{}-instruct".format(model)

    def process_row(row):
        messages = generate_messages(system_msg=default_system_msg,
                                     user_msg=default_user_msg,
                                     context=row[format],
                                     question=row['question'])
        answer, latency = generate_response(messages, client, model, max_tokens, temperature)
        return pd.Series({'answer': answer.content, 'latency': latency})

    result_df = df.copy()

    # Nothing to query: return the copy with empty result columns instead of
    # failing on the two-column assignment below.
    if result_df.empty:
        result_df['model_answer'] = pd.Series(dtype=object)
        result_df['latency'] = pd.Series(dtype=float)
        return result_df

    # Apply the process_row function to each row (with a progress bar)
    result = result_df.progress_apply(process_row, axis=1)

    # Assign the results to new columns
    result_df[['model_answer', 'latency']] = result

    return result_df

In [None]:
#NOTE: If you are using NVIDIA NIM, replace the client with nvidia_client
# Runs one model request per row of fin_bench_df
answers_df = generate_answers(df=fin_bench_df, client=ai21_client)

# Save the answers to model_answers.csv (without the index column)
answers_df.to_csv( "model_answers.csv", index=False, encoding='utf-8')

# Evaluate

Your output file should be a csv containing a "model_answer" column and a "gold_answer" column. The evaluation code below reads the file and runs a JudgeLM on the results to evaluate whether they are correct or not.

The score is the average of the JudgeLM scores across your model predictions.

***Baseline results for the provided configuration are around 55%***

In [46]:
import json

from tqdm import tqdm
import time
# Register DataFrame.progress_apply (used below to show a progress bar while evaluating)
tqdm.pandas()


# Prompt for the judge model; rendered with str.format(model_answer=..., gold_answer=...).
# The judge is asked to reply with a JSON dictionary {"label": 0 or 1}.
jlm_prompt_template = """You are tasked with evaluating the response of a question-answering model. 
You will be given a correct reference answer and the model's prediction, and will need to judge its correctness. The model's prediction may contain reasoning steps and explanations that go beyond the simple reference answer provided. You should judge the actual information content of the model's answer and check whether it aligns with the reference.
Your response must be a valid json dictionary with the key "label" and a value that's either 0 (incorrect) or 1 (correct).

The following are the reference answer and model prediction:

Model prediction:
{model_answer}

Reference answer:
{gold_answer}"""


def evaluate_answer(model_answer:str,
                    gold_answer:str,
                    client,
                    max_retries:int = 5,
                    retry_delay:int = 1):
    """Ask the judge model whether ``model_answer`` matches ``gold_answer``.

    Args:
        model_answer: The prediction to be judged.
        gold_answer: The reference answer.
        client: Object exposing ``chat.completions.create(...)``.
        max_retries: Maximum number of attempts at the judge request.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The value stored under ``"label"`` in the judge's JSON reply, or
        ``None`` when the request failed on every attempt, the reply is not
        valid JSON, or any other error occurred. This function never raises.
    """
    try:
        messages = [
            UserMessage(content=jlm_prompt_template.format(model_answer=model_answer, gold_answer=gold_answer), role="user")
        ]

        res = None
        for attempt in range(max_retries):
            try:
                res = client.chat.completions.create(
                    messages=messages,
                    model="jamba-1.5-large",
                    max_tokens=50,
                    temperature=0.01,
                )
                break
            except Exception as e:
                print(f"Error in JLM: {e}")
                # Only wait when another attempt will follow.
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)

        # All attempts failed: report it explicitly rather than tripping over an
        # unbound variable and printing a misleading error.
        if res is None:
            print(f"Error in JLM: no response after {max_retries} attempts")
            return None

        reply_text = res.choices[0].message.content

        try:
            return json.loads(reply_text)['label']
        except json.decoder.JSONDecodeError:
            # The judge did not answer with valid JSON; treat as "no score".
            return None

    except Exception as e:
        # Deliberate best-effort: one bad row must not abort the whole evaluation.
        print(f"Error in JLM: {e}")
        return None


def evaluate_results(df: pd.DataFrame, client: AI21Client, verbose=True):
    """Score every row with the judge model and report the mean score.

    Args:
        df: Frame that must contain ``model_answer`` and ``gold_answer``.
            A ``jlm_score`` column is added to it in place.
        client: Client forwarded to ``evaluate_answer``.
        verbose: When true, print progress messages and the final mean score.

    Returns:
        The same ``df`` object with the ``jlm_score`` column added.
    """
    assert "model_answer" in df, "model_answer field must be part of the results data"
    assert "gold_answer" in df, "gold_answer field must be part of the results data"

    def _judge_row(row):
        # Delegate a single row to the judge
        return evaluate_answer(row['model_answer'], row['gold_answer'], client)

    row_count = len(df)
    if verbose:
        print(f"Evaluating {row_count} answers...")

    df['jlm_score'] = df.progress_apply(_judge_row, axis=1)

    if verbose:
        mean_score = df['jlm_score'].mean()
        print(f"Evaluated {len(df)} answers, final score: {mean_score}")
    return df

In [None]:
# run your evaluation data:
# Load the saved answers file; it must contain "model_answer" and "gold_answer" columns
df = pd.read_csv("<YOUR_PATH_HERE>")
# Evaluate the frame that was just loaded (previously the loaded file was ignored)
results_df = evaluate_results(df, ai21_client, verbose=True)
# index=False avoids writing an extra unnamed index column to the output
results_df.to_csv("<YOUR_OUTPUT_PATH_HERE>", index=False)
