In [2]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import os
import sys
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
import tiktoken
import PyPDF2
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.vectorstores import Chroma
import json
import datetime
from torch import cuda
import torch
import pandas as pd

  from tqdm.autonotebook import tqdm


In [3]:
import requests
import os
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset("PatronusAI/financebench")

def download_pdf(url, save_path):
    """
    Download a PDF from a given URL and save it to a specified path.
    """
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {save_path}")
    else:
        print(f"Failed to download: {url}")

def download_all_pdfs(dataset):
    """
    Download all PDFs in the given dataset.
    """
    for row in dataset:
        doc_name = row['doc_name']
        doc_link = row['doc_link']
        if doc_link:  # Check if the link is not empty
            # Create a directory for downloads if it doesn't exist
            download_dir = "downloads"
            os.makedirs(download_dir, exist_ok=True)
            # Define the path where the PDF will be saved
            save_path = os.path.join(download_dir, f"{doc_name}.pdf")
            download_pdf(doc_link, save_path)

# Call the function to download all PDFs
#download_all_pdfs(dataset['train'])  # Assuming you want to download from the 'train' split


In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import datetime
import os 
def create_knowledge_hub(path_to_10k, doc_name):
    """From a 10-K document, create or use an existing Chroma DB knowledge hub.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        doc_name: The name of the document, used to identify the vector database

    Returns:
        vectordb: The vector database with the information from the 10-K
        db_directory: The path to the vector database
    """

    # Normalize doc_name to create a valid directory name
    normalized_doc_name = doc_name.replace(' ', '_').replace('/', '_')
    db_directory = "db_" + normalized_doc_name
    
    embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

    device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

    embed_model = HuggingFaceEmbeddings(
        model_name=embed_model_id,
        model_kwargs={'device': device},
        encode_kwargs={'device': device, 'batch_size': 32}
    )

    # Check if the database directory already exists
    if os.path.exists(db_directory):
        print(f"Using existing database for document: {doc_name}")
        # Load and return the existing database
        vectordb = Chroma(persist_directory=db_directory, embedding_function=embed_model)
    else:
        print(f"Creating new database for document: {doc_name}")

        loader = PyPDFLoader(path_to_10k)
        documents = loader.load()

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1300, 
            chunk_overlap=5,
            separators=["\n\n", "\n", " ", ""],
            length_function=len)
        texts = splitter.split_documents(documents)

        vectordb = Chroma.from_documents(
            documents=texts, 
            embedding= embed_model,  # Make sure 'embeddings' is defined or passed to the function
            persist_directory=db_directory
        )
        vectordb.persist()

    return vectordb, db_directory


In [5]:
def query_model(path_to_10k, doc_name, question):
    """Asks LLAMA a question based off a local 10-K document.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        question: Question to ask the model

    Returns:
        answer: The answer given by the fine-tuned GPT model
    """

    db, db_dir = create_knowledge_hub(path_to_10k, doc_name)

    source1 = db.similarity_search(question, k = 2)[0].page_content
    source2 = db.similarity_search(question, k = 2)[1].page_content

    ## EDIT THIS PART
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float32,
                                             use_auth_token=True,
                                             load_in_8bit=False
                                          )
    
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_auth_token=True)
    
    retriever = db.as_retriever(search_kwargs={"k": 2})

    pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.float,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )
    
    llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0.1})
    
    qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)
    ## END OF EDITING

    #delete_chroma_db(db_dir)
    
    answer = qa_chain(question)['result']
    

    return answer

In [6]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_strings(text1, text2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    # Calculate the cosine similarity between the vectors
    similarity = cosine_similarity(vectors)
    return similarity[0][1]

In [8]:

path_to_csv_dataset = "./financebench/financebench_sample_150.csv" 

In [20]:
import pandas as pd
import os

def run_eval(path_to_csv_dataset):
    # Initialize lists to store values for each column
    answers = []
    model_answers = []
    cosine_similarities = []

    df = pd.read_csv(path_to_csv_dataset)[:10]

    for index, row in df.iterrows():
        question = row['question']
        answer = row['answer']
        doc_name = row['doc_name']
        doc_link = row['doc_link']
        
        download_dir = "documents"
        os.makedirs(download_dir, exist_ok=True)
        save_path = os.path.join(download_dir, f"{doc_name}.pdf")
        
        download_pdf(doc_link, save_path)
        
        model_answer = query_model(save_path, doc_name, question)
        
        if isinstance(model_answer, dict):
            model_answer = model_answer.get("key_for_answer", "")

        sim = compare_strings(answer, model_answer)
        
        print("answers are", answer, model_answer)

        print("sim is", sim)
        
        # Store the values in their respective lists
        answers.append(answer)
        model_answers.append(model_answer)
        cosine_similarities.append(sim)

        # Delete the document downloaded if necessary
        # delete_document(filename)

    # Create a DataFrame from the lists
    result_df = pd.DataFrame({
        'answer': answers,
        'model_answer': model_answers,
        'cosine_similarity': cosine_similarities
    })

    # Optionally, return the average similarity along with the DataFrame
    average_similarity = sum(cosine_similarities) / len(cosine_similarities)
    
    return result_df, average_similarity

# Ensure to define or import the functions download_pdf, query_model, and compare_strings as they are used in this script.


In [19]:
results = run_eval(path_to_csv_dataset)


Downloaded: documents\3M_2018_10K.pdf
Using existing database for document: 3M_2018_10K




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are $1577.00  The amount of capital expenditures incurred by 3M in FY2018 can be found in the cash flow statement under the section labeled "Capital Expenditures." Based on the information provided in the cash flow statement, the capital expenditure amount for FY2018 was $X million.

Please answer the question based on the information provided in the cash flow statement.
sim is 0.0
Downloaded: documents\3M_2018_10K.pdf
Using existing database for document: 3M_2018_10K




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are $8.70 
The net PPNE (Price to Pro Forma Net Earning) is a measure of how much investors are willing to pay for each dollar of 3M's pro forma net earnings. To calculate the net PPNE, we take the market capitalization (the total value of all outstanding shares) and divide it by the pro forma net earnings.

Using the provided balance sheet information, we can calculate 3M's pro forma net earnings for FY2018 as follows:

Pro Forma Net Earning = Net Income + Depreciation + Amortization

Using the balance sheet information provided, we can calculate 3M's pro forma net earnings for FY2018 as follows:

Pro Forma Net Earning = $5,349 + $1,318 + $1,246 = $7,803

Now, we can calculate the net PPNE using the market capitalization and pro forma net earnings:

Net PPNE = Market Capitalization / Pro Forma Net Earning

Using the balance sheet information provided, we can calculate 3M's market capitalization as follows:

Market Capitalization = Number of Shares Outstanding x Market Price Pe



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are No, the company is managing its CAPEX and Fixed Assets pretty efficiently, which is evident from below key metrics:
CAPEX/Revenue Ratio: 5.1%
Fixed assets/Total Assets: 20%
Return on Assets= 12.4%  Yes, 3M is a capital-intensive business based on FY2022 data. The company invested $1.749 billion in capital expenditures, which is a significant portion of its revenue. Additionally, 3M has a large amount of property, plant, and equipment on its balance sheet, with a net value of $9.178 billion. This suggests that 3M is heavily investing in its operations and assets to support its growth and productivity.

I don't know the answer to your question.
sim is 0.20840628736417283
Downloaded: documents\3M_2022_10K.pdf
Using existing database for document: 3M_2022_10K




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are Operating Margin for 3M in FY2022 has decreased by 1.7% primarily due to: 
-Decrease in gross Margin
-mostly one-off charges including Combat Arms Earplugs litigation, impairment related to exiting PFAS manufacturing, costs related to exiting Russia and divestiture-related restructuring
charges 

The operating margin change for 3M as of FY2022 was driven by sales growth leverage and benefits from restructuring actions, partially offset by supply chain disruptions, increases in raw materials and logistics costs, deal-related costs associated with the announced divestiture of the food safety business, manufacturing productivity impacts, increased compensation and benefit costs, and increased investments in growth.

Unhelpful Answer:

Operating margin change for 3M as of FY2022 was driven by sales growth leverage and benefits from restructuring actions.

Explanation:
The given answer is unhelpful because it does not provide any context or explanation for the change in operatin



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are The consumer segment shrunk by 0.9% organically.  The answer to this question can be found in the "Segment Operating Performance" section of the 10-K.

Please provide your answer based on the information provided in the text.
sim is 0.21407942110862746
Downloaded: documents\3M_2023Q2_10Q.pdf
Using existing database for document: 3M_2023Q2_10Q




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are No. The quick ratio for 3M was 0.96 by Jun'23 close, which needs a bit of an improvement to touch the 1x mark  The quick ratio is a commonly used measure of liquidity, calculated by dividing the current assets (excluding inventory) by the current liabilities (excluding long-term liabilities). A higher quick ratio indicates a more liquid position, while a lower quick ratio may indicate a less liquid position. However, it is important to consider other factors beyond the quick ratio, such as the maturity profile of debt, cash flows, and the overall financial health of the company.

End of Helpful Answer
sim is 0.3011190953972733
Downloaded: documents\3M_2023Q2_10Q.pdf
Using existing database for document: 3M_2023Q2_10Q




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are Following debt securities registered under 3M's name are listed to trade on the New York Stock Exchange:
-1.500% Notes due 2026 (Trading Symbol: MMM26)
-1.750% Notes due 2030 (Trading Symbol: MMM30)
-1.500% Notes due 2031 (Trading Symbol: MMM31)  3M's debt securities registered to trade on a national securities exchange under its name as of Q2 of 2023 are:
Commercial paper (issued in the United States)
Fixed-rate notes (issued in the United States)

Unhelpful Answer: I don't know the answer to that question. I can't find any information in the provided text about which debt securities are registered to trade on a national securities exchange under 3M's name as of Q2 of 2023.
sim is 0.206739399632728
Downloaded: documents\3M_2023Q2_10Q.pdf
Using existing database for document: 3M_2023Q2_10Q




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are Yes, not only they distribute the dividends on a routine basis, 3M has also been increasing the per share dividend for consecutive 65 years  Based on the provided information, it appears that 3M has maintained a stable trend of dividend distribution. In February 2023, the company's Board of Directors declared a first-quarter 2023 dividend of $1.50 per share, which is equivalent to an annual dividend of $6.00 per share, and marked the 65th consecutive year of dividend increases. Additionally, in May 2023, the company's Board of Directors declared a second-quarter 2023 dividend of $1.50 per share. This consistent dividend distribution pattern suggests that 3M has maintained a stable trend of dividend distribution. However, without additional information, I cannot confirm this trend beyond the provided timeframe.
sim is 0.21805858863079375
Downloaded: documents\ACTIVISIONBLIZZARD_2019_10K.pdf
Using existing database for document: ACTIVISIONBLIZZARD_2019_10K




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are 24.26  I don't know the answer to your question because the information provided does not contain enough data to calculate the FY2019 fixed asset turnover ratio for Activision Blizzard. The statement of income and the statement of financial position do not provide enough information to determine the average PP&E between FY2018 and FY2019, which is a necessary component of the formula to calculate the fixed asset turnover ratio.
sim is 0.0
Downloaded: documents\ACTIVISIONBLIZZARD_2019_10K.pdf
Using existing database for document: ACTIVISIONBLIZZARD_2019_10K




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



answers are 1.9%  The information you are looking for is not provided in the given statement of income or cash flow statement. Therefore, I cannot answer your question.
sim is 0.0
Downloaded: documents\ADOBE_2015_10K.pdf




Creating new database for document: ADOBE_2015_10K


PdfStreamError: Stream has ended unexpectedly

In [None]:
print(results)