In [55]:
from langchain.document_loaders import UnstructuredPDFLoader, PyPDFDirectoryLoader, PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
import openai
import pandas as pd
import requests
import json
import datetime
import shutil
import os
import chromadb

In [60]:
# Read the OpenAI API key from the OPENAI_KEY environment variable (os.getenv returns
# None when the variable is unset) and hand it to the openai client.
OPENAI_KEY = os.getenv('OPENAI_KEY')
openai.api_key = OPENAI_KEY

In [61]:
# Confirm the key was loaded without echoing the secret into the saved cell output.
# NOTE: this cell previously displayed the raw key, so that key should be revoked and rotated.
OPENAI_KEY is not None

True

In [4]:
# Folder expected to hold the financial-statement PDFs; os.listdir returns every
# entry name in it (the listing is not filtered by extension).
fin_stmt_pdf_folder = "financial_statements"
pdfs = os.listdir(fin_stmt_pdf_folder)

In [5]:
# Build each file's path with os.path.join so the separator matches the host OS
# (the previous hard-coded '\\' only produced valid paths on Windows).
pdf_files_path = [os.path.join(fin_stmt_pdf_folder, pdf_file) for pdf_file in pdfs]

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
import json
import datetime
import shutil
import openai
import pandas as pd
import requests

In [62]:
# Shared OpenAI embedding client; create_knowledge_hub reads this module-level name.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)

# Required functions (refer to the documentation)








In [12]:
def text_to_openai_json(data,filename):
    """
    Write question/answer pairs as a JSON Lines (JSONL) file of chat-style conversations.

    Args:
        data (DataFrame or similar data structure): Rows are read with ``iterrows()``;
            each row must expose a 'question' and an 'answer' field.
        filename: Destination path. It is opened in "w" mode, so an existing file
            is overwritten.

    Each row becomes one line of the file: a JSON object whose "messages" list holds
    a fixed system prompt, the row's question as the user turn, and the row's answer
    as the assistant turn.
    """
    system_prompt = "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."

    # Assemble every conversation before opening the file, so a row that is missing
    # a field fails before the output file is created or truncated.
    conversations = [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": record['question']},
                {"role": 'assistant', "content": record['answer']},
            ]
        }
        for _, record in data.iterrows()
    ]

    # One JSON document per line, each terminated by a newline.
    with open(filename, "w") as jsonl_file:
        for conversation in conversations:
            json.dump(conversation, jsonl_file)
            jsonl_file.write("\n")

In [13]:
def create_knowledge_hub(path_to_10k):
    """Build a persisted Chroma vector store from a 10-K document (PDF files only).

    The PDF is loaded with PyPDFLoader, split into chunks, embedded with the
    module-level ``embeddings`` object, and stored under a fresh ``db/<timestamp>``
    directory.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer

    Returns:
        vectordb: The vector database with the information from the 10-K
        db_directory: The path to the vector database
    """

    # Microsecond resolution keeps the directory distinct when two hubs are built
    # within the same second; with a seconds-only stamp both calls would persist
    # into the same directory.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    db_directory = "db/" + timestamp

    loader = PyPDFLoader(path_to_10k)
    documents = loader.load()

    # Chunk sizes are measured in characters (length_function=len); separators are
    # tried from coarsest (blank line) to finest (single character).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=5,
        separators=["\n\n", "\n", " ", ""],
        length_function=len)

    texts = splitter.split_documents(documents)

    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=db_directory
    )
    vectordb.persist()

    return vectordb, db_directory

In [14]:
def delete_chroma_db(db_directory):
    """Remove a locally stored Chroma DB directory tree.

    Failures are reported with print() instead of being raised: a missing
    directory gets its own message, any other exception a generic one.

    Args:
        db_directory: The path to the vector database
    """
    try:
        shutil.rmtree(db_directory)
    except Exception as err:
        if isinstance(err, FileNotFoundError):
            print(f"Chroma database '{db_directory}' not found.")
        else:
            print(f"Error deleting Chroma database: {str(err)}")

In [15]:
def ask_gpt_nonfinetuned_model(path_to_10k, question):
    """Ask the original GPT model a question based off a local 10-K document.

    A temporary Chroma DB is built from the document, the chunks most similar to
    the question are retrieved and sent to the chat model together with the
    question, and the temporary DB is deleted again before returning.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        question: Question to ask the model

    Returns:
        answer: The answer given by the GPT model
    """

    db, db_dir = create_knowledge_hub(path_to_10k)

    try:
        # Retrieve up to five chunks closest to the question.
        similarity_search = db.similarity_search(question, k = 5)

        # Concatenate every returned chunk. The earlier code read index 3 twice, so
        # the fifth chunk was never sent, and it raised IndexError whenever fewer
        # than five chunks came back.
        context = "".join(doc.page_content for doc in similarity_search)

        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."},
                {"role": "user", "content": f"{context} Now, this is our question: {question}"}
            ]
        )

        answer = completion.choices[0].message.content
    finally:
        # Always remove the temporary DB, even when retrieval or the API call fails.
        delete_chroma_db(db_dir)

    return answer

In [16]:
# Load the training set; later cells read its 'question' and 'financial_pdf_path' columns.
train_df = pd.read_csv('train.csv')

In [21]:
# Peek at the first three PDF paths to see how they are formatted.
train_df['financial_pdf_path'].head(3)

0    financial_statements/0001558370-19-000470.pdf
1    financial_statements/0001558370-19-000470.pdf
2    financial_statements/0000066740-23-000014.pdf
Name: financial_pdf_path, dtype: object

In [64]:
# Ask the non-fine-tuned model every training question against its own filing,
# collecting the answers in row order.
syntheses = []
for path_for_fin_stmt, question in zip(train_df['financial_pdf_path'], train_df['question']):
    response = ask_gpt_nonfinetuned_model(path_for_fin_stmt, question)
    print('question{}'.format(question))
    print('response{}'.format(response))
    syntheses.append(response)



"syntheses = []\nfor _, row in train_df.iterrows():\n    path_for_fin_stmt = row['financial_pdf_path']  \n    question = row['question']\n    response = ask_gpt_nonfinetuned_model(path_for_fin_stmt, question)\n\n    print('question{}'.format(question))\n\n    print('response{}' .format(response))\n\n    syntheses.append(response)"

In [72]:
# Load syntheses.csv. NOTE(review): presumably the saved `syntheses` list, but no
# visible cell writes this file — confirm how it was produced.
syntheses_df = pd.read_csv('syntheses.csv')

In [91]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [92]:
def compare_strings(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh TfidfVectorizer is fitted on just these two strings, so the vocabulary
    and weights come only from this pair.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    # cosine_similarity returns the pairwise matrix; entry [0][1] is text1 vs text2.
    return cosine_similarity(tfidf_matrix)[0][1]

In [104]:
# Model responses paired with reference answers ('Response' and 'Answer' columns).
df = pd.read_csv('respones_and_answer.csv')

In [107]:
# Display the frame (the rendered output elides the middle rows).
df

Unnamed: 0,Response,Answer
0,"Based on the provided text, 3M's capital spend...",$1577.00
1,The document provided does not include a balan...,$8.70
2,"Based on the FY2022 data, 3M does appear to be...","No, the company is managing its CAPEX and Fixe..."
3,The operating income margin for 3M for the yea...,Operating Margin for 3M in FY2022 has decrease...
4,The text does not provide specific information...,The consumer segment shrunk by 0.9% organically.
...,...,...
145,The text states that Verizon's strategy requir...,Yes. Verizon's capital intensity ratio was app...
146,"No, Verizon's balance sheet indicates a decrea...",No. Verizon's debt decreased by $229 million.
147,The document does not provide specific informa...,42.69
148,The unadjusted operating income as a percentag...,0.2%


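# Cosine similarity will evaluate the pair below as 0, which is bad: the response writes the figure as "$1.577 billion" while the answer writes it as "$1577.00", so the two texts share no tokens.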

In [118]:
# Model response for the first row.
df['Response'].iloc[0]

"Based on the provided text, 3M's capital spending for 2018 was $1.577 billion."

In [119]:
# Reference answer for the same (first) row.
df['Answer'].iloc[0]

'$1577.00'

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_strings(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh TfidfVectorizer is fitted on just these two strings, so the vocabulary
    and weights come only from this pair.

    NOTE: this repeats the compare_strings defined in an earlier cell; once this
    cell runs, this definition is the one in effect.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    # cosine_similarity returns the pairwise matrix; entry [0][1] is text1 vs text2.
    return cosine_similarity(tfidf_matrix)[0][1]



In [145]:
def run_cosine_sim(path_to_csv_dataset):
    """Average the TF-IDF cosine similarity between each answer and model response.

    Args:
        path_to_csv_dataset: Path to a CSV file with 'Response' and 'Answer' columns.

    Returns:
        The mean of compare_strings(answer, response) over all rows.
    """
    frame = pd.read_csv(path_to_csv_dataset)

    # Keep a running sum and count instead of storing every score.
    total = 0
    count = 0
    for _, record in frame.iterrows():
        response_text, reference_text = record['Response'], record['Answer']
        total += compare_strings(reference_text, response_text)
        count += 1

    return total / count

In [146]:
# Report the mean cosine similarity (the label used to say "Euclidean text distance" by mistake).
print(f"The total average cosine similarity is: {run_cosine_sim('respones_and_answer.csv'):.4f}")


The total average cosine similarity is: 0.1928


In [139]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances

def euclidean_text_distance(text1, text2):
    """Return the Euclidean distance between the TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two strings, so the vocabulary and
    weights come only from this pair.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    # euclidean_distances returns the pairwise matrix; entry [0][1] is text1 vs text2.
    pairwise = euclidean_distances(tfidf_matrix)
    return pairwise[0][1]


In [141]:
def run_euclidean_distance(path_to_csv_dataset):
    """Average the TF-IDF Euclidean distance between each answer and model response.

    Args:
        path_to_csv_dataset: Path to a CSV file with 'Response' and 'Answer' columns.

    Returns:
        The mean of euclidean_text_distance(answer, response) over all rows.
    """
    frame = pd.read_csv(path_to_csv_dataset)

    # Pull (response, answer) out of each row first, then score the pair.
    row_pairs = (
        (record['Response'], record['Answer'])
        for _, record in frame.iterrows()
    )
    distances = [
        euclidean_text_distance(reference_text, response_text)
        for response_text, reference_text in row_pairs
    ]

    return sum(distances) / len(distances)

In [142]:
    # Report the mean TF-IDF Euclidean distance between each answer and model response.
    print(f"The total average Euclidean text distance is: {run_euclidean_distance('respones_and_answer.csv'):.4f}")


The total average Euclidean text distance is: 1.2295
