In [None]:
import os
import re
import pandas as pd
import openai
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser


1. Data Collection: Financial news, earnings calls, and reports are stored in the database as .txt files

In [None]:
#Data Coding

2. Text-splitting, Embedding Creation, Vector Database

In [None]:
# Placeholder credential. A key already exported in the environment is kept;
# the placeholder is only used when none is set. Never commit a real key here.
os.environ.setdefault('OPENAI_API_KEY', 'API KEY')
def process_in_batches(documents, embeddings, batch_size):
    """Embed ``documents`` slice by slice and merge the slices into one FAISS store.

    Args:
        documents: Sequence of documents accepted by ``FAISS.from_documents``.
        embeddings: Embedding model handed to ``FAISS.from_documents``.
        batch_size: Despite the name, this is the target *number of batches*:
            each slice holds ``len(documents) // batch_size`` documents (at
            least one), so a remainder produces extra, shorter batches.

    Returns:
        The merged FAISS vector store, or ``None`` when ``documents`` is empty.
    """
    # Floor division gives 0 when there are fewer documents than batch_size,
    # which would yield empty slices and never advance; use at least 1 per batch.
    step = max(1, len(documents) // batch_size)
    vector_store = None
    for start in range(0, len(documents), step):
        batch_vector_store = FAISS.from_documents(documents[start:start + step], embeddings)
        if vector_store is None:
            vector_store = batch_vector_store
        else:
            vector_store.merge_from(batch_vector_store)
    return vector_store


myfolder = 'Data Folder'
file_path = os.path.join(myfolder, 'faiss_index_20')
if os.path.exists(file_path):
  # A saved index is already present, so the embedding step is skipped.
  # No `continue` here: this branch is not inside a loop.
  print(f"The file already exists in the folder '{myfolder}'.")
else:
  # Create the embeddings (3-small/3-large)
  myembeddings = OpenAIEmbeddings(model='text-embedding-3-small')
  # Text splitter
  text_splitter_20 = RecursiveCharacterTextSplitter(
    chunk_size=16000, # Length of each chunk
    chunk_overlap=800 # Overlap between consecutive chunks, so content at chunk boundaries is not lost
  )
  # Load every .txt file in the folder with TextLoader, letting it detect the
  # encoding; silent_errors=True is passed so a failing file does not abort the load.
  text_loader_kwargs = {"autodetect_encoding": True}
  loader = DirectoryLoader(myfolder, glob='*.txt', loader_cls=TextLoader, loader_kwargs=text_loader_kwargs, silent_errors=True)
  docs = loader.load()
  documents_20 = text_splitter_20.split_documents(docs)
  # Create the vector store
  vector_20 = process_in_batches(documents_20, myembeddings,10)
  if vector_20 is None:
    # process_in_batches returns None when there was nothing to embed.
    print(f"No documents were loaded from '{myfolder}'; no index was saved.")
  else:
    vector_20.save_local(f"{myfolder}/faiss_index_20")



3. Prompt Design: CoT Prompts (Role Description, Scoring Rubrics, Examples)

In [None]:
factor = 'Prompt name'
file_path = 'Prompt File'

# Read the prompt text: UTF-8 is tried first, and ISO-8859-1 is used only when
# the bytes cannot be decoded as UTF-8.
for candidate_encoding in ('utf-8', 'ISO-8859-1'):
    try:
        with open(file_path, 'r', encoding=candidate_encoding) as file:
            content = file.read()
    except UnicodeDecodeError:
        if candidate_encoding == 'ISO-8859-1':
            raise
        continue
    break

print(content)

4. Naive RAG Model
(There are 5 factors in total. For each factor, the n paragraphs of context most relevant to the query are retrieved from the database.)

In [None]:
# Naive RAG scoring for one company/year: load the saved FAISS index, run one
# retrieval chain per factor (5 iterations), and combine the factor scores
# into a weighted total.
# NOTE(review): this cell is a redacted skeleton and is not runnable as written:
# the prompt messages and the invoke() values are blank, and nothing visible
# here fills score_list. company, year and company_score_list are not defined
# in this cell and must come from elsewhere.
llm = ChatOpenAI(temperature=0, model='gpt-4o-mini')
embeddings_small = OpenAIEmbeddings(model='text-embedding-3-small')
output_parser = StrOutputParser()

myfolder = f'data file for specific company at specific year'
file_path = os.path.join(myfolder, 'faiss_index_20')
if os.path.exists(file_path):
  # Load the vector store from the drive.
  # NOTE(review): allow_dangerous_deserialization=True opts in to loading a
  # pickled docstore; only load index folders you created yourself.
  myvector_20 = FAISS.load_local(f"{myfolder}/faiss_index_20", embeddings_small,allow_dangerous_deserialization=True)
  retriever = myvector_20.as_retriever()
  score_list = []
  # One pass per factor; the print below labels them Market, Product,
  # Operation, Strategy, Adaptability.
  for i in range(5):
    prompt = ChatPromptTemplate.from_messages([
      # Prompt design, confidential (messages removed from this copy)
    ])
    # Connect LLM and prompt
    document_chain = create_stuff_documents_chain(llm, prompt)
    # Turn the vector store into a retriever:
    # the input is used to retrieve documents, which are passed on to the document chain
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    # content is the prompt
    # NOTE(review): the two values below are blank placeholders (a syntax error
    # until filled in); the response is also never parsed into score_list.
    response = retrieval_chain.invoke({
        "input":  ,
        "year":
    })
  # Weighted sum of the five factor scores; the weights add up to 1.0.
  # Raises IndexError unless score_list holds five entries by this point.
  score = score_list[0] * 0.3 + score_list[1] * 0.2 + score_list[2] * 0.05 + score_list[3] * 0.15 + score_list[4] * 0.30
  company_score_list.append(score)
  print(company, year, 'score: ', score, 'Market: ', score_list[0], 'Product: ', score_list[1], 'Operation: ', score_list[2], 'Strategy: ', score_list[3], 'Adaptability: ', score_list[4])
else:
  # No saved index for this folder: record -1 as the "no data" marker.
  company_score_list.append(-1)
  print(company, year, 'No Data')


5. Advanced RAG model (re-rank, query rewriting, AI-agent, Finetune)

In [None]:
#re-rank
from FlagEmbedding import FlagReranker

# Re-rank the retrieved documents by their reranker score against the query.
# `content` (the query text), `retrieved_docs` and `embeddings_small` must
# already exist in the kernel when this cell runs.
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
# (document, score) pairs
scored_docs = []
# Calculate the reranking score for each document
for doc in retrieved_docs:
    # NOTE(review): str(doc) passes the Document's string form, not just
    # doc.page_content; confirm that is intended.
    score = reranker.compute_score([content, str(doc)], normalize=True)
    scored_docs.append((doc, score))
# Sort the documents by score in descending order
scored_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
# Take the top 4 documents based on the score
top_4_docs = [doc for doc, score in scored_docs[:4]]
# Create a new FAISS vector store holding only the top 4 documents
new_vector_store = FAISS.from_documents(top_4_docs, embeddings_small)
# Create a new retriever based on the top 4 documents
refined_retriever = new_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
#query rewriting
from langchain.chains import HypotheticalDocumentEmbedder, LLMChain
from scipy.spatial.distance import cosine

# Embed the query through HypotheticalDocumentEmbedder, then re-rank the
# retrieved documents by cosine similarity to that query embedding.
# `content`, `retriever` and `embeddings_small` must already exist in the kernel.
#Prompt Design
multi_llm = ChatOpenAI(n=4)

embeddings = HypotheticalDocumentEmbedder.from_llm(
    multi_llm, embeddings_small, "web_search"
)

# Use the embedding function to embed the query
query_embedding = embeddings.embed_query(content)

retrieved_docs = retriever.get_relevant_documents(content)

# Re-rank according to similarity scores

# (document, score) pairs
scored_docs = []
for doc in retrieved_docs:
    # embed_documents expects a list of texts; a bare string would be iterated
    # character by character, so wrap the page content in a list.
    doc_embedding = embeddings.embed_documents([doc.page_content])[0]
    # scipy's cosine() is a distance, so 1 - distance is the cosine similarity
    score = 1-cosine(query_embedding, doc_embedding)
    scored_docs.append((doc, score))
    print(score)
# Sort the documents by score in descending order
scored_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
# Take the top 4 documents based on the score
top_4_docs = [doc for doc, score in scored_docs[:4]]
# Create a new FAISS vector store holding only the top 4 documents
new_vector_store = FAISS.from_documents(top_4_docs, embeddings_small)
# Create a new retriever based on the top 4 documents
refined_retriever = new_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
#Finetune

#Set-up Training data and Validation data
import json
import numpy as np  # np is used below but is not imported anywhere else in the notebook

# Build chat-format fine-tuning examples: one per (prompt, row) pair, with the
# stored score perturbed by a small random offset.
# `scoredf` and `promptlist` must already exist in the kernel.
# NOTE(review): assumes column 0 of scoredf holds the company name and column
# promptidx+1 holds the score for promptlist[promptidx]; confirm.
np.random.seed(0)
yearlist = list(range(2014,2024))
mycolumns = scoredf.columns
training_data = []
for promptidx in range(len(promptlist)):
  # Only the first quarter of the rows is used here.
  for rowidx in range(len(scoredf)//4):
    # Negative scores are skipped.
    if scoredf.iloc[rowidx, promptidx+1] < 0:
      continue
    system_message = {"role": "system", "content": promptlist[promptidx]}
    user_message = {"role": "user", "content": f'Tell me the {mycolumns[promptidx+1]} score of {scoredf.iloc[rowidx, 0]} in year {yearlist[rowidx % 10]}.'}
    # randint's upper bound is exclusive, so the offset is in [-10, 9].
    randomscore = scoredf.iloc[rowidx, promptidx+1] + np.random.randint(-10,10)
    # Clamp the perturbed score to the 0-100 range.
    randomscore = min(max(randomscore, 0), 100)
    assistant_message = {"role": "assistant", "content": f'the score of {scoredf.iloc[rowidx, 0]} in year {yearlist[rowidx % 10]} for {mycolumns[promptidx+1]} is {randomscore}.'}
    messages = [system_message, user_message, assistant_message]
    training_data.append({"messages": messages})
def write_to_jsonl_file(data, file_path):
    """
    Serialise each item of ``data`` as one JSON document per line in ``file_path``.

    The file is opened in write mode, so any existing content is replaced.
    A confirmation message is printed once every entry has been written.
    """
    with open(file_path, 'w') as jsonl_file:
        # One JSON object per line, each terminated by a newline.
        jsonl_file.writelines(json.dumps(record) + '\n' for record in data)
    print(f"Data successfully written to {file_path}")
# Write the generated examples to a JSONL file.
# NOTE(review): this is a hard-coded absolute Google Drive path under /content/drive;
# it only resolves where that Drive is mounted.
file_path = "/content/drive/My Drive/24 Summer - RAGer/Code/RAG Pipeline/Pipeline Phase 2/Finetune Tests/automated_large_training_data_CEO.jsonl"
write_to_jsonl_file(training_data, file_path)

#Create Fine-tune model in OpenAI

In [None]:
#AI Agent
# These names are used below but are not imported anywhere else in the notebook.
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_community.tools.tavily_search import TavilySearchResults

# Placeholder credential. A key already exported in the environment is kept;
# the placeholder is only used when none is set. Never commit a real key here.
os.environ.setdefault("TAVILY_API_KEY", "API Key")
search = TavilySearchResults()
# NOTE(review): retriever_tool is only assigned in the "Combined Structure"
# cell further down, so this cell fails on a fresh kernel run top to bottom.
# llm, prompt, content, year and context must also already exist.
tools = [search, retriever_tool]
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
response = agent_executor.invoke({
        "input": content,
        "year": year,
        'context': context
})

6. Combined Structure

In [None]:
# Combined pipeline for one company/year: load the saved index, re-rank the
# retrieved documents against a HypotheticalDocumentEmbedder query embedding,
# run a tool-calling agent, then ask the model for a single final score.
# content, company, year, results and search must already exist in the kernel,
# as must HypotheticalDocumentEmbedder and cosine (imported in the query-rewriting cell).
# These names are used below but are not imported anywhere else in the notebook.
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools.retriever import create_retriever_tool

myfolder = f'data file for specific company at specific year'
llm = ChatOpenAI(temperature=0, model='fine-tune model')
embeddings_small = OpenAIEmbeddings(model='text-embedding-3-small')
#Load vector from the drive
if os.path.exists(f"{myfolder}/faiss_index_20/index.faiss"):
  # NOTE(review): allow_dangerous_deserialization=True opts in to loading a
  # pickled docstore; only load index folders you created yourself.
  myvector_20 = FAISS.load_local(f"{myfolder}/faiss_index_20", embeddings_small,allow_dangerous_deserialization=True)
  retriever = myvector_20.as_retriever()

  #Prompt Design
  multi_llm = ChatOpenAI(n=4)

  embeddings = HypotheticalDocumentEmbedder.from_llm(
      multi_llm, embeddings_small, "web_search"
  )

  # Use the embedding function to embed the query
  query_embedding = embeddings.embed_query(content)

  retrieved_docs = retriever.get_relevant_documents(content)

  # Re-rank according to similarity scores

  # (document, score) pairs
  scored_docs = []
  for doc in retrieved_docs:
    # embed_documents expects a list of texts; a bare string would be iterated
    # character by character, so wrap the page content in a list.
    doc_embedding = embeddings.embed_documents([doc.page_content])[0]
    # scipy's cosine() is a distance, so 1 - distance is the cosine similarity
    score = 1-cosine(query_embedding, doc_embedding)
    scored_docs.append((doc, score))
    print(score)
  # Sort the documents by score in descending order
  scored_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
  # Take the top 4 documents based on the score
  top_4_docs = [doc for doc, score in scored_docs[:4]]
  # Create a new FAISS vector store holding only the top 4 documents
  new_vector_store = FAISS.from_documents(top_4_docs, embeddings_small)
  # Create a new retriever based on the top 4 documents
  refined_retriever = new_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

  retriever_tool = create_retriever_tool(refined_retriever,'Finance_Data_File_Retriever', 'Retrieve similar files from data base according to user message.')
  tools = [search, retriever_tool]

  #Prompt Design
  score_list = []
  prompt = ChatPromptTemplate.from_messages([
      'Prompt with Context, query and year/company information'
  ])
  # NOTE(review): the result of partial() is discarded here, so this line has
  # no visible effect on `prompt`; confirm what was intended.
  prompt.partial(agent_scratchpad=lambda x: x)

  #Connect LLM and prompt
  # NOTE(review): document_chain is not used again in this cell.
  document_chain = create_stuff_documents_chain(llm, prompt)
  agent = create_tool_calling_agent(llm, tools, prompt)
  agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
  # Reuse the documents already retrieved for this same query instead of
  # issuing a second identical retrieval call.
  relevant_docs = retrieved_docs
  context = " ".join([doc.page_content for doc in relevant_docs])

  response = agent_executor.invoke({
      "input": content,
      "year": year,
      'context': context
  })

  # Second pass: feed the exchange back as chat history and ask for one number.
  output_parser = StrOutputParser()
  score_chat_history = [HumanMessage(content=content), AIMessage(content=response["output"])]
  # NOTE(review): f"{input}" is evaluated immediately and interpolates the
  # built-in `input` function, so no {input} template variable is created.
  # Confirm the intended follow-up text and pass it through invoke().
  score_prompt = ChatPromptTemplate.from_messages([
      MessagesPlaceholder(variable_name="chat_history"),
      ("user", f"{input}"),
      ("system", 'Only return one final score number, do not return anything else!')])
  score_chain = score_prompt | llm | output_parser
  response3 = score_chain.invoke({
      "chat_history": score_chat_history,
  })
  score_list.append(response3)

  print(company, year)
  print('Score: ', score_list[0])

  # Append the results to the list
  results.append({
      'Company': company,
      'Year': year,
      'If retrieve': 'Yes',
      'Score': score_list[0],
  })