In [1]:
from openai import OpenAI
from elasticsearch import Elasticsearch
import os
from tqdm.auto import tqdm
import json

In [2]:
# OpenAI API client used by `llm` further down. No key is passed here, so the
# credentials have to come from the client's default configuration
# (presumably the OPENAI_API_KEY environment variable — confirm for your setup).
client = OpenAI()

In [3]:
# Elasticsearch client pointed at a local plain-HTTP endpoint on port 9200; no
# credentials are supplied. The index creation, indexing and search calls in
# this notebook all go through this object.
es_client = Elasticsearch('http://localhost:9200')

In [7]:
# Index definition: one primary shard and no replicas.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Analyzed full-text fields.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Exact-match field; `elastic_search` filters on it with a `term` query.
            "ticker": {"type": "keyword"}
        }
    }
}

index_name = "us-stock-q&a"

# Make the cell re-runnable: creating an index that already exists raises an
# error, and the indexing cell sends documents without ids, so re-running the
# notebook against a leftover index would also store every document twice.
# Dropping the stale index first gives each run a clean, identical starting point.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'us-stock-q&a'})

In [8]:

# Load the raw Q&A dump. JSON text is UTF-8, so the encoding is given explicitly
# rather than left to the platform default, which differs between systems and
# can fail or mis-decode non-ASCII characters.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-ticker structure into one list of records. Each top-level
# entry carries a 'ticker' and a list of 'documents'; the ticker is copied onto
# every document so a record is self-describing once it is indexed.
documents = []

for ticker_dict in docs_raw:
    for doc in ticker_dict['documents']:
        doc['ticker'] = ticker_dict['ticker']
        documents.append(doc)

In [9]:
# Spot-check one flattened record to confirm the `ticker` field was attached.
documents[1]

{'text': "The Company's fiscal year ends on the last Saturday of September.",
 'section': '2023_10K',
 'question': "When does the Company's fiscal year end?",
 'ticker': 'AAPL'}

In [10]:
# Push the flattened records into the index, one request per record and with
# no explicit document id. tqdm only draws the progress bar.
for record in tqdm(documents):
    es_client.index(document=record, index=index_name)

  0%|          | 0/6997 [00:00<?, ?it/s]

In [11]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for a question and its retrieved documents.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings, each providing the ``section``,
            ``question`` and ``text`` keys. Every entry is rendered as a
            three-line "section / question / answer" stanza followed by a
            blank line.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # The template is written out line by line so that the trailing space
    # after "CONTEXT:" is explicit and cannot be lost to editor trimming.
    template_lines = (
        "You're a investment advicer. Answer the QUESTION based on the CONTEXT from the FAQ database.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT: ",
        "{context}",
    )
    prompt_template = "\n".join(template_lines)

    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    # The final strip removes the blank line left after the last stanza.
    return prompt_template.format(question=query, context=context).strip()

In [12]:
def llm(prompt):
    """Send `prompt` to the chat-completions API and return the reply text.

    The prompt is submitted as a single user message to the ``gpt-4o`` model
    through the module-level ``client``. The content of the first returned
    choice is handed back to the caller.
    """
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(model='gpt-4o', messages=messages)

    first_choice = response.choices[0]
    return first_choice.message.content

In [13]:
def elastic_search(query, ticker):
    """Retrieve the stored documents that best match `query` for one ticker.

    Args:
        query: Free-text search string, matched against the ``question`` and
            ``text`` fields (``question`` carries a ``^3`` boost).
        ticker: Value the ``ticker`` field must equal; applied as a ``term``
            filter.

    Returns:
        A list with the ``_source`` of each hit, at most 5 entries, in the
        order the search response lists them.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text"],
            "type": "best_fields",
        }
    }
    ticker_filter = {"term": {"ticker": ticker}}

    search_query = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": ticker_filter}},
    }

    response = es_client.search(index=index_name, body=search_query)

    # Only the stored document is of interest; scores and metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [14]:
def rag(query, ticker):
    """Answer `query` about `ticker` via retrieve -> build prompt -> generate.

    Documents are fetched with `elastic_search`, turned into a prompt by
    `build_prompt`, and the text returned by `llm` for that prompt is the
    result.
    """
    retrieved = elastic_search(query, ticker)
    return llm(build_prompt(query, retrieved))

In [17]:
# End-to-end run of the pipeline: a broad question scoped to a single ticker.
# The last expression is left bare so the notebook displays the answer.
query = "How about the company's financial health?"
ticker = "ABNB"
rag(query, ticker)

"The company's financial health appears robust based on the provided information. Key indicators include a substantial 26% growth in Adjusted EBITDA in 2023, which suggests strong operational performance. Additionally, the company believes that pending legal matters will not materially affect its financial condition. Moreover, healthy free cash flow implies sufficient cash generation from operations to fund strategic initiatives and fortify the balance sheet. Overall, these factors point to a positive financial outlook for the company."