In [6]:
import json, os, time
from datetime import datetime

import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI, RateLimitError
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from typing_extensions import Concatenate


In [2]:
# Read API credentials from the environment instead of embedding them in the
# notebook, so keys never end up in version control or in shared copies.
# A missing variable raises KeyError here, before any request is attempted.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

In [None]:
# get transcript json files
def load_json_files(directory, tickers):
    """Load the transcript JSON files in `directory` that belong to `tickers`.

    A file is selected when its name ends with "<ticker>_transcript.json" for
    at least one of the given tickers.

    Args:
        directory: Path of the folder to scan (not recursive).
        tickers: Iterable of ticker symbols, e.g. ['AAPL'].

    Returns:
        A list with the parsed JSON content of each selected file, ordered by
        file name so repeated runs index the same file at the same position.

    Raises:
        FileNotFoundError: If `directory` does not exist.
        json.JSONDecodeError: If a selected file is not valid JSON.
    """
    # str.endswith accepts a tuple, so each file is tested once against all
    # tickers and is loaded at most once even when suffixes overlap
    # (e.g. "AA_transcript.json" also ends with "A_transcript.json").
    suffixes = tuple(f"{ticker}_transcript.json" for ticker in tickers)
    transcripts = []
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(suffixes):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                transcripts.append(json.load(file))
    return transcripts
# Load every file in the relative 'transcripts' folder whose name ends with
# "AAPL_transcript.json"; the result is a list with one parsed JSON per file.
aapl_transcripts = load_json_files('transcripts', ['AAPL'])

In [None]:
# get encoding model for preferred model
# `tokenizer_name` holds the result of the 'gpt-4o' lookup; only its `.name`
# attribute is used, to fetch the encoding that `tiktoken_len` counts with.
# NOTE(review): if encoding_for_model already returns an Encoding object, the
# second lookup is redundant — confirm against the tiktoken docs.
tokenizer_name = tiktoken.encoding_for_model('gpt-4o')
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

# length function handed to the text splitter
def tiktoken_len(text):
    """Return how many tokens the notebook-level `tokenizer` produces for `text`.

    `disallowed_special=()` is forwarded to `encode` unchanged.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter used to cut each transcript into chunks. Because length_function
# is tiktoken_len, chunk_size and chunk_overlap are measured in tokens of the
# tokenizer configured above, not in characters.
# NOTE(review): the separators are presumably tried in the listed order
# (paragraph, line, word, character) — confirm against the langchain docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
# create individual chunks from full text
chunks = []

# Only the first loaded transcript file is chunked. Each `page` must provide
# the 'content', 'date', 'symbol', 'year' and 'quarter' keys read below.
for page in tqdm(aapl_transcripts[0]):
    if len(page['content']) < 200:
        # if page content is short (under 200 characters) we can skip
        continue
    texts = text_splitter.split_text(page['content'])
    # Store the date as integer epoch seconds so it can be range-filtered as
    # numeric metadata. The parsed datetime is naive, so .timestamp()
    # interprets it in the local timezone of the machine running the notebook.
    date_obj = datetime.strptime(page['date'], '%Y-%m-%d %H:%M:%S')
    timestamp = int(date_obj.timestamp())
    # The id combines symbol, fiscal period and chunk position, e.g.
    # "AAPL_2024Q1_6", so re-running the cell regenerates the same ids.
    chunks.extend({
        'id': f"{page['symbol']}_{page['year']}Q{page['quarter']}_{i}",
        'text': text,
        'symbol': page['symbol'],
        'date': timestamp,
        'chunk': i
    } for i, text in enumerate(texts))
    

In [None]:
# turn text into embedding vector
# Name of the OpenAI embedding model used for this sample call and for the
# batch-embedding loop further down.
embed_model = "text-embedding-3-small"

# Embed two throwaway sentences. The index-creation cell reads the length of
# res.data[0].embedding to set the index dimension, so this cell must run
# before it and `res` must not be overwritten in between.
res = client.embeddings.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], model=embed_model
)

In [None]:
# configure pinecone serverless index
# Cloud and region come from the environment when set to a non-empty value,
# otherwise they fall back to 'aws' / 'us-east-1'.
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

index_name = 'earnings-transcripts'

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    # The dimension is taken from the sample embedding held in `res`.
    # NOTE(review): later cells rebind `res` (including to a Pinecone query
    # result), so re-running this cell out of order can fail on `res.data`.
    pc.create_index(
        index_name,
        dimension=len(res.data[0].embedding),
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    # NOTE(review): this polls once per second with no upper bound; it never
    # returns if the index never reports ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to newly created index
# (also reached when the index already existed and creation was skipped)
index = pc.Index(index_name)


In [None]:
batch_size = 100  # how many embeddings we create and insert at once
max_embed_retries = 5  # extra attempts allowed after a rate-limited call
retry_wait_seconds = 5  # first pause; doubled after every failed attempt


def _embed_with_retry(texts, model):
    """Embed `texts` with `model`, retrying only when OpenAI rate-limits us.

    Waits retry_wait_seconds, then twice as long after each further failure,
    for at most max_embed_retries retries. Any other error (bad credentials,
    invalid input, Ctrl-C) propagates immediately instead of being swallowed
    and retried forever.

    Returns:
        The response object from client.embeddings.create.

    Raises:
        RateLimitError: If the call is still rate-limited after all retries.
    """
    for attempt in range(max_embed_retries + 1):
        try:
            return client.embeddings.create(input=texts, model=model)
        except RateLimitError:
            if attempt == max_embed_retries:
                raise
            time.sleep(retry_wait_seconds * 2 ** attempt)


for i in tqdm(range(0, len(chunks), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    meta_batch = chunks[i:i + batch_size]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings (rate-limit errors are retried with backoff)
    res = _embed_with_retry(texts, embed_model)
    embeds = [record.embedding for record in res.data]
    # cleanup metadata: keep only the fields that should be stored in Pinecone
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'symbol': x['symbol'],
        'date': x['date'],
        'id': x['id']
    } for x in meta_batch]
    # each vector is an (id, embedding, metadata) tuple
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)
# data inserted
# data inserted

In [7]:
# sample query of index
query = 'iPhone sales'

# embed the query text and keep only its vector
xq = client.embeddings.create(
    model='text-embedding-3-small',
    input=[query],
).data[0].embedding

# only match chunks whose stored date is later than 1 Jan 2024
after_timestamp = datetime(2024, 1, 1).timestamp()

# retrieve the five closest chunks from Pinecone together with their metadata
res = index.query(
    vector=xq,
    filter={'date': {'$gt': after_timestamp}},
    include_metadata=True,
    top_k=5,
)
res['matches']

[{'id': 'AAPL_2024Q1_6',
  'metadata': {'chunk': 6.0,
               'date': 1706847422.0,
               'id': 'AAPL_2024Q1_6',
               'symbol': 'AAPL',
               'text': 'billion. Let me now provide more detail for each of our '
                       'revenue categories. iPhone revenue was $69.7 billion, '
                       'up 6% year-over-year. We set all-time records in '
                       'several countries and regions, including Latin America, '
                       'Western Europe, the Middle East, and Korea, as well as '
                       'December quarter records in India and Indonesia. Our '
                       'iPhone active installed base grew to a new all-time '
                       'high, and we had an all-time record number of iPhone '
                       'upgraders during the quarter. Customers are loving '
                       'their new iPhone 15 family, with the latest reports '
                       'from 451 Research indic

In [8]:
# query data as a single function with sorted return
def earnings_context(symbol: str, query: str, start_date: str, end_date: str = None, top_k: int = 5):
    """Retrieve the transcript chunks most relevant to `query` for one ticker.

    The query is embedded with OpenAI and searched against the Pinecone index,
    restricted to chunks of `symbol` whose date lies in the requested window.

    Args:
        symbol: Stock ticker; upper-cased before filtering.
        query: Free-text search query.
        start_date: First day of the window, "YYYY-MM-DD" (inclusive).
        end_date: Last day of the window, "YYYY-MM-DD" (inclusive, whole day).
            Defaults to the current moment when omitted.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with 'date' ("YYYY-MM-DD"), 'chunk' and 'text',
        sorted by date and then by chunk position.

    Raises:
        ValueError: If a date string is not in "YYYY-MM-DD" format.
    """
    date_format = "%Y-%m-%d"

    # turn start/end dates to timestamps for the filter (naive datetimes are
    # interpreted in local time, matching how the chunk dates were indexed)
    start = datetime.strptime(start_date, date_format).timestamp()
    if end_date is None:
        end = datetime.today().timestamp()
    else:
        # Extend to the last instant of the end day; midnight at the start of
        # the day would drop a call that took place on end_date itself.
        end = datetime.strptime(end_date, date_format).replace(
            hour=23, minute=59, second=59, microsecond=999999
        ).timestamp()

    # embed query
    xq = client.embeddings.create(
        input=[query],
        model="text-embedding-3-small"
    ).data[0].embedding

    # vector search, inclusive on both ends of the date window
    res = index.query(
        vector=xq,
        top_k=top_k,
        filter={
            'symbol': {'$eq': symbol.upper()},
            'date': {'$gte': start, '$lte': end}
        },
        include_metadata=True
    )

    # prepare return list: keep only the fields the caller needs
    simplified_data = [
        {
            'date': item['metadata']['date'],
            'chunk': item['metadata']['chunk'],
            'text': item['metadata']['text'],
        }
        for item in res['matches']
    ]
    # sort on the numeric timestamp first, then format it for display
    sorted_data = sorted(simplified_data, key=lambda x: (x['date'], x['chunk']))
    for item in sorted_data:
        item['date'] = datetime.fromtimestamp(item['date']).strftime('%Y-%m-%d')

    return sorted_data
    

In [9]:
# test function
# No end_date is passed, so the search window runs from 2023-06-25 up to the
# moment the cell is executed.
earnings_context('AAPL', 'iPhone sales', '2023-06-25')

[{'date': '2023-08-03',
  'chunk': 1.0,
  'text': "Tim Cook: Thank you, Saori. Good afternoon, everyone, and thanks for joining us. Today, Apple is reporting revenue of $81.8 billion for the June quarter, better than our expectations. We continued to see strong results in emerging markets, driven by robust sales of iPhone with June quarter total revenue records in India, Indonesia, Mexico, the Philippines, Poland, Saudi Arabia, Turkey and the UAE. We set June quarter records in a number of other countries as well, including France, the Netherlands and Austria. And we set an all-time revenue record in Services driven by more than $1 billion paid subscriptions. We continued to face an uneven macroeconomic environment, including nearly 4 percentage points of foreign exchange headwinds. On a constant currency basis, we grew compared to the prior year's quarter in aggregate and in the majority of markets we track. We continue to manage deliberately and innovate relentlessly, and we are driv

In [10]:
# define openai assistant function
# JSON schema describing earnings_context to the assistant. Property names
# must match the Python parameter names exactly, because the arguments the
# model produces are passed to the function by name; the date descriptions
# state the format that earnings_context parses.
function_definition = '''
{
  "name": "earnings_context",
  "description": "Retrieves relevant text from a corporate earnings call transcript to aid in questions that need more specific information. This function takes a stock ticker symbol, a date range, and a query in order to retrieve relevant context from a corporate earnings call. The function retrieves 5 sections of transcript that are 500 tokens in length each.",
  "parameters": {
    "type": "object",
    "properties": {
      "symbol": {
        "type": "string",
        "description": "The stock ticker symbol for the company whos earnings transcript you want to retrieve context from."
      },
      "query": {
        "type": "string",
        "description": "The text query to search for sections of context that match. This uses token similarity in a vector search."
      },
      "start_date": {
        "type": "string",
        "description": "The minimum date of earnings calls you want context from. Format: YYYY-MM-DD."
      },
      "end_date": {
        "type": "string",
        "description": "The maximum date for an earnings call you want context from. Format: YYYY-MM-DD. A default value of today is used if none specified."
      }
    },
    "required": [
      "symbol",
      "query",
      "start_date"
    ]
  }
}'''
tool = json.loads(function_definition)
# add function to assistant definition
# `assistant_id` is left blank here and must be filled in with the id of an
# existing assistant before running. The update passes a tools list holding
# only this function.
# NOTE(review): this presumably replaces any tools already configured on the
# assistant — confirm against the OpenAI Assistants API docs.
assistant_id = ''
client.beta.assistants.update(
        assistant_id,
        tools=[{"type": "function", "function": tool}]
    )
