In [1]:
import tiktoken
import json
import pandas as pd

import os
from openai import AzureOpenAI
from dotenv import load_dotenv
from datetime import datetime

from azure.storage.blob import BlobServiceClient

In [3]:
# Load environment variables from the local .env file; later cells read their
# settings through os.getenv.
load_dotenv(dotenv_path='./.env')

True

In [5]:

def get_run_id():
    """Return the identifier of the current pipeline run.

    The value is read from the ``RUNID`` environment variable.

    Returns:
        str: The non-empty run identifier.

    Raises:
        RuntimeError: If ``RUNID`` is unset or empty. Failing here avoids
            silently building blob names that start with ``None--``.
    """
    run_id = os.getenv('RUNID')
    if not run_id:
        raise RuntimeError("Environment variable 'RUNID' is not set.")
    return run_id

# Resolve the run identifier and record the current time as a formatted string.
RUNID = get_run_id()
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Summaries are read from the input container; embeddings go to the output container.
input_container_name = 'relevant-articles-summaries'
output_container_name = 'relevant-articles-summaries-embeddings'

# Build the storage client from the connection string held in the environment.
blob_service_client = BlobServiceClient.from_connection_string(
    os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
)

# Fail fast if either container is missing.
input_container = blob_service_client.get_container_client(input_container_name)
assert input_container.exists(), f"Input container '{input_container_name}' does not exist."
output_container = blob_service_client.get_container_client(output_container_name)
assert output_container.exists(), f"Output container '{output_container_name}' does not exist."

# The summaries blob for this run must already be present.
input_blob = input_container.get_blob_client(f'{RUNID}--relevant_articles_summaries.json')
assert input_blob.exists(), f"Input blob '{RUNID}--relevant_articles_summaries.json' does not exist."

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_3 at 2025-06-03 15:07:31


In [6]:
# Azure OpenAI settings are taken from the environment.
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')

# Client used by the embedding cell further down.
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version="2024-12-01-preview",
)

In [7]:
def get_summaries_to_embed():
    """Download the run's summaries blob and return its parsed JSON content."""
    raw_payload = input_blob.download_blob().readall()
    return json.loads(raw_payload)


In [8]:
# Fetch and parse the summaries blob for this run.
summaries = get_summaries_to_embed()

In [9]:
# Report how many summary records were loaded.
n_summaries_to_embed = len(summaries)
print(f"Number of summaries to embed: {n_summaries_to_embed}")

Number of summaries to embed: 14


In [10]:
# Tokenizer for the embedding model; the chunking cell uses it to count tokens per summary.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

In [11]:
# Display the first record to inspect its fields.
summaries[0]

{'model': 'gpt-4o-2024-11-20',
 'summary': "Major record labels Universal Music Group, Warner Music Group, and Sony Music Entertainment are in negotiations to license their works to AI startups Udio and Suno, aiming to establish a compensation framework for artists impacted by generative AI technologies. The proposed deals include equity stakes for the record labels and could help resolve ongoing copyright infringement lawsuits filed last year. Udio and Suno's platforms create audio recordings based on user-generated prompts, such as describing specific genres or themes. These discussions highlight the growing tension between creators and AI companies, as labels seek more control over intellectual property while startups push for innovation-friendly terms. For professionals, this reflects a broader challenge in balancing copyright protection with technological advancement in the AI-driven creative economy.",
 'article_id': 'techcrunch_20250603125707805935',
 'run_id': 'RUNID_3',
 'task

In [12]:
# Split the summaries into chunks so that each embeddings request stays under a
# token budget. Every chunk is a list of dicts with the keys "summary",
# "article_id" and "token_count".
MAX_TOKENS_PER_CHUNK = 8000  # true limit is 8192, but we leave some space for safety
# NOTE(review): confirm whether the 8192 limit applies to the whole request or to
# each input text; a single summary above the budget still gets a chunk of its own.

chunked_summaries = [[]]
running_token_count = 0  # tokens accumulated in the chunk currently being filled

for summary_obj in summaries:
    article_id = summary_obj['article_id']
    summary = summary_obj['summary']
    token_count = len(encoding.encode(summary))

    # Open a new chunk when this summary would push the current one over budget.
    # The non-empty guard avoids leaving an empty chunk behind when one summary
    # is by itself larger than the budget.
    if chunked_summaries[-1] and running_token_count + token_count > MAX_TOKENS_PER_CHUNK:
        chunked_summaries.append([])
        running_token_count = 0

    chunked_summaries[-1].append({"summary": summary, "article_id": article_id, "token_count": token_count})
    # Count the summary against the chunk it was actually placed in, so the
    # running total always reflects that chunk's real size.
    running_token_count += token_count

In [None]:
# Request embeddings chunk by chunk and collect one record per article.
embedding_model = "text-embedding-3-small"
all_embeddings = []

for i, chunk in enumerate(chunked_summaries):
    # An empty chunk (e.g. when there are no summaries at all) has nothing to
    # embed; skip it instead of sending an empty input list to the service.
    if not chunk:
        continue

    print(f"embedding chunk {i+1} of {len(chunked_summaries)} with {len(chunk)} summaries")
    chunk_article_ids = [s["article_id"] for s in chunk]

    response = client.embeddings.create(
        input=[s["summary"] for s in chunk],
        model=embedding_model,
    )

    # Every input must come back with exactly one embedding; otherwise article
    # ids and vectors could be paired incorrectly or dropped without notice.
    assert len(response.data) == len(chunk), (
        f"Expected {len(chunk)} embeddings for chunk {i+1}, got {len(response.data)}."
    )

    for item in response.data:
        all_embeddings.append({
            # `index` is the position of the corresponding text in `input`,
            # so the pairing does not depend on the order of `response.data`.
            'article_id': chunk_article_ids[item.index],
            'summary_embedding': item.embedding,
            'embedding_model': embedding_model,
        })

embedding chunk 0 of 1 with 14 summaries


In [14]:
# Tabular view of the collected embedding records; show the first rows as a sanity check.
pd_embeddings = pd.DataFrame(all_embeddings) 
pd_embeddings.head()

Unnamed: 0,article_id,summary_embedding,embedding_model
0,techcrunch_20250603125707805935,"[-0.008962688967585564, -0.0076769813895225525...",text-embedding-3-small
1,techcrunch_20250603125707826487,"[-0.021791493520140648, -0.006861391477286816,...",text-embedding-3-small
2,techcrunch_20250603125707795688,"[-0.008868522010743618, 0.003896420355886221, ...",text-embedding-3-small
3,business_insider_20250603125707970366,"[0.0115751838311553, 0.0357532799243927, 0.020...",text-embedding-3-small
4,business_insider_20250603125707980679,"[0.011105263605713844, 0.015476743690669537, 0...",text-embedding-3-small


In [15]:
def save_embeddings():
    """Upload all collected embeddings to the output container as one JSON blob.

    Serialises the module-level ``all_embeddings`` list and writes it to the
    blob ``{RUNID}--relevant_articles_summaries_embeddings.json``, replacing
    any existing blob with that name.
    """
    output_blob_name = f"{RUNID}--relevant_articles_summaries_embeddings.json"
    output_blob_client = output_container.get_blob_client(output_blob_name)
    # overwrite=True replaces a blob left by an earlier execution with the same run id.
    output_blob_client.upload_blob(json.dumps(all_embeddings, indent=4), overwrite=True)
    print(f"Relevant articles summaries embeddings saved to blob storage as {output_blob_name}")

In [16]:
# Persist the embeddings for this run to blob storage.
save_embeddings()

Relevant articles content saved to blob storage as RUNID_3--relevant_articles_summaries_embeddings.json
