### Setup

In [1]:
import json
import os

import azure.identity
import dotenv
import numpy as np
import openai
import pandas as pd

# Configure the Azure OpenAI client from environment variables.
# Values from a local .env file (if any) are loaded into the environment first.
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.environ.get("AZURE_OPENAI_API_ENDPOINT")
AZURE_OPENAI_ADA_DEPLOYMENT = os.environ.get("AZURE_OPENAI_ADA_DEPLOYMENT")

# Authenticate with an Azure AD bearer token (no API key is used here).
azure_credential = azure.identity.DefaultAzureCredential()
token_provider = azure.identity.get_bearer_token_provider(
    azure_credential,
    "https://cognitiveservices.azure.com/.default",
)

# The environment value is interpolated as the host prefix of the endpoint URL.
openai_client = openai.AzureOpenAI(
    azure_ad_token_provider=token_provider,
    azure_endpoint="https://{}.openai.azure.com".format(AZURE_OPENAI_SERVICE),
    api_version="2024-02-01",
)

def get_embedding(text):
    """Return the embedding for a single input.

    Sends `text` to the embeddings endpoint of the module-level
    `openai_client`, using the deployment named by
    `AZURE_OPENAI_ADA_DEPLOYMENT`, and returns the `embedding` attribute
    of the first item in the response's `data`.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    first_item = response.data[0]
    return first_item.embedding
    
def get_embeddings(sentences):
    """Return one embedding per input, in the order of the response's `data`.

    Passes `sentences` to the embeddings endpoint of the module-level
    `openai_client` in a single call, using the deployment named by
    `AZURE_OPENAI_ADA_DEPLOYMENT`, and collects the `embedding` attribute
    of every item in the response.
    """
    response = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors


### Vector representations

In [9]:
# Embed a single word so the raw embedding vector can be inspected.
# Author's rule of thumb: the optimal input size to embed is ~512 tokens.
vector = get_embedding("dog") # author's note: 8192 tokens limit

# Show the vector as the cell output
vector

[-0.0033353185281157494,
 -0.017689190804958344,
 -0.01590404286980629,
 -0.01751338131725788,
 -0.018054334446787834,
 0.021841011941432953,
 -0.012313461862504482,
 -0.02273358590900898,
 -0.021286534145474434,
 -0.01814900152385235,
 0.012252604588866234,
 0.038759343326091766,
 0.0015408731997013092,
 -0.00691406661644578,
 -0.013638799078762531,
 0.024153590202331543,
 0.039895348250865936,
 0.0012036223197355866,
 0.009372025728225708,
 -0.012178223580121994,
 -0.019853007048368454,
 0.006024873349815607,
 0.011319459415972233,
 -0.025167878717184067,
 -0.00759363966062665,
 0.010284884832799435,
 0.009831836447119713,
 -0.008492975495755672,
 -0.005639444105327129,
 -0.009446406736969948,
 0.007444877177476883,
 -0.009277358651161194,
 -0.025289593264460564,
 -0.02119186706840992,
 -0.005906539969146252,
 -0.018906336277723312,
 -0.007539544254541397,
 -0.016066329553723335,
 -0.01171841286122799,
 -0.02093491330742836,
 0.004608250688761473,
 0.011042220517992973,
 0.0115493647

In [8]:
# Number of components in the embedding vector
len(vector)

1536

### Vector comparison

Pamela Fox created this very helpful vector comparison tool: https://pamelafox.github.io/vectors-comparison/

### Document similarity modeled as cosine similarity

In [13]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two equal-length vectors.

    Computed as dot(a, b) / (||a|| * ||b||). There is no guard for
    zero-length vectors: the division is performed as-is.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Compare one reference sentence against three candidates:
# random characters, a paraphrase, and an exact copy.
sentences1 = ['The new movie is awesome'] * 3

sentences2 = [
    'djkshsjdksdfsdfhfsjdfkhsd',
    'This recent movie is so good',
    'The new movie is awesome',
]

# Each list is embedded with a single get_embeddings call
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print every sentence pair next to its cosine similarity score
for left, right, left_vec, right_vec in zip(sentences1, sentences2, embeddings1, embeddings2):
    score = cosine_similarity(left_vec, right_vec)
    print(f"{left} \t\t {right} \t\t Score: {score:.4f}")

The new movie is awesome 		 djkshsjdksdfsdfhfsjdfkhsd 		 Score: 0.7506
The new movie is awesome 		 This recent movie is so good 		 Score: 0.9191
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000


### Vector search

In [14]:
# Load in vectors for movie titles.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON text is UTF-8 by specification).
with open('openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [19]:
# Compute vector for query
# Other queries to try: "The Rock", "Poltergeist"
query = "Air Bud"

# Use the shared helper instead of repeating the raw client call
vector = get_embedding(query)

# Compute cosine similarity between the query and each movie title.
# movie_vectors is iterated as a mapping of title -> embedding.
scores = [
    (title, cosine_similarity(vector, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Display the top 10 results, highest score first
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
266,Air Bud,1.0
132,The Mighty Ducks,0.847521
28,The Fox and the Hound,0.842809
293,A Bug's Life,0.839612
165,The Air Up There,0.83956
391,Teacher's Pet: The Movie,0.836256
221,Toy Story,0.836199
90,Fire Birds,0.833709
531,Disney Planes,0.832589
262,Con Air,0.829287
