In [None]:
!pip install openai --quiet

In [None]:
import openai

# Name of the OpenAI embedding model used to embed the search query.
# NOTE(review): presumably the same model that produced the precomputed
# embeddings loaded from the CSV — confirm, since distances between vectors
# from different models are not meaningful.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [None]:
import os
import getpass

# Ask for the key only when it is not already configured, so re-running this
# cell (or running the notebook with OPENAI_API_KEY already exported) neither
# prompts again nor overwrites the existing value. getpass reads the key
# without echoing it, which keeps it out of the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# Hand the key to the openai module so subsequent API calls are authenticated.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
!pip install matplotlib --quiet
!pip install scikit-learn --quiet
!pip install wget --quiet

In [None]:
import numpy as np
import pandas as pd
import wget
import ast

In [None]:
# Remote location of the CSV and the local filename it is saved under.
embeddings_path = "https://cdn.openai.com/API/examples/data/winter_olympics_2022.csv"
file_path = "winter_olympics_2022.csv"

# Download only when no local copy is present; otherwise reuse the file on disk.
if os.path.exists(file_path):
    print("File already exists in the local file system.")
else:
    wget.download(embeddings_path, file_path)
    print("File downloaded successfully.")

In [None]:
# Read the CSV from `file_path` (the location the download cell saved it to)
# rather than repeating the filename, so the two cannot drift apart.
df = pd.read_csv(file_path)

# The 'embedding' column is stored as text in the CSV; parse each cell back
# into a Python list and stack the rows into one NumPy array.
embedding_array = np.array(
    df['embedding'].apply(ast.literal_eval).to_list()
)

# The distance and t-SNE steps need a 2-D (rows x dimensions) matrix; fail
# here with a clear message if the rows did not parse to equal-length vectors.
assert embedding_array.ndim == 2, (
    f"expected a 2-D embedding matrix, got shape {embedding_array.shape}"
)

In [None]:
# Embed the search query with the same model constant used throughout.
#
# The original helper `openai.embeddings_utils.get_embedding` was removed in
# openai>=1.0, so the embeddings endpoint is called directly. Both major
# versions of the library are supported:
#   * openai >= 1.0 exposes the module-level `openai.embeddings` resource;
#   * openai <  1.0 exposes the `openai.Embedding` class instead.
# The `embeddings` attribute is checked first because 1.x still defines
# `openai.Embedding` as a stub that raises when used.
query = "curling gold medal"

if hasattr(openai, "embeddings"):
    embedding_response = openai.embeddings.create(
        input=[query],
        model=EMBEDDING_MODEL,
    )
    query_vector = embedding_response.data[0].embedding
else:
    embedding_response = openai.Embedding.create(
        input=[query],
        model=EMBEDDING_MODEL,
    )
    query_vector = embedding_response["data"][0]["embedding"]

# 1-D array holding the query's embedding vector.
query_embedding_response = np.array(query_vector)

In [None]:
from scipy.spatial.distance import cdist

# cdist compares two collections of vectors, so the single query vector is
# wrapped in a one-element list. No metric is passed, so cdist's default applies.
query_as_rows = [query_embedding_response]
query_distances = cdist(embedding_array, query_as_rows)

# One distance per stored embedding, kept alongside the original rows.
df['distance'] = query_distances

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the distances with a default-configured MinMaxScaler, fitting and
# transforming in one step; the fitted scaler is kept in `scaler`.
scaler = MinMaxScaler()
df['normalised'] = scaler.fit_transform(df[['distance']])

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE
import pandas as pd

# Project the embeddings to two dimensions with t-SNE.
# random_state is pinned so repeated runs produce the same layout.
tsne_model = TSNE(
    n_components = 2,
    perplexity = 15,
    random_state = 42,
    init = 'random',
    learning_rate = 200
)
tsne_embeddings = tsne_model.fit_transform(embedding_array)

# Build the frame that backs the plot.
# df['normalised'] is a min-max scaled *distance* (0 = closest to the query),
# so it is inverted here: the column labelled 'Similarity' must be high for the
# points nearest the query, matching the colour-bar label and the plot title.
visualisation_data = pd.DataFrame(
    {'x': tsne_embeddings[:, 0],
     'y': tsne_embeddings[:, 1],
     'Similarity': 1 - df['normalised']}
)

# Scatter plot of the 2-D projection, coloured by similarity to the query.
plot = px.scatter(
    visualisation_data,
    x = 'x',
    y = 'y',
    color = 'Similarity',
    color_continuous_scale = 'rainbow',
    opacity = 0.3,
    title = f"Similarity to '{query}' visualised using t-SNE"
)

# Square canvas so neither t-SNE axis is visually stretched.
plot.update_layout(
    width = 650,
    height = 650
)

# Show the plot
plot.show()