This notebook compares the capabilities of an OpenAI model and a Hugging Face model to see which one performs better with RAG.
It also provides visualizations of text embeddings as used in vector search for NLP.

We will use: 
1. text-embedding-ada-002: It has an embedding dimension of 1536. It is designed to capture the semantic meaning of text in a high-dimensional space.
2. gpt-3.5-turbo
3. sklearn.metrics.pairwise.cosine_similarity: A function from the scikit-learn library that allows us to compute the similarity between vectors.
4. sklearn.decomposition.PCA: Used to perform Principal Component Analysis (PCA), reducing the dimensionality of our embeddings to 2 and 3 dimensions for visualization purposes.
5. mplcursors: An interactive data cursor for matplotlib, which allows us to explore the plots more intuitively.

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import numpy as np
import matplotlib.pyplot as plt 
import mplcursors
from sklearn.decomposition import PCA
from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
%matplotlib ipympl



In [None]:
# Run python-dotenv's loader first so that values it provides are visible
# through the process environment when the key is looked up below.
load_dotenv()

# Shared API client; get_embedding() uses it for every embeddings request.
# The key is read from the OPENAPI_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAPI_KEY"))


Next, we define three helper functions: plot_2D, which generates 2D plots of our vectorized text data; get_embedding, which uses the embedding model to convert text into a numerical vector; and compute_cosine_similarity, which compares different text examples based on their vector representations using cosine similarity.

In [None]:
def plot_2D(x_values: List[float], y_values: List[float], labels: List[str]) -> None:
    """
    Plots a 2D scatter plot with interactive hover annotations.

    Hovering over a point shows the label stored at the same position in
    `labels`.

    Parameters:
    x_values (List[float]): A list of x-coordinates for each point.
    y_values (List[float]): A list of y-coordinates for each point.
    labels (List[str]): A list of labels for each point.

    Raises:
    ValueError: If x_values, y_values and labels do not all have the same length.
    """
    # Fail fast on mismatched inputs. Without this check, too few labels only
    # surface later as an IndexError inside the hover callback, and too many
    # labels silently go unused.
    n_points = len(x_values)
    if len(y_values) != n_points or len(labels) != n_points:
        raise ValueError(
            "x_values, y_values and labels must have the same length "
            f"(got {n_points}, {len(y_values)} and {len(labels)})"
        )

    # Create scatter plot
    _, ax = plt.subplots()
    scatter = ax.scatter(x_values,
                         y_values,
                         alpha=0.5,
                         edgecolors='k',
                         s=40)

    # Set aesthetics
    ax.set_title('Embedding visualization in 2D')  # Add a title
    ax.set_xlabel('X_1')  # Add x-axis label
    ax.set_ylabel('X_2')  # Add y-axis label

    # Create a mplcursors object to manage the data point interaction
    cursor = mplcursors.cursor(scatter, hover=True)

    # Define how each annotation should look
    @cursor.connect("add")
    def on_add(sel):
        # `sel.index` is the index of the picked point; it replaces the
        # deprecated `sel.target.index` spelling.
        sel.annotation.set_text(labels[sel.index])
        sel.annotation.get_bbox_patch().set(facecolor='white', alpha=0.5)  # Set annotation's background color
        sel.annotation.set_fontsize(12)

    plt.show()


In [None]:
def get_embedding(text: str) -> List[float]:
    """
    Requests an embedding vector for one piece of text.

    The request is sent through the module-level `client` using the
    "text-embedding-ada-002" model.

    Parameters:
    text (str): The text to embed.

    Returns:
    List[float]: The embedding from the first entry of the response data.
    """
    result = client.embeddings.create(input=text, model="text-embedding-ada-002")

    # A single input was submitted, so the vector is read from the first entry.
    first_entry = result.data[0]
    return first_entry.embedding

def compute_cosine_similarity(embeddings: np.ndarray, idx1: int, idx2: int) -> float:
    """
    Returns the cosine similarity of two rows picked from `embeddings`.

    Parameters:
    embeddings (np.ndarray): An array of embeddings.
    idx1 (int): The index of the first embedding.
    idx2 (int): The index of the second embedding.

    Returns:
    float: The cosine similarity between the two selected embeddings.
    """
    # Each vector is wrapped in a list so it is passed as a one-row batch.
    first_batch = [embeddings[idx1]]
    second_batch = [embeddings[idx2]]

    similarity_matrix = cosine_similarity(first_batch, second_batch)

    # One row against one row: the only score sits at position [0][0].
    return similarity_matrix[0][0]

In [None]:
# Example sentences, three per topic. Each literal is split across lines with
# implicit string concatenation purely for readability.

# --- Animals ---
animal_sen_1 = (
    "Elephants are known for their exceptional memory and intelligence, "
    "often living in complex social structures."
)
animal_sen_2 = (
    "The blue whale is the largest animal on the planet, "
    "with a heart the size of a small car."
)
animal_sen_3 = (
    "Peregrine falcons are among the fastest birds, "
    "reaching over 200 miles per hour during their hunting stoop."
)

# --- Sports ---
sport_sen_1 = (
    "Soccer, known as football outside of North America, "
    "is the world's most popular sport, "
    "with a fan base that spans across continents."
)
sport_sen_2 = (
    "The modern Olympic Games, revived in 1896, "
    "are a global event that brings together athletes from over 200 nations."
)
sport_sen_3 = (
    "Serena Williams has dominated women's tennis with 23 Grand Slam singles titles, "
    "making her one of the greatest athletes of all time."
)

# --- Modern society ---
modern_society_sen_1 = (
    "The advent of the internet has transformed modern society, "
    "enabling global connectivity and access to information at an unprecedented scale."
)
modern_society_sen_2 = (
    "Climate change has become one of the most pressing issues of modern society, "
    "with a growing consensus on the need for sustainable practices."
)
modern_society_sen_3 = (
    "Social media has reshaped the way we communicate and consume information, "
    "influencing everything from politics to personal relationships."
)

# --- Programming ---
programming_sen_1 = (
    "C++ programmers harness the power of low-level memory manipulation, "
    "crafting efficient and high-performance software for a variety of applications."
)
programming_sen_2 = (
    "Java developers thrive in building platform-independent solutions, "
    "leveraging the language's 'write once, run anywhere' philosophy "
    "for versatile and scalable applications."
)
programming_sen_3 = (
    "Python programmers embrace the language's readability and expressiveness, "
    "creating elegant and concise code that facilitates rapid development across diverse domains."
)

# All examples in one list, kept grouped by topic in the order defined above.
all_input_text = [
    animal_sen_1,
    animal_sen_2,
    animal_sen_3,
    sport_sen_1,
    sport_sen_2,
    sport_sen_3,
    modern_society_sen_1,
    modern_society_sen_2,
    modern_society_sen_3,
    programming_sen_1,
    programming_sen_2,
    programming_sen_3,
]

In [None]:
# Embed every example sentence, one get_embedding() call per sentence,
# keeping the results in the same order as all_input_text.
embeddings = [get_embedding(sentence) for sentence in all_input_text]

# Convert the collected embeddings into a NumPy array.
embeddings_array = np.array(embeddings)
print(f"Shape: {embeddings_array.shape}", "\n")
print("Sample array:", embeddings_array[0])
