In [None]:
import os
from facts import cat_facts, dog_facts, python_language_facts, python_snake_facts, lion_facts
from open_ai_connector import OpenAiConnector
from dotenv import load_dotenv
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_distances
import numpy as np
from visualizer import Visualizer

load_dotenv()

In [None]:
# Setup: build the connector used for every completion and embedding call below.
# The API key is read from the environment (populated by load_dotenv() in the first cell).
llm = OpenAiConnector(os.getenv("OPENAI_API_KEY"))

In [None]:
# Quick smoke test: request a single completion to confirm the connector works.
llm.text_completion("Say hello to ThaiPy!")

## Step 1 - Review and prepare data

In [None]:
# Category label -> list of fact strings imported from the `facts` module.
facts_dict = dict(
    python_language_facts=python_language_facts,
    python_snake_facts=python_snake_facts,
    dog_facts=dog_facts,
    cat_facts=cat_facts,
)

# Flatten into (category, fact) pairs, one row per fact, and load them into a dataframe.
facts_list = []
for category, facts in facts_dict.items():
    for fact in facts:
        facts_list.append((category, fact))
df = pd.DataFrame(facts_list, columns=["category", "fact"])
df

## Step 2 - Embeddings

In [None]:
# Embed every fact in one batch call and keep the vectors next to their source rows.
fact_texts = df["fact"].to_list()
embeddings = llm.batch_embed(fact_texts)
df_with_embeddings = df.assign(embedding=embeddings)


In [None]:
# Display the raw embeddings returned by llm.batch_embed for the facts.
embeddings

## Step 3 - Dimension reduction and visualization

### This is done with the t-distributed stochastic neighbor embedding (t-SNE) algorithm

In [None]:
def reduce_dimensions(embeddings, dimensions=2):
    """Project embeddings down to `dimensions` components with t-SNE.

    Args:
        embeddings: array-like with one row per sample; passed to
            ``TSNE.fit_transform``.
        dimensions: number of output components (``n_components``).

    Returns:
        The array produced by ``TSNE.fit_transform``, one row per input row.
    """
    n_samples = len(embeddings)
    # Recent scikit-learn versions require perplexity < n_samples. The library
    # default is 30.0, so small corpora (<= 30 texts) would raise a ValueError.
    # Clamp it for small inputs; larger inputs keep the default.
    perplexity = min(30.0, float(max(n_samples - 1, 1)))
    tsne = TSNE(
        n_components=dimensions,
        random_state=42,  # fixed seed so the layout is reproducible between runs
        init="pca",
        perplexity=perplexity,
    )
    return tsne.fit_transform(embeddings)


# Project the fact embeddings onto a plane and attach the coordinates as x / y columns.
reduced_embeddings_2d = reduce_dimensions(np.array(embeddings), 2)
df_with_2d_embeddings = df.assign(
    x=reduced_embeddings_2d[:, 0],
    y=reduced_embeddings_2d[:, 1],
)

### Inspect the data

In [None]:
# First three reduced points: one (x, y) row per fact.
reduced_embeddings_2d[:3]

In [None]:
# Same three facts, now with their category, text and x / y coordinates.
df_with_2d_embeddings.head(3)

In [None]:
# Plot the 2-D projection of the facts.
Visualizer.scatter(df_with_2d_embeddings)

### Let's try 3 dimensions, just for fun

In [None]:
# Same reduction as before, but into three components stored as x / y / z columns.
reduced_embeddings_3d = reduce_dimensions(np.array(embeddings), 3)
df_with_3d_embeddings = df.assign(
    **{axis: reduced_embeddings_3d[:, position] for position, axis in enumerate("xyz")}
)


In [None]:
# Plot the frame that carries x / y / z coordinates.
Visualizer.scatter(df_with_3d_embeddings)

## Step 4 - Create some utilities

In [None]:
def add_texts_to_df_and_plot(texts, category, df_to_add_to, llm, dimensions=2, is_visualization_enabled=True):
    """Embed `texts`, append them to a frame, and recompute the reduced coordinates.

    Args:
        texts: list of strings to embed with ``llm.batch_embed``.
        category: label stored in the ``category`` column for every new row.
        df_to_add_to: existing frame; must already contain an ``embedding``
            column. It is not modified.
        llm: connector object providing ``batch_embed``.
        dimensions: number of components requested from ``reduce_dimensions``
            (2 -> x, y; 3 -> x, y, z).
        is_visualization_enabled: when true, the combined frame is passed to
            ``Visualizer.scatter``.

    Returns:
        Tuple ``(embedded_texts, combined_df)``: the embeddings of the new
        texts, and a new frame holding the old rows followed by the new rows,
        with coordinate columns recomputed for every row.
    """
    embedded_texts = llm.batch_embed(texts)
    new_df = pd.DataFrame({
        'category': [category] * len(texts),
        'fact': texts,
        'embedding': embedded_texts,
    })
    # New rows are appended after the existing ones; the index is rebuilt 0..n-1.
    combined_df = pd.concat([df_to_add_to, new_df], ignore_index=True)

    # The reduction is re-fitted on the full set of embeddings, so coordinates
    # of existing rows are overwritten as well.
    all_embeddings = combined_df['embedding'].tolist()
    reduced_embeddings = reduce_dimensions(np.array(all_embeddings), dimensions)

    # One coordinate column per requested component. The original always
    # assigned to ['x', 'y'], which failed for dimensions=3.
    coordinate_columns = ['x', 'y', 'z'][:dimensions]
    combined_df[coordinate_columns] = reduced_embeddings

    if is_visualization_enabled:
        Visualizer.scatter(combined_df)
    return embedded_texts, combined_df


# Add the lion facts as a fifth category and plot the combined frame.
embedded_texts, df_with_lion_facts = add_texts_to_df_and_plot(
    texts=lion_facts,
    category="lion_facts",
    df_to_add_to=df_with_embeddings,
    llm=llm,
)


Note: different texts with the same or a similar meaning will be placed close to each other.

In [None]:
# Embed a question, add it to the plot as its own "question" category, and keep
# both its embedding and the combined frame for the distance calculation below.
question_embedding, result = add_texts_to_df_and_plot(["how much sleep a cat need"], "question", df_with_lion_facts, llm)
# Alternative phrasing of the same question (swap in for the line above to compare):
#question_embedding, result = add_texts_to_df_and_plot(["how much resting time does a cat need"], "question", df_with_lion_facts, llm)


# Even a deliberately absurd paraphrase can be tried the same way:
#question_embedding, result = add_texts_to_df_and_plot(["what is the quantity of time being not awake for a meowing 4 legged mammal to have"], "question", df_with_lion_facts, llm)

## Step 5 - Calculate distances

In [None]:
# To inspect a single row and its distance to the question, try:
#   result.iloc[53]
#   cosine_distances(question_embedding, [result['embedding'][53]])

# Cosine distance from the question to every row of `result` (one row in the matrix,
# because exactly one question was embedded).
distance_matrix = cosine_distances(question_embedding, result["embedding"].tolist())
distance_list = distance_matrix[0].tolist()
distance_list

In [None]:

# Attach the distances and order the rows from closest to farthest.
df_with_distances = (
    result
    .assign(distance=distance_list)
    .sort_values(by="distance")
)

df_with_distances[["fact", "distance"]]



## Final step - put it all together

In [None]:
def get_n_closest_texts(question, df, n=8, llm=llm):
    """Return the `n` facts whose embeddings are closest to the question.

    Args:
        question: question text to embed.
        df: frame with ``fact`` and ``embedding`` columns to search in.
        n: maximum number of facts to return.
        llm: connector used for embedding the question.

    Returns:
        List of at most `n` fact strings, ordered by ascending cosine distance
        to the question.
    """
    question_embedding, result = add_texts_to_df_and_plot([question], "question", df, llm)
    # add_texts_to_df_and_plot appends the question as the last row of `result`.
    # Leave it out of the ranking: it has distance 0 to itself and would
    # otherwise always take the first context slot.
    candidates = result.iloc[:-1]
    distance_list = cosine_distances(question_embedding, candidates['embedding'].tolist())[0].tolist()
    ranked = candidates.assign(distance=distance_list).sort_values(by='distance')
    return ranked['fact'].tolist()[:n]


def ask_contexted_questions(question, related_texts, llm=llm):
    """Ask `llm` to answer `question` using `related_texts` as the context.

    Both values are interpolated into a fixed prompt template, and the result
    of ``llm.text_completion`` for that prompt is returned.
    """
    return llm.text_completion(f"""
    Context: {related_texts}

    Based on the context above, answer the following question.
    If the answer is in the context, include the provided context pieces in your answer:

    Question: {question}\nAnswer:""")


def execute_visualized_rag(question, context_df, llm=llm):
    """Retrieve the closest facts for `question`, ask the LLM, and print the result.

    Args:
        question: question text.
        context_df: frame with ``fact`` and ``embedding`` columns used as the
            retrieval corpus.
        llm: connector used for both the retrieval embedding and the completion.

    Prints the answer followed by the retrieved facts; returns nothing.
    """
    # Forward `llm` so retrieval and completion use the same connector. The
    # original call fell back to the module-level default here.
    n_closest_texts = get_n_closest_texts(question, context_df, llm=llm)

    print("RAG-Answer:\n##############")
    print(ask_contexted_questions(question, related_texts=n_closest_texts, llm=llm))
    print("##############")
    for text in n_closest_texts:
        print(text)



In [None]:
# Run the full pipeline for a question about one of the loaded categories.
execute_visualized_rag("how much sleep a cat need", context_df=df_with_lion_facts)

In [None]:
# A question unrelated to the loaded fact categories (cats, dogs, pythons, lions).
execute_visualized_rag("what is isaac newton known for?", context_df=df_with_lion_facts)

In [None]:
from facts import thaipy_facts

# Extend the corpus with the ThaiPy facts; plotting is skipped for this step.
embedded_thaipy_facts, df_with_thaipy_facts = add_texts_to_df_and_plot(
    texts=thaipy_facts,
    category="thaipy_facts",
    df_to_add_to=df_with_lion_facts,
    llm=llm,
    is_visualization_enabled=False,
)

In [None]:
# Ask about ThaiPy against the corpus that now includes the ThaiPy facts.
execute_visualized_rag("what is thaipy", context_df=df_with_thaipy_facts)