<a href="https://colab.research.google.com/github/Tomershigani/-RAG-pipeline/blob/main/Data_Lab_Q3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install sentence_transformers





In [3]:
!pip install datasets



In [4]:
!pip install pinecone-client



In [5]:
!pip install cohere



In [6]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [7]:
def _read_key_file(path):
    """Return the contents of the text file at `path`, stripped of surrounding whitespace."""
    with open(path) as key_file:
        return key_file.read().strip()


# API credentials are read from local text files instead of being written into the notebook.
COHERE_API_KEY = _read_key_file("cohorapikey.txt")
PINECONE_API_KEY = _read_key_file("pinecone_api_key.txt")

In [8]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used for embedding in this notebook.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Encoder instance; it is passed explicitly to the embedding and retrieval helpers below.
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
def load_and_embedd_dataset(
        dataset_name: str = "huggingartists/lady-gaga",
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'text',
        rec_num: int = 723
) -> tuple:
    """
    Load a dataset and embed its text field using a sentence-transformer model.

    Args:
        dataset_name: The name of the dataset to load.
        split: The split of the dataset to load.
        model: The model to use for embedding. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand
            (not at function-definition time, so callers that pass their own
            model never pay for loading a second one).
        text_field: The field in the dataset that contains the text.
        rec_num: The number of leading records to embed.
    Returns:
        tuple: `(dataset, embeddings)` - the loaded split (not truncated) and the
        embeddings of its first `rec_num` texts.
    """
    from datasets import load_dataset

    print("Loading and embedding the dataset")

    # Build the default encoder lazily, only when the caller did not supply one.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the requested split using the dataset's 'default' configuration.
    dataset = load_dataset(dataset_name, 'default', split=split)

    # Embed the first `rec_num` rows of the dataset
    embeddings = model.encode(dataset[text_field][:rec_num])

    print("Done!")
    return dataset, embeddings

In [10]:
# Dataset that serves as the knowledge base for retrieval.
DATASET_NAME = "huggingartists/lady-gaga"

# Load the dataset and embed its first 723 records with the shared `model`.
dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=723,
    model=model,
)
# Kept for later: shape[1] is passed as the dimension when the Pinecone index is created.
shape = embeddings.shape

Loading and embedding the dataset


Downloading data:   0%|          | 0.00/409k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/723 [00:00<?, ? examples/s]

Done!


In [11]:
pd_dataset = dataset.to_pandas()
pd_dataset.head(5)

Unnamed: 0,text
0,"Tell me somethin, girl\nAre you happy in this ..."
1,Mum mum mum mah\nMum mum mum mah\nMum mum mum ...
2,I didnt ask for a free ride\nI only asked you ...
3,"Oh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh\nCaught ..."
4,That Arizona sky\nBurnin in your eyes\nYou loo...


In [12]:
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (723, 384)


## Second Element - Vector Database


In [13]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        api_key: "str | None" = None,
        cloud: str = "aws",
        region: str = "us-east-1",
):
    """
    Create a serverless Pinecone index if it does not exist.

    Args:
        index_name: The name of the index.
        dimension: The dimension of the vectors stored in the index.
        metric: The metric to use for the index.
        api_key: The Pinecone API key. When omitted, the notebook-level
            `PINECONE_API_KEY` is used.
        cloud: The cloud provider passed to the serverless spec.
        region: The cloud region passed to the serverless spec.
    Returns:
        Pinecone: A pinecone client object which can later be used for upserting
        vectors and connecting to VectorDBs. It is returned whether the index was
        just created or already existed.
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")

    # Fall back to the notebook-level key so existing call sites keep working.
    if api_key is None:
        api_key = PINECONE_API_KEY
    pc = Pinecone(api_key=api_key)

    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [14]:
# Name of the Pinecone index that will hold the embeddings.
INDEX_NAME = "huggingartists-lady-gaga"

# Create the vector database (only if an index with this name does not exist yet).
# We are passing the index name and the size of our embeddings (shape[1]).
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


In [15]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index.

    Each embedding row `i` is stored under the id `str(i)` with the metadata
    `{text_field: <i-th text of the dataset>}`.

    Args:
        index: The pinecone index handle (the object returned by `pc.Index(name)`,
            not the `Pinecone` client itself); only its `upsert` method is used.
        embeddings: The embeddings to upsert, one row per vector.
        dataset: Mapping-like object (a dict or a loaded dataset) whose
            `text_field` entry holds the texts, in the same order as `embeddings`.
            Texts beyond the number of embeddings are ignored.
        text_field: The dataset field that holds the text; also used as the
            metadata key.
        batch_size: The number of vectors sent per upsert request.
    Returns:
        The same pinecone index object that was passed in.
    Raises:
        ValueError: If `batch_size` is not positive, or if the dataset holds
            fewer texts than there are embeddings.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("Upserting the embeddings to the Pinecone index...")
    n_vectors = embeddings.shape[0]

    texts = dataset[text_field]
    # zip() would silently drop the unmatched vectors, so fail loudly instead.
    if len(texts) < n_vectors:
        raise ValueError(
            f"dataset has {len(texts)} '{text_field}' entries but {n_vectors} embeddings were given"
        )

    # create list of (id, vector, metadata) tuples to be upserted;
    # vectors are converted to plain lists of Python floats.
    to_upsert = [
        (str(i), vector.tolist(), {text_field: text})
        for i, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    for start in tqdm(range(0, n_vectors, batch_size)):
        # Slicing past the end is safe, so the last batch may simply be shorter.
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [16]:
# Get a handle to the index by name, then upload the embeddings with their texts as metadata.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 6/6 [00:02<00:00,  2.08it/s]


In [17]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 723}},
 'total_vector_count': 723}

## Third Element - LLM


In [18]:
import cohere

# First, let's write a query and send it to the LLM as-is (no retrieved context),
# to see what the model answers on its own.
query = "which lady gaga song contain the phrase That Arizona sky Burnin in your eyes ? "
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query,
    )
response.text

'The phrase "That Arizona sky, burnin\' in your eyes" is from the song "Born This Way" by Lady Gaga.'

In [19]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'text',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base.

    Args:
        query: The query to augment.
        model: The model used to embed the query; it should be the same model
            that produced the vectors stored in `index`. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand (not at
            function-definition time).
        index: The vectorstore object to query. Required, despite the `None`
            default kept for backward compatibility.
        top_k: The number of nearest matches to retrieve (3 by default).
        text_field: The metadata key under which each match stores its text.
    Returns:
        tuple: `(augmented_prompt, source_knowledge)` - the prompt to send to the
        LLM and the retrieved texts joined by blank lines.
    Raises:
        ValueError: If `index` is None.
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index to query, got index=None")

    # Build the default encoder lazily, only when the caller did not supply one.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The index expects plain Python floats rather than numpy scalars.
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; only the metadata is needed,
    # so the stored vector values are not requested.
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [20]:
# Let us remember our query
query = "which lady gaga song contain the phrase That Arizona sky Burnin in your eyes ?"
# This time the query is wrapped together with the texts retrieved from the index
# before it is sent to the LLM.
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
response = co.chat(
        model='command-r-plus',
        message=augmented_prompt,
    )
response.text

'Always Remember Us This Way'

In [21]:
print(source_knowledge)

That Arizona sky
Burnin in your eyes
You look at me and babe I wanna catch on fire
It’s buried in my soul
Like California gold
You found the light in me that I couldn’t find
So when Im all choked up and I cant find the words
Dreams of you and me are in the dirt
When the sun goes down
And the band wont play
Ill always remember us this way

SetlistLady Gaga
- God Bless America / This Land Is Your Land/ Pledge of Allegiance 
- Poker Face
- Born This Way
- Telephone
- Just Dance
- Million Reasons
- Bad Romance
Medley Intro
- Lady Gaga also includes a medley intro you can hear when Gaga lowers down onto the tower, the medley includes music from Dance in the Dark, vocals from Just Dance, LoveGame and Paparazzi.Lyrics PerformedLady Gaga:
God bless America
Land that I love
Stand beside her and guide her
Through the night with a light from above
This land is your land, this land is my land
This land was made for you and me
One nation under God, indivisible, with liberty and justice for all...
I

In [22]:
# Second example, baseline: the raw query is sent to the LLM without retrieved context.
query2 = "which lady gaga song contain the phrase: I dont wanna talk anymore "
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query2,
    )
response.text

'Lady Gaga\'s song "Poker Face" contains the phrase "I don\'t wanna talk anymore". The relevant lyrics are: \n\n> Po-po-po-Poker Face, Po-po-po-Poker Face\n> (I wanna roll with him, a hard pair we will be)\n> A little gambling is fun when you\'re with me, I love it\n> (Russian Roulette is not the same without a gun)\n> And, baby, when it\'s love, if it\'s not rough, it isn\'t fun, you know me\n> (I\'ll get him hot, show him what I\'ve got)\n> Can\'t read my, can\'t read my\n> No, he can\'t read my poker face\n> (She\'s got me like nobody)\n> Can\'t read my, can\'t read my\n> No, he can\'t read my poker face\n> (He\'s got me like nobody)\n> P-p-p-poker face, p-p-poker face\n> I wanna roll with him a hard pair we will be\n> A little gambling is fun when you\'re with me, I love it\n> (Russian Roulette is not the same without a gun)\n> R-r-roulette with my heart, r-r-roulette with my heart\n> I don\'t wanna talk anymore\n> I don\'t wanna kiss anymore\n> I don\'t wanna touch anymore\n> I do

In [23]:
# Second example, with retrieval: the same query is augmented with texts from the index.
query2 = "which lady gaga song contain the phrase: I dont wanna talk anymore "
augmented_prompt, source_knowledge = augment_prompt(query2, model=model, index=index)
response = co.chat(
        model='command-r-plus',
        message=augmented_prompt,
    )
response.text

'The Lady Gaga song that contains the phrase "I don\'t wanna talk anymore" is "Telephone."'

In [24]:
print(source_knowledge)

Police 1: I told you she didn’t have a d
Police 2: Too bad
Speaker: Lady Gaga, you got a call
Speaker: Beyoncé on the line for Gaga
Hello, hello, baby, you called? I cant hear a thing
I have got no service in the club, you say, say? 
Wha-wha-what did you say, huh? Youre breaking up on me
Sorry, I cannot hear you, Im kinda busy 
K-kinda busy , k-kinda busy 
Sorry, I cannot hear you, Im kinda busy
Just a second, its my favorite song theyre gonna play 
And I cannot text you with a drink in my hand, eh
You shouldve made some plans with me, you knew that I was free
And now you wont stop calling me, Im kind of busy
Stop callin, stop callin, I dont wanna think anymore
I left my head and my heart on the dance floor
Stop callin, stop callin, I dont wanna talk anymore
I left my head and my heart on the dance floor
Eh, eh, eh, eh, eh, eh, eh, eh, eh, eh, eh
Stop telephoning me!
Eh, eh, eh, eh, eh, eh, eh, eh, eh, eh 
Im busy!
Eh, eh, eh, eh, eh, eh, eh, eh, eh, eh 
Stop telephoning me!
Eh, eh, eh

In [28]:
# Third example, baseline: the raw query is sent to the LLM without retrieved context.
query3 = "when was the song Charlotte Nights by Lady Gaga released?"
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query3,
    )
response.text

"'Charlotte Nights' by Lady Gaga was released on May 21, 2020, as part of her sixth studio album, *Chromatica*. The song is a collaboration with American singer-songwriter Charlotte Lawrence and is the thirteenth track on the album."

In [29]:
# Third example, with retrieval: the same query is augmented with texts from the index.
query3 = "when was the song Charlotte Nights by Lady Gaga released?"
augmented_prompt, source_knowledge = augment_prompt(query3, model=model, index=index)
response = co.chat(
        model='command-r-plus',
        message=augmented_prompt,
    )
response.text

'The song "Charlotte Nights" by Lady Gaga is unreleased, so it doesn\'t have a release date.'

In [30]:
print(source_knowledge)

Charlotte Nights is a unreleased song written by Lady Gaga during The Monster Ball Tour

There are no official lyrics for this song at the moment

Gaga, Oh la-la
Gaga, Oh la-la
Ra-ra-ah-ah-ah
Roma Roma-ma
Gaga, Oh la-la
Ga-a, Oh la-la
Ra-ra-ah-ah-ah
Roma Roma-ma
Gaga, Oh la-la
Want your bad romance
I want your ugly, I want your disease
I want your everything as long as it’s free
I want your love
Love, love, love, I want your love
I want your drama, the touch of your hand
I want your leather-studded kiss in the sand
I want your love
Love, love, love, I want your love
You know that I want you
And you know that I need you
I want it bad
Your bad romance
I want your love, and I want your revenge
You and me could write a bad romance
I want your love, and I want your revenge
You and me could write a bad romance
Oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh
Caught in a bad romance
Oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh
Caught in a bad romance
I want your horror, I want your design
Cause you’re a criminal a