# Start by loading the API keys and defining the settings

In [3]:
def _read_key(path: str) -> str:
    """Return the text stored in `path` with leading/trailing whitespace removed."""
    with open(path, mode="r") as key_file:
        return key_file.read().strip()


# Credentials live in local text files so they never appear in the notebook itself.
PINECONE_KEY = _read_key("pinecone_key.txt")
COHERE_KEY = _read_key("cohere_key.txt")

# Imports

In [4]:
from sentence_transformers import SentenceTransformer # type: ignore
from datasets import load_dataset # type: ignore
from pinecone import Pinecone, ServerlessSpec
import numpy as np
from tqdm import tqdm
from pprint import pprint
import cohere

2024-06-30 11:14:51.018617: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 11:14:51.018720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 11:14:52.041302: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-30 11:14:54.660307: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Single place to configure the pipeline: adjust the values below and re-run the notebook.
#
# Alternative "Dataset" section that can be swapped in (everything else stays the same):
#     "dataset_name": "neural-bridge/rag-dataset-12000",
#     "split": "train",
#     "text_field": "context",
#     "rec_num": 1_000,
SETTINGS = dict(
    # Name handed to SentenceTransformer to build the embedding model
    Embedding=dict(model="all-MiniLM-L6-v2"),
    Dataset=dict(
        dataset_name="Ateeqq/news-title-generator",
        split="train",
        text_field="text",  # column that is embedded and stored as vector metadata
        rec_num=2_000,      # number of leading rows that get embedded
    ),
    # Model name passed to the Cohere chat endpoint
    LLM=dict(model="command-r-plus"),
    # Expanded as keyword arguments into ServerlessSpec when an index is created
    PineconeSettings=dict(cloud="aws", region="us-east-1"),
)
pprint(SETTINGS)

{'Dataset': {'dataset_name': 'Ateeqq/news-title-generator',
             'rec_num': 2000,
             'split': 'train',
             'text_field': 'text'},
 'Embedding': {'model': 'all-MiniLM-L6-v2'},
 'LLM': {'model': 'command-r-plus'},
 'PineconeSettings': {'cloud': 'aws', 'region': 'us-east-1'}}


In [6]:
def load_and_embedd_dataset(
        dataset_name: str,
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'Excerpt',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embed the first `rec_num` rows of one text column.

    Args:
        dataset_name: Name of the dataset, passed to `load_dataset`
        split: Dataset split to load, passed to `load_dataset`
        model: SentenceTransformer used for encoding. When omitted, an
            'all-MiniLM-L6-v2' model is created on first use (not at function
            definition time, so defining this function stays cheap).
        text_field: Name of the column whose text is embedded
        rec_num: Number of leading rows to embed
    Returns:
        (dataset, embeddings): the full loaded dataset and the output of
        `model.encode` for the first `rec_num` texts of `text_field`
    """
    print("Loading and embedding the dataset")

    # Build the fallback model lazily; a default argument would be evaluated
    # once when the `def` statement runs, even if every caller passes a model.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Dataset is clean and short, so no need for additional preprocessing or chunking
    dataset = load_dataset(dataset_name, split=split)

    # Slice the rows first, then pick the column, so only `rec_num` texts are
    # materialised instead of the whole column.
    texts = dataset[:rec_num][text_field]
    embeddings = model.encode(texts)

    return dataset, embeddings

In [7]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """
    Make sure a serverless Pinecone index called `index_name` exists.

    The index is only created when no index with that name is listed for the
    account; an existing index is left untouched.

    Args:
        index_name: Name of the index to look up / create
        dimension: Vector dimension used if the index has to be created
        metric: Similarity metric used if the index has to be created
    Returns:
        Pinecone: the client object, usable afterwards to open the index and upsert vectors
    """
    print("Creating a Pinecone index...")
    client = Pinecone(api_key=PINECONE_KEY)

    known_names = {info["name"] for info in client.list_indexes()}
    if index_name in known_names:
        print("Done!")
        return client

    # The metric configured on the vector DB must be one the embedding model
    # works well with, otherwise similarity search quality suffers.
    serverless_spec = ServerlessSpec(**SETTINGS["PineconeSettings"])
    client.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=serverless_spec,
    )
    print("Done!")
    return client

In [8]:
def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'highlights',
        batch_size: int = 128
):
    """
    Upsert embeddings, together with their source text, into a Pinecone index.

    Row `i` of `embeddings` is stored under the id `str(i)` with the metadata
    `{text_field: dataset[text_field][i]}`.

    Args:
        index: The Pinecone index handle to upsert into (must expose `upsert`)
        embeddings: 2-D array with one embedding per row
        dataset: Mapping-like dataset; `dataset[text_field]` must hold at least
            as many texts as there are embeddings
        text_field: Column of `dataset` whose text is stored as metadata
        batch_size: Number of vectors sent per upsert request
    Returns:
        The same index object that was passed in
    Raises:
        ValueError: if `dataset[text_field]` has fewer entries than `embeddings` has rows
    """
    print("Upserting the embeddings to the Pinecone index...")
    total = embeddings.shape[0]

    # Only the texts that actually have an embedding are needed; the dataset
    # may be far larger than the embedded prefix.
    texts = dataset[text_field][:total]
    if len(texts) < total:
        raise ValueError(
            f"dataset[{text_field!r}] has {len(texts)} rows but {total} embeddings were given"
        )

    for start in tqdm(range(0, total, batch_size)):
        stop = min(start + batch_size, total)
        # (id, vector, metadata) tuples for this batch only
        batch = [
            (str(row), embeddings[row], {text_field: texts[row]})
            for row in range(start, stop)
        ]
        index.upsert(vectors=batch)
    return index

In [9]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
) -> tuple:
    """
    Augment the prompt with the top 3 results from the knowledge base.

    Args:
        query: The query to embed, search with, and place at the end of the prompt
        model: SentenceTransformer used to embed the query. When omitted, an
            'all-MiniLM-L6-v2' model is created on first use.
        index: The vectorstore (Pinecone index) object to query; required
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the full prompt text and
        the retrieved passages joined by blank lines
    Raises:
        ValueError: if no index is supplied
    """
    if index is None:
        raise ValueError("augment_prompt requires an index to retrieve context from")
    # Create the fallback model lazily rather than as a default argument, which
    # would load a model as soon as this function is defined.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Metadata key under which the passage text was stored at upsert time
    field = SETTINGS["Dataset"]["text_field"]

    # Plain Python floats are sent as the query vector
    query_vector = [float(val) for val in model.encode(query)]

    # get top 3 results from knowledge base; only metadata is read below, so
    # the stored vectors themselves are not requested
    matches = index.query(
        vector=query_vector,
        top_k=3,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][field] for match in matches]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [62]:
class RAG:
    """
    Retrieval-augmented generation pipeline driven by a settings dictionary.

    On construction the class loads the embedding model, loads and embeds the
    dataset, creates the Pinecone index if it does not already exist (upserting
    the embeddings only into a newly created index), and creates the Cohere
    client. `prompt` then lets the end user query the LLM with or without
    retrieved context.
    """

    # Answer-style instruction appended to every message sent to the LLM.
    _ANSWER_STYLE_SUFFIX = " be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know"

    def __init__(self, settings: dict, index_name: str = "test") -> None:
        """
        Args:
            settings: Dictionary with "Embedding", "Dataset" and "LLM" sections
            index_name: Name of the Pinecone index to use or create
        """
        # Set to True by _get_create_index when the index had to be created
        self.fresh_index = False
        self.model = self._load_embedding_model(settings["Embedding"])
        self.dataset, self.embeddings = self._load_and_embedd(settings["Dataset"])
        print(f'> Model and Data are loaded and data has been embedded with size {self.embeddings.shape}')
        print(f'> Initializing Index')
        self.index, self.pc = self._get_create_index(name=index_name)
        if self.fresh_index:
            print(f'> Index is empty upserting Data...')
            upsert_vectors(self.index, self.embeddings, self.dataset, settings["Dataset"]["text_field"])
        pprint(self.index.describe_index_stats())

        self.LLM_model = settings["LLM"]["model"]
        self.co = self._init_LLM()
        print(f'> LLM loaded')

    def prompt(self, query, add_context=False):
        """
        Send `query` to the LLM, optionally augmented with retrieved context.

        Args:
            query: The user's question
            add_context: When True, retrieve passages from the index and wrap
                the question in the augmented prompt
        Returns:
            dict with "text" (LLM answer), "query" (the exact message sent) and
            "source_knowledge" (retrieved passages, or None without context)
        """
        source_knowledge = None
        if add_context:
            # Retrieve with the bare question: the style instruction is not part
            # of what the user asks and must not influence the search vector.
            # The augmented prompt ends with the query, so appending the suffix
            # afterwards yields the same final message text.
            augmented, source_knowledge = augment_prompt(query, self.model, self.index)
            msg = augmented + self._ANSWER_STYLE_SUFFIX
        else:
            msg = query + self._ANSWER_STYLE_SUFFIX

        response = self.co.chat(
            model=self.LLM_model,
            message=msg
        )
        return {"text": response.text,
                "query": msg,
                "source_knowledge": source_knowledge}

    def _init_LLM(self):
        """Create the Cohere client from the module-level API key."""
        return cohere.Client(COHERE_KEY)

    def _load_embedding_model(self, model_settings):
        """Build the SentenceTransformer named in the "Embedding" settings."""
        return SentenceTransformer(model_settings["model"])

    def _load_and_embedd(self, dataset_settings):
        """Load the dataset and embed it with this instance's embedding model."""
        recs = dataset_settings["rec_num"]
        print(f"> Using {recs} records")
        return load_and_embedd_dataset(**dataset_settings, model=self.model)

    def _get_create_index(self, name):
        """
        Open the index called `name`, creating it first when it is missing.

        Sets `self.fresh_index` to True when the index was created by this call
        and records the name in `self.index_name`.

        Returns:
            (index, client): the index handle and the Pinecone client
        """
        pc = Pinecone(api_key=PINECONE_KEY)
        if name not in [index_info["name"] for index_info in pc.list_indexes()]:
            create_pinecone_index(name, dimension=self.embeddings.shape[1])
            self.fresh_index = True
        else:
            print("> Index exists, fetching it")
        self.index_name = name
        return pc.Index(self.index_name), pc



In [63]:
# Build the pipeline from SETTINGS: loads the embedding model, embeds the dataset,
# connects to (or creates) the Pinecone index and creates the Cohere client.
rag = RAG(settings=SETTINGS)

> Using 2000 records
Loading and embedding the dataset
> Model and Data are loaded and data has been embedded with size (2000, 384)
> Initializing Index
> Index exists, fetching it
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2000}},
 'total_vector_count': 2000}
> LLM loaded


In [64]:
import pandas as pd
from IPython.display import display, HTML

def check_effictiveness(query):
    """
    Compare the LLM's answer to `query` without and with retrieved context.

    The global `rag` pipeline is prompted twice - first without context from
    the database, then with it - and both results are rendered as one HTML
    table whose rows are labelled 'no' / 'yes' under the index name 'context?'.

    Args:
        query: the query you want to test on
    """
    answers = [rag.prompt(query, add_context=use_context) for use_context in (False, True)]
    comparison = pd.DataFrame(answers, columns=['text', 'query', 'source_knowledge'])
    comparison.index = pd.Index(['no', 'yes'], name='context?')
    display(HTML(comparison.to_html()))


In [65]:
# Dataset passage this question targets:
# NASA's Curiosity rover has shared its last selfie clicked on Mars' Vera Rubin Ridge, which was its home for over a year. A series of 57 pictures were stitched together to create the selfie. Curiosity had drilled its 19th sample at 'Rock Hall' on the ridge on December 15, 2018, which is also visible in the photo, NASA added.
# check_effictiveness renders its table itself and returns None, so there is nothing to capture or echo.
check_effictiveness(query="Where did the Curiosity Rover drill its last drill?")

Unnamed: 0_level_0,text,query,source_knowledge
context?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,"The last drilling performed by the Curiosity Rover was at a location named ""Cardiff"" on September 11, 2022. This site is located within the ""Clay-Bearing Unit,"" a region on Mount Sharp that is part of the Gale Crater on Mars.","Where did the Curiosity Rover drill its last drill? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know",
yes,"Rock Hall, Vera Rubin Ridge, Mars.","Using the contexts below, answer the query.\n Contexts:\n NASA's Curiosity rover has shared its last selfie clicked on Mars' Vera Rubin Ridge, which was its home for over a year. A series of 57 pictures were stitched together to create the selfie. Curiosity had drilled its 19th sample at 'Rock Hall' on the ridge on December 15, 2018, which is also visible in the photo, NASA added.\n\nIndian Navy divers on Saturday detected the body of another miner at a depth of 280 feet inside the flooded rat-hole in Meghalaya's East Jaintia Hills district, where 15 miners were trapped on December 13. ""During their search, the Navy team also stumbled on tell-tale signs like spades, a wooden cart and then located the dead miner,"" an official said.\n\nNASA's Opportunity rover has completed 15 years on Mars after it landed on January 24, 2004, and sent its first signal to Earth after a few hours. The solar-powered rover was designed to travel about 1 km and operate for 90 Martian days (sols). However, it has travelled over 45 km and logged its 5,000th sol as of February 2018.\n If the answer is not included in the source knowledge - say that you don't know.\n Query: Where did the Curiosity Rover drill its last drill? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know","NASA's Curiosity rover has shared its last selfie clicked on Mars' Vera Rubin Ridge, which was its home for over a year. A series of 57 pictures were stitched together to create the selfie. Curiosity had drilled its 19th sample at 'Rock Hall' on the ridge on December 15, 2018, which is also visible in the photo, NASA added.\n\nIndian Navy divers on Saturday detected the body of another miner at a depth of 280 feet inside the flooded rat-hole in Meghalaya's East Jaintia Hills district, where 15 miners were trapped on December 13. 
""During their search, the Navy team also stumbled on tell-tale signs like spades, a wooden cart and then located the dead miner,"" an official said.\n\nNASA's Opportunity rover has completed 15 years on Mars after it landed on January 24, 2004, and sent its first signal to Earth after a few hours. The solar-powered rover was designed to travel about 1 km and operate for 90 Martian days (sols). However, it has travelled over 45 km and logged its 5,000th sol as of February 2018."


In [43]:
# Probe a date fact with and without retrieved context.
# NOTE(review): this cell's execution count (43) is lower than its neighbours and its recorded
# output shows the query without the answer-style suffix, so the saved output looks stale -
# re-run the notebook top to bottom to refresh it.
check_effictiveness(query="On what date will facebook discontinue Moments no need to mention year?")

Unnamed: 0,text,query,source_knowledge
0,Facebook discontinued Moments on June 25.,On what date will facebook discontinue Moments no need to mention year?,
1,February 25,"Using the contexts below, answer the query.\n Contexts:\n Facebook will discontinue its standalone private photo and video-sharing app 'Moments', launched in 2015, on February 25 later this year. Users can retrieve the content stored on the app by either storing it on Facebook or downloading it to their device via a Facebook link. Facebook attributed the discontinuation to lesser people using the app but didn't share user numbers.\n\nFacebook is reportedly testing solar-powered internet drones, to beam internet connectivity from the Earth's stratosphere, in Australia with aeronautics company Airbus. It was in talks with Airbus to conduct test flights, scheduled for November and December 2018, with Airbus' Zephyr drone, the report added. In June 2018, Facebook had closed its solar-powered aircraft-building facility in the UK.\n\nFacebook is testing 'LOL' feature, a dedicated feed consisting of meme videos and other viral content. The feature, with content categorised by topics like 'For You', 'Animals', and 'Fails' is currently in private beta with around 100 high school students. Facebook said it is still finalising if the feature will become part of the main app or a standalone app.\n If the answer is not included in the source knowledge - say that you don't know.\n Query: On what date will facebook discontinue Moments no need to mention year?","Facebook will discontinue its standalone private photo and video-sharing app 'Moments', launched in 2015, on February 25 later this year. Users can retrieve the content stored on the app by either storing it on Facebook or downloading it to their device via a Facebook link. Facebook attributed the discontinuation to lesser people using the app but didn't share user numbers.\n\nFacebook is reportedly testing solar-powered internet drones, to beam internet connectivity from the Earth's stratosphere, in Australia with aeronautics company Airbus. 
It was in talks with Airbus to conduct test flights, scheduled for November and December 2018, with Airbus' Zephyr drone, the report added. In June 2018, Facebook had closed its solar-powered aircraft-building facility in the UK.\n\nFacebook is testing 'LOL' feature, a dedicated feed consisting of meme videos and other viral content. The feature, with content categorised by topics like 'For You', 'Animals', and 'Fails' is currently in private beta with around 100 high school students. Facebook said it is still finalising if the feature will become part of the main app or a standalone app."


In [66]:
# Probe a numeric fact (a project's planned cost) with and without retrieved context.
check_effictiveness("How much was East Coast Rail Link project planned to cost?")

Unnamed: 0_level_0,text,query,source_knowledge
context?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,The East Coast Rail Link (ECRL) project in Malaysia was planned to cost an estimated RM55 billion (US$13.1 billion) as of its suspension in 2018.,"How much was East Coast Rail Link project planned to cost? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know",
yes,The East Coast Rail Link project was planned to cost $20 billion.,"Using the contexts below, answer the query.\n Contexts:\n Replying to an Australian lawmaker's query on Twitter, Musk quoted nearly $1 billion as the cost to build a 50-km-long commuter tunnel through Australia's Blue Mountain. ""[S]o probably around $750M plus maybe $50M/station,"" Musk added without specifying if he was referring to US dollars. Last year, Musk unveiled the first underground transportation tunnel in Los Angeles by The Boring Company.\n\nMalaysia's Economic Affairs Minister on Saturday said the country will cancel its $20-billion East Coast Rail Link (ECRL) project with contractor China Communications Construction. ""The... cost to develop the ECRL is too big and we don't have [the] financial capacity,"" he added. Malaysia had in August 2018 cancelled a natural gas pipeline project which was also backed by China.\n\nIndia's bullet train service will say ""sorry"" to every passenger even if the train is delayed by only a minute, the National High Speed Rail Corporation (NHSRCL) has decided. The Ahmedabad-Mumbai bullet train will see 70 trips per day, running at 320 kms per hour. The fares are expected to be around 1.5 times the fare of AC First Class.\n If the answer is not included in the source knowledge - say that you don't know.\n Query: How much was East Coast Rail Link project planned to cost? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know","Replying to an Australian lawmaker's query on Twitter, Musk quoted nearly $1 billion as the cost to build a 50-km-long commuter tunnel through Australia's Blue Mountain. ""[S]o probably around $750M plus maybe $50M/station,"" Musk added without specifying if he was referring to US dollars. 
Last year, Musk unveiled the first underground transportation tunnel in Los Angeles by The Boring Company.\n\nMalaysia's Economic Affairs Minister on Saturday said the country will cancel its $20-billion East Coast Rail Link (ECRL) project with contractor China Communications Construction. ""The... cost to develop the ECRL is too big and we don't have [the] financial capacity,"" he added. Malaysia had in August 2018 cancelled a natural gas pipeline project which was also backed by China.\n\nIndia's bullet train service will say ""sorry"" to every passenger even if the train is delayed by only a minute, the National High Speed Rail Corporation (NHSRCL) has decided. The Ahmedabad-Mumbai bullet train will see 70 trips per day, running at 320 kms per hour. The fares are expected to be around 1.5 times the fare of AC First Class."


In [67]:
# Probe a named-entity fact with and without retrieved context.
# NOTE(review): "Huwaei" is misspelled in the query - confirm whether that is intentional
# (it does exercise retrieval with a typo) before changing the string.
check_effictiveness("Which video was Huwaei accused to copying for an ad?")

Unnamed: 0_level_0,text,query,source_knowledge
context?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,"In 2018, Huawei was accused of copying a video created by a Dutch filmmaker, Maarten Paus, for one of their ads. The original video, titled ""The Story of a Boy Who Wants to Be Pictured,"" was allegedly imitated in Huawei's ""Be My Eyes"" ad campaign.","Which video was Huwaei accused to copying for an ad? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know",
yes,"Huawei was accused of copying Nigel Stanford's video ""Cymatics: Science Vs. Music"" for their ad promoting the 'MediaPad M3 Lite' tablet.","Using the contexts below, answer the query.\n Contexts:\n Chinese telecoms giant Huawei has been accused of copying a music video in an ad to promote its tablet 'MediaPad M3 Lite' by New Zealand composer Nigel Stanford. Huawei's ad video, allegedly similar to Stanford's video 'Cymatics: Science Vs. Music' released in 2014, was flagged by Stanford himself. ""We're looking into this matter and will share further updates,"" Huawei said.\n\nAs part of the viral #10YearChallenge, Anil Kapoor shared a collage of his videos from various years [1989, 1999, 2009 and 2019] and wrote, ""Forget the #10YearChallenge, take the #AKChallenge!"" The video includes songs from Anil's films 'Ram Lakhan', 'Taal', a scene from 'Slumdog Millionaire' and the latest song from his upcoming film 'Ek Ladki Ko Dekha Toh Aisa Laga'.\n\nUS President Donald Trump defended high school students who appeared to confront a Native American man in a viral video. The ""students were treated unfairly with early judgements proving out to be false - smeared by media"", Trump tweeted. Other videos showed a group of black protestors hurling slurs at the students, before their encounter with the Native American man.\n If the answer is not included in the source knowledge - say that you don't know.\n Query: Which video was Huwaei accused to copying for an ad? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know","Chinese telecoms giant Huawei has been accused of copying a music video in an ad to promote its tablet 'MediaPad M3 Lite' by New Zealand composer Nigel Stanford. Huawei's ad video, allegedly similar to Stanford's video 'Cymatics: Science Vs. Music' released in 2014, was flagged by Stanford himself. 
""We're looking into this matter and will share further updates,"" Huawei said.\n\nAs part of the viral #10YearChallenge, Anil Kapoor shared a collage of his videos from various years [1989, 1999, 2009 and 2019] and wrote, ""Forget the #10YearChallenge, take the #AKChallenge!"" The video includes songs from Anil's films 'Ram Lakhan', 'Taal', a scene from 'Slumdog Millionaire' and the latest song from his upcoming film 'Ek Ladki Ko Dekha Toh Aisa Laga'.\n\nUS President Donald Trump defended high school students who appeared to confront a Native American man in a viral video. The ""students were treated unfairly with early judgements proving out to be false - smeared by media"", Trump tweeted. Other videos showed a group of black protestors hurling slurs at the students, before their encounter with the Native American man."


In [68]:
# Probe a two-part question (a name and who the person was) with and without retrieved context.
check_effictiveness("What was the name of the student that was killed in Australia at La Trobe University? and who were they?")

Unnamed: 0_level_0,text,query,source_knowledge
context?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,"Zhong Cheng Luo. He was a 26-year-old Chinese national and a postgraduate student at La Trobe University in Melbourne, Australia. He was fatally stabbed on campus in what police described as a ""horrendous, horrific attack.""","What was the name of the student that was killed in Australia at La Trobe University? and who were they? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know",
yes,Aiia Maasarwe was the name of the student who was killed in Australia while on a study abroad program at La Trobe University. She was an Israeli national.,"Using the contexts below, answer the query.\n Contexts:\n An Israeli student was killed by an unknown assailant in Australia while she was on video call with her sister. Aiia Maasarwe had been in Australia for about six months on a study abroad program at La Trobe University. A detective said police believed it was a random attack, however, they hadn't ruled out possibility Maasarwe had been sexually assaulted.\n\nA 21-year-old student who was pursuing LLB from Delhi University was found hanging in the room of her paying guest accommodation in Noida on Sunday night. She allegedly committed suicide after a man whom she had befriended online didn't pick her calls following an argument, the police said. Her parents haven't filed a complaint, the police added.\n\nArchaeologists have found the remains of British explorer Captain Matthew Flinders, who is credited with naming Australia, near a railway station in London. Captain Flinders led the first circumnavigation of Australia. The discovery of his burial site was made as archaeologists were preparing the site where a railway station will be built.\n If the answer is not included in the source knowledge - say that you don't know.\n Query: What was the name of the student that was killed in Australia at La Trobe University? and who were they? be consciense and get straight the point, maximum of 3-4 lines, if you don't know say you don't know","An Israeli student was killed by an unknown assailant in Australia while she was on video call with her sister. Aiia Maasarwe had been in Australia for about six months on a study abroad program at La Trobe University. 
A detective said police believed it was a random attack, however, they hadn't ruled out possibility Maasarwe had been sexually assaulted.\n\nA 21-year-old student who was pursuing LLB from Delhi University was found hanging in the room of her paying guest accommodation in Noida on Sunday night. She allegedly committed suicide after a man whom she had befriended online didn't pick her calls following an argument, the police said. Her parents haven't filed a complaint, the police added.\n\nArchaeologists have found the remains of British explorer Captain Matthew Flinders, who is credited with naming Australia, near a railway station in London. Captain Flinders led the first circumnavigation of Australia. The discovery of his burial site was made as archaeologists were preparing the site where a railway station will be built."
