In [1]:
!pip install -q transformers sentence-transformers datasets cohere pinecone

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.8/173.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.4/214.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [18]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
import cohere
import pinecone
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### API Keys:

In [3]:
from google.colab import userdata
# Read the API keys from Colab's `userdata` store so they are never hard-coded in the notebook
COHERE_API_KEY = userdata.get('COHERE_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

## First Element - Embedding Model

In [4]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model used to embed both the dataset and the queries
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
def load_and_embed_dataset(
        dataset_name: str,
        split: str,
        model: "SentenceTransformer",
        text_field: str,
        rec_num: "int | None" = None
) -> tuple:
    """
    Load a dataset and embed one of its text fields using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding
        text_field: The field in the dataset that contains the text
        rec_num: Optional non-negative cap on the number of records to load and embed.
            When None (the default) the whole split is used.
    Returns:
        tuple: A tuple containing the (possibly truncated) dataset and the embeddings
    """
    print("Loading and embedding the dataset")

    # Load the dataset and, when requested, keep only the first `rec_num` records
    dataset = load_dataset(dataset_name, split=split)
    if rec_num is not None:
        dataset = dataset.select(range(min(rec_num, len(dataset))))

    # Materialize the column as a plain list of strings before handing it to the encoder,
    # so the call does not depend on the container type the dataset returns for a column
    embeddings = model.encode(list(dataset[text_field]))

    print("Done!")
    return dataset, embeddings

In [20]:
# Dataset identifier passed to `load_dataset`
DATASET_NAME = 'RealTimeData/bbc_news_july_2023'

# Load the train split and embed its 'content' field with the embedding model
dataset, embeddings = load_and_embed_dataset(
    dataset_name=DATASET_NAME,
    split='train',
    model=model,
    text_field='content'
)
# Keep the embeddings' shape; shape[1] is later passed as the dimension of the vector index
shape = embeddings.shape

Loading and embedding the dataset
Done!


## Second Element - Vector Database

In [21]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str
) -> Pinecone:
    """
    Make sure a serverless Pinecone index with the given name exists, creating it when missing
    Args:
        index_name: The name of the index to look up or create
        dimension: The dimension of the vectors the index will hold
        metric: The similarity metric the index will use
    Returns:
        Pinecone: The Pinecone client, which can later be used to connect to the index and upsert vectors
    """
    print("Creating a Pinecone index...")
    client = Pinecone(api_key=PINECONE_API_KEY)

    # Collect the names of the indexes that already exist for this API key
    known_names = []
    for description in client.list_indexes():
        known_names.append(description["name"])

    index_missing = index_name not in known_names
    if index_missing:
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        # Remember! The metric used by the VectorDB must be one the embedding model works well with.
        client.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=serverless
        )
    print("Done!")
    return client

In [22]:
INDEX_NAME = 'bbc-news-july-2023'

# Create the vector database (only if an index with this name does not exist yet).
# The index dimension is the size of our embeddings (shape[1]); cosine is the similarity metric.
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1], metric="cosine")

Creating a Pinecone index...
Done!


In [24]:
def upsert_vectors(
        index: "pinecone.data.index.Index",
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str,
        batch_size: int
) -> "pinecone.data.index.Index":
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata
        text_field: The field in the dataset whose text is stored as each vector's metadata
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    """
    # NOTE: the index annotations are quoted on purpose, so defining this function does not
    # require resolving the client's internal module path at definition time.
    print("Upserting the embeddings to the Pinecone index...")
    num_vectors = embeddings.shape[0]

    # Vector ids are simply the row positions, as strings
    ids = [str(i) for i in range(num_vectors)]
    # Send plain Python lists of floats rather than numpy rows
    vectors = embeddings.tolist()
    # Truncate the text to 40959 UTF-8 bytes (presumably to stay under Pinecone's per-vector
    # metadata size limit - confirm against the Pinecone docs); 'ignore' drops a multi-byte
    # character that the cut may have split.
    meta = [
        {text_field: text.encode('utf-8')[:40959].decode('utf-8', 'ignore')}
        for text in dataset[text_field]
    ]

    # create list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, vectors, meta))

    # Slicing past the end of a list is safe, so the last (shorter) batch needs no special casing
    for start in tqdm(range(0, num_vectors, batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index

In [25]:
# Upsert the embeddings to the Pinecone index
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index=index, embeddings=embeddings, dataset=dataset, text_field='content', batch_size=128)

Upserting the embeddings to the Pinecone index...


100%|██████████| 16/16 [00:07<00:00,  2.04it/s]


In [26]:
# Sanity check: display the index statistics (dimension and vector counts) after the upsert
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1928}},
 'total_vector_count': 1928}

## Third Element - LLM
We will use [Cohere's chat API](https://cohere.com/chat)

## Fourth Element - Query Function

In [30]:
def augment_prompt(
        query: str,
        index: "pinecone.data.index.Index",
        model: "SentenceTransformer",
        text_field: str,
        top_k: int = 5
) -> tuple[str, str]:
    """
    Augment the prompt with the top `top_k` results from the knowledge base
    Args:
        query: The query to augment
        index: The vectorstore object to search
        model: The embedding model used to embed the query
        text_field: The metadata field of each match that holds its text
        top_k: The number of matches to retrieve from the knowledge base (default: 5)
    Returns:
        tuple[str, str]: The augmented prompt and the source knowledge it was built from
    """
    # Embed the query and convert it to a plain list of Python floats for the index client
    query_vector = model.encode(query).tolist()

    # get the top_k results from the knowledge base; only the metadata is needed,
    # so the vector values themselves are not requested
    matches = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']

    # get the text from the results
    text_matches = [match['metadata'][text_field] for match in matches]
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using only the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [39]:
# QA Model: Cohere client used for both the plain and the retrieval-augmented chat calls
co = cohere.Client(api_key=COHERE_API_KEY)

In [40]:
# Queries for the LLM (each one is answered both without and with retrieved context):
queries = ["What was the score in the Wimbledon’s 2023 final between Carlos Alcaraz and Novak Djokovic?",
           "What happened to Esther Wang?",
           "What happened to Dutch racing driver Dilano Van’t Hoff at the formula regional European championship?"]

In [41]:
def add_underline(text: str = ""):
    """ Wraps text in ANSI escape codes so it is underlined when printed using `print`. """
    underline_on, reset = "\033[4m", "\033[0m"
    return "{}{}{}".format(underline_on, text, reset)

In [43]:
# For every query, print the answer of the plain chat model next to the answer of the RAG pipeline
for i, query in enumerate(queries):
    print("\n" + add_underline(text=f"Query {i+1}") + f": {query}")
    # Baseline: send the raw query to the model, without any retrieved context
    response = co.chat(
            model='command-r-plus',
            message=query,
    )
    print(add_underline(text="Standard QA Model Answer") + ":")
    print(response.text)

    # RAG: build a prompt that embeds the retrieved 'content' passages, then ask the same model
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index, text_field='content')
    response = co.chat(
            model='command-r-plus',
            message=augmented_prompt
    )
    print(add_underline(text="RAG Pipeline Answer") + ":")
    print(response.text)


[4mQuery 1[0m: What was the score in the Wimbledon’s 2023 final between Carlos Alcaraz and Novak Djokovic?
[4mStandard QA Model Answer[0m:
The Wimbledon 2023 final between Carlos Alcaraz and Novak Djokovic was a thrilling match that ended with a score of 1-6, 7-6 (8-6), 6-1, 3-6, 6-4 in favor of Novak Djokovic. It was a hard-fought contest that showcased the skills and determination of both players. With this victory, Djokovic claimed his 22nd Grand Slam title and 7th Wimbledon championship.
[4mRAG Pipeline Answer[0m:
The score of the 2023 Wimbledon final between Carlos Alcaraz and Novak Djokovic was 1-6 7-6 (8-6) 6-1 3-6 6-4, with Alcaraz winning the match and becoming the Wimbledon men's singles champion for the first time.

[4mQuery 2[0m: What happened to Esther Wang?
[4mStandard QA Model Answer[0m:
Esther Wang, a prominent social media influencer and YouTuber, passed away on January 12, 2022, at the age of 27. Her death was confirmed by her family and friends through soc