#**Part 3**

## First, use the installation code below if needed

In [54]:
# !pip install transformers sentence-transformers datasets cohere pinecone-client



## Insert your cohere api key, and your pinecone api key below

In [55]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

COHERE_API_KEY = "2C9pAoVJ9y73ULW4XHJSI2p5fiZXKnGhv7FGUCGz"
PINECONE_API_KEY = "5540d13e-9c29-4fde-ab7f-eb428bcdd70e"
embedding_field= "document"
dataset_name= "xsum"

## Standart Model Test

In [56]:
from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset('xsum', split='train[:10]', trust_remote_code=True)  # Load a subset for testing

qa_pipeline = pipeline('question-answering')

def test_qa_model(context, question, expected_answer):
    result = qa_pipeline(question=question, context=context)
    predicted_answer = result['answer']
    score = result['score']
    is_correct = predicted_answer.strip().lower() == expected_answer.strip().lower()
    return predicted_answer, is_correct, score

context = dataset[0]['document']
questions_and_answers = [
    ("Who visited Newton Stewart to inspect the damage?", "First Minister Nicola Sturgeon"),
    ("Which town's businesses and householders were affected by the River Cree overflow?", "Newton Stewart"),
    ("What was Jeanette Tate's opinion about the multi-agency response to the flood?", "She said she could not fault the multi-agency response once the flood hit."),
    ("What specific issue did Jeanette Tate highlight regarding flood prevention?", "She said more preventative work could have been carried out to ensure the retaining wall did not fail."),
    ("What did Alex Rowley emphasize as important for flood protection plans?", "He said it was important to get the flood protection plan right but backed calls to speed up the process."),
    ("What specific location was mentioned as having commercial properties flooded?", "Victoria Street - the main shopping thoroughfare"),
    ("Which area remains under a flood alert due to constant rain?", "The Borders"),
]

failed_cases = []
for question, expected_answer in questions_and_answers:
    predicted_answer, is_correct, score = test_qa_model(context, question, expected_answer)
    if not is_correct:
        failed_cases.append({
            'context': context,
            'question': question,
            'expected_answer': expected_answer,
            'predicted_answer': predicted_answer,
            'score': score
        })

for case in failed_cases:
    print(f"Context: {case['context']}")
    print(f"Question: {case['question']}")
    print(f"Expected Answer: {case['expected_answer']}")
    print(f"Predicted Answer: {case['predicted_answer']}")
    print(f"Score: {case['score']}")
    print("\n")


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Context: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but

## Embedding Model

In [58]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

In [59]:
def load_and_embedd_dataset(
        dataset_name: str = dataset_name,
        split: str = 'train',
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        text_field: str = embedding_field,
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: A tuple containing the dataset and the embeddings
    """
    print("Loading and embedding the dataset")

    dataset = load_dataset(dataset_name, '3.0.0', split=split)
    embeddings = model.encode(dataset[text_field][:rec_num])

    print("Done!")
    return dataset, embeddings

In [60]:
DATASET_NAME = dataset_name

dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=40,
    model=model,
)
shape = embeddings.shape

Loading and embedding the dataset
Done!


In [61]:
pd_dataset = dataset.to_pandas()
pd_dataset

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
...,...,...,...
204040,The initial figure released in July was booste...,UK economic growth for the second quarter of t...,34084759
204041,"MEPs, including European Parliament chief Brex...",Theresa May's offer to give EU citizens in the...,40552318
204042,Lincoln Red Imps will bring a 1-0 lead to Glas...,Erik Sviatchenko is adamant that Celtic will p...,36781065
204043,Former Liverpool defender Mark Lawrenson expan...,People have spent a large part of this season ...,31579588


In [62]:
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (40, 384)


## VectorDB

In [63]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    print("Done!")
    return pc

In [64]:
INDEX_NAME = dataset_name

# Create the vector database
# We are passing the index_name and the size of our embeddings
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


### Adding data to the pinecone database

In [65]:
def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = embedding_field,
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object
        embeddings: The embeddings to upsert
        dataset: The dataset containing the metadata
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    """
    print("Upserting the embeddings to the Pinecone index...")
    shape = embeddings.shape

    ids = [str(i) for i in range(shape[0])]
    meta = [{text_field: text} for text in dataset[text_field]]

    # create list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeddings, meta))

    for i in tqdm(range(0, shape[0], batch_size)):
        i_end = min(i + batch_size, shape[0])
        index.upsert(vectors=to_upsert[i:i_end])
    return index


In [66]:
# Upsert the embeddings to the Pinecone index
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 1/1 [00:00<00:00,  1.36it/s]


In [67]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 40}},
 'total_vector_count': 40}

## LLM Test

In [68]:
import cohere

questions_and_answers = [
    ("Who visited Newton Stewart to inspect the damage?", "First Minister Nicola Sturgeon"),
    ("Which town's businesses and householders were affected by the River Cree overflow?", "Newton Stewart"),
    ("What was Jeanette Tate's opinion about the multi-agency response to the flood?", "She said she could not fault the multi-agency response once the flood hit."),
    ("What specific issue did Jeanette Tate highlight regarding flood prevention?", "She said more preventative work could have been carried out to ensure the retaining wall did not fail."),
    ("What did Alex Rowley emphasize as important for flood protection plans?", "He said it was important to get the flood protection plan right but backed calls to speed up the process."),
    ("What specific location was mentioned as having commercial properties flooded?", "Victoria Street - the main shopping thoroughfare"),
    ("Which area remains under a flood alert due to constant rain?", "The Borders"),
]
#First lets write a query for the LLM
for query, answer in questions_and_answers:
  co = cohere.Client(api_key=COHERE_API_KEY)
  response = co.chat(
          model='command-r-plus',
          message=query,
      )
  print(query)
  print(response.text)
  print("\n")

Who visited Newton Stewart to inspect the damage?
On August 17, 2017, Prince Charles visited Newton Stewart to inspect the damage caused by severe flooding in the area. The flooding had occurred a few weeks prior, on June 30, and had devastated the town, with many homes and businesses affected. The prince met with local residents, business owners, and emergency services personnel to offer his support and hear about their experiences during the flooding. He also viewed the ongoing recovery efforts and praised the resilience and community spirit of those affected. The visit brought attention and support to the town as it worked to recover and rebuild from the damage caused by the floods.


Which town's businesses and householders were affected by the River Cree overflow?
The town of Stonehaven was affected by the River Cree overflow.


What was Jeanette Tate's opinion about the multi-agency response to the flood?
Jeanette Tate, the leader of the local council, was critical of the multi-a

In [69]:
def augment_prompt(
        query: str,
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        index=None,
) -> str:
    """
    Augment the prompt with the top 3 results from the knowledge base
    Args:
        query: The query to augment
        index: The vectorstore object
    Returns:
        str: The augmented prompt
    """
    results = [float(val) for val in list(model.encode(query))]

    # get top 3 results from knowledge base
    query_results = index.query(
        vector=results,
        top_k=3,
        include_values=True,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['document'] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

### Pausing execution due to limit of 10 api requests per minute

In [70]:
import time
time.sleep(60)
print("1-minute break is over.")


1-minute break is over.


In [71]:
questions_and_answers = [
    ("Who visited Newton Stewart to inspect the damage?", "First Minister Nicola Sturgeon"),
    ("Which town's businesses and householders were affected by the River Cree overflow?", "Newton Stewart"),
    ("What was Jeanette Tate's opinion about the multi-agency response to the flood?", "She said she could not fault the multi-agency response once the flood hit."),
    ("What specific issue did Jeanette Tate highlight regarding flood prevention?", "She said more preventative work could have been carried out to ensure the retaining wall did not fail."),
    ("What did Alex Rowley emphasize as important for flood protection plans?", "He said it was important to get the flood protection plan right but backed calls to speed up the process."),
    ("What specific location was mentioned as having commercial properties flooded?", "Victoria Street - the main shopping thoroughfare"),
    ("Which area remains under a flood alert due to constant rain?", "The Borders"),
]
for query, expected_answer in questions_and_answers:
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
    response = co.chat(
            model='command-r-plus',
            message=augmented_prompt,
        )
    print(f"Question: {query}")
    print(f"Expected Answer: {expected_answer}")
    print(f"Predicted Answer: {response.text}")
    print("\n")

Question: Who visited Newton Stewart to inspect the damage?
Expected Answer: First Minister Nicola Sturgeon
Predicted Answer: First Minister Nicola Sturgeon visited Newton Stewart to inspect the damage.


Question: Which town's businesses and householders were affected by the River Cree overflow?
Expected Answer: Newton Stewart
Predicted Answer: Newton Stewart


Question: What was Jeanette Tate's opinion about the multi-agency response to the flood?
Expected Answer: She said she could not fault the multi-agency response once the flood hit.
Predicted Answer: Jeanette Tate, the owner of the Cinnamon Cafe, praised the multi-agency response to the flood. However, she believed that more could have been done to prevent the flood and ensure the retaining wall did not fail. She also felt that her area, Newton Stewart, had been neglected or forgotten in favor of Dumfries and the Nith.


Question: What specific issue did Jeanette Tate highlight regarding flood prevention?
Expected Answer: She sa

### Pausing execution due to limit of 10 api requests per minute

In [72]:
import time
time.sleep(60)
print("1-minute break is over.")

1-minute break is over.


## Comparison between standart QA model and RAG pipeline

In [73]:
# Example context from XSum dataset
# context = """
# The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
# Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
# Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
# Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
# First Minister Nicola Sturgeon visited the area to inspect the damage.
# The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
# Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
# However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
# "It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but it is almost like we're neglected or forgotten," she said.
# "That may not be true but it is perhaps my perspective over the last few days.
# "Why were you not ready to help us a bit more when the warning and the alarm alerts had gone out?"
# Meanwhile, a flood alert remains in place across the Borders because of the constant rain.
# Peebles was badly hit by problems, sparking calls to introduce more defences in the area.
# Scottish Borders Council has put a list on its website of the roads worst affected and drivers have been urged not to ignore closure signs.
# The Labour Party's deputy Scottish leader Alex Rowley was in Hawick on Monday to see the situation first hand.
# He said it was important to get the flood protection plan right but backed calls to speed up the process.
# "I was quite taken aback by the amount of damage that has been done," he said.
# "Obviously it is heart-breaking for people who have been forced out of their homes and the impact on businesses."
# He said it was important that "immediate steps" were taken to protect the areas most vulnerable and a clear timetable put in place for flood prevention plans.
# Have you been affected by flooding in Dumfries and Galloway or the Borders? Tell us about your experience of the situation and how it was handled. Email us on selkirk.news@bbc.co.uk or dumfries@bbc.co.uk.
# """

# Define sample questions and expected answers
questions_and_answers = [
    ("Who visited Newton Stewart to inspect the damage?", "First Minister Nicola Sturgeon"),
    ("Which town's businesses and householders were affected by the River Cree overflow?", "Newton Stewart"),
    ("What was Jeanette Tate's opinion about the multi-agency response to the flood?", "She said she could not fault the multi-agency response once the flood hit."),
    ("What specific issue did Jeanette Tate highlight regarding flood prevention?", "She said more preventative work could have been carried out to ensure the retaining wall did not fail."),
    ("What did Alex Rowley emphasize as important for flood protection plans?", "He said it was important to get the flood protection plan right but backed calls to speed up the process."),
    ("What specific location was mentioned as having commercial properties flooded?", "Victoria Street - the main shopping thoroughfare"),
    ("Which area remains under a flood alert due to constant rain?", "The Borders"),
]

# Test the QA model and the RAG pipeline on the sample questions
for question, expected_answer in questions_and_answers:
    print(f"Question: {question}")
    print(f"Expected Answer: {expected_answer}")

    # Test the QA model
    predicted_answer, is_correct, score = test_qa_model(context, question, expected_answer)
    print(f"QA Model Predicted Answer: {predicted_answer}")
    print(f"Score: {score}")

    # Test the RAG pipeline
    rag_answer = test_rag_pipeline(question, model, index)
    print(f"RAG Pipeline Answer: {rag_answer}")

    print("\n")


Question: Who visited Newton Stewart to inspect the damage?
Expected Answer: First Minister Nicola Sturgeon
QA Model Predicted Answer: Alex Rowley
Score: 0.6503791809082031
RAG Pipeline Answer: First Minister Nicola Sturgeon visited Newton Stewart to inspect the damage.


Question: Which town's businesses and householders were affected by the River Cree overflow?
Expected Answer: Newton Stewart
QA Model Predicted Answer: Newton Stewart
Score: 0.6543781757354736
RAG Pipeline Answer: Newton Stewart.


Question: What was Jeanette Tate's opinion about the multi-agency response to the flood?
Expected Answer: She said she could not fault the multi-agency response once the flood hit.
QA Model Predicted Answer: she could not fault
Score: 0.30986663699150085
RAG Pipeline Answer: Jeanette Tate, the owner of the Cinnamon Cafe, praised the multi-agency response to the flood. However, she believed that more could have been done to prevent the flood and ensure the retaining wall did not fail. She al