In [48]:
!pip install transformers
!pip install sentence_transformers
!pip install datasets
!pip install pinecone
!pip install cohere



In [49]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
import pandas as pd
from IPython.display import display
# Suppress every warning for the rest of the session (presumably to keep cell
# output readable); remove this line when debugging library deprecations.
warnings.filterwarnings("ignore")

In [50]:
# Secrets must never be stored in the notebook itself: they are read from the
# environment (set COHERE_API_KEY and PINECONE_API_KEY before starting Jupyter).
# The keys that used to be hardcoded here are exposed and should be revoked.
COHERE_API_KEY = os.environ.get('COHERE_API_KEY', '')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')

# Surface a missing key right away instead of as an opaque authentication
# error in a later cell.
for _key_name, _key_value in (('COHERE_API_KEY', COHERE_API_KEY),
                              ('PINECONE_API_KEY', PINECONE_API_KEY)):
    if not _key_value:
        print(f"WARNING: environment variable {_key_name} is not set; "
              f"calls that need it will fail.")

# 1. Embedding model

In [51]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used to embed text.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Shared encoder instance; later cells pass it to the data-loading and
# retrieval helpers so documents and queries are embedded by the same model.
model = SentenceTransformer(EMBEDDING_MODEL)

# 2 - Document reading and preprocessing

In [52]:
def load_convert_embedd_data(csv_name, model, rec_num="all"):
  """
  Turn every row of a team-statistics csv into two English sentences and embed them.

  The csv must contain the columns 'Team', 'M.', 'W', 'D', 'L', 'Dif' and 'Pt.'.
  For each row, the first sentence describes the match record (games played,
  wins, draws, losses) and the second one the goal difference and total points.

  Args:
    csv_name - path of the csv file WITHOUT the '.csv' extension (it is appended here)
    model - embedding model exposing `encode(list_of_sentences)`
    rec_num - number of generated sentences (two per csv row) to embed,
              or "all" to embed every sentence
  Returns:
    (dataset, embeddings) - `dataset` is a DataFrame with a single 'text' column
    holding every generated sentence (it is NOT truncated by rec_num);
    `embeddings` is what `model.encode` returns for the first `rec_num` sentences.
  """
  df = pd.read_csv(csv_name + '.csv')

  stat_columns = ['Team', 'M.', 'W', 'D', 'L', 'Dif', 'Pt.']
  converted_data = []
  # Walk the needed columns in lockstep instead of doing seven `.iloc[i]`
  # lookups per row.
  for team, played, wins, draws, losses, goal_diff, tot_points in zip(
      *(df[column] for column in stat_columns)):
    converted_data.append(
        f"{team} played {played} games in total in the champions league. including {wins} wins, {draws} draws and {losses} losses."
    )
    converted_data.append(
        f"{team} has a goal difference of {goal_diff} and they collected {tot_points} points in total"
    )

  dataset = pd.DataFrame({'text': converted_data})

  if rec_num == "all":
    rec_num = len(converted_data)
  # Pass a plain list of strings: that is the documented input of `encode`,
  # whereas a pandas Series only works while its index happens to be 0..n-1.
  embeddings = model.encode(converted_data[:rec_num])

  return dataset, embeddings

In [53]:
# Build the sentence corpus from the csv and embed every sentence with the
# shared encoder; `shape` is reused below to size the Pinecone index.
dataset, embeddings = load_convert_embedd_data(
    csv_name="UCL_AllTime_Performance_Table",
    model=model,
)
shape = embeddings.shape

In [54]:
# Preview the first five generated sentences.
dataset.head(5)

Unnamed: 0,text
0,Real Madrid played 486 games in total in the c...
1,Real Madrid has a goal difference of 533 and t...
2,Bayern Munich played 388 games in total in the...
3,Bayern Munich has a goal difference of 427 and...
4,FC Barcelona played 341 games in total in the ...


# 3 - Embedding generation and insertion into Pinecone VectorDB

In [55]:
def create_pinecone_index(index_name: str, dimension: int, metric: str = 'cosine'):
    """
    Make sure a serverless Pinecone index called `index_name` exists.

    The index is created (AWS, us-east-1) only when no index with that name is
    listed for the account; an existing index is left untouched.

    Args:
        index_name: Name of the index to look up or create
        dimension: Vector dimension used when the index has to be created
        metric: Similarity metric used when the index has to be created
    Returns:
        Pinecone: The client object, which can later be used to open the index
        and upsert vectors
    """
    from pinecone import Pinecone, ServerlessSpec

    print("Creating a Pinecone index...")
    client = Pinecone(api_key=PINECONE_API_KEY)

    known_names = {info["name"] for info in client.list_indexes()}
    if index_name not in known_names:
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        client.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=serverless,
        )

    print("Done!")
    return client

In [56]:
# Name of the Pinecone index; its dimension is the embedding width
# (second axis of the embeddings' shape).
INDEX_NAME = 'ucl'
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1])

Creating a Pinecone index...
Done!


In [57]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index handle (the object returned by `pc.Index(name)`,
            not the `Pinecone` client); only its `upsert(vectors=...)` method is used
        embeddings: 2-D array with one embedding per row
        dataset: Mapping-like object (dict or DataFrame) whose `text_field` entry
            holds the text for each embedding, in the same order
        text_field: Key/column of `dataset` to read the text from; also used as
            the metadata key stored next to each vector
        batch_size: Maximum number of vectors sent per upsert request
    Returns:
        The same index object that was passed in
    Raises:
        ValueError: If `batch_size` is smaller than 1
    """
    if batch_size < 1:
        # A negative step would make the batch loop silently upsert nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("Upserting the embeddings to the Pinecone index...")

    # Send plain lists of Python floats rather than numpy rows.
    vectors = np.asarray(embeddings).tolist()
    ids = [str(i) for i in range(len(vectors))]
    meta = [{text_field: text} for text in dataset[text_field]]

    # (id, vector, metadata) records; zip stops at the shorter input, so the
    # record count can be smaller than the number of embeddings.
    to_upsert = list(zip(ids, vectors, meta))

    # Bound the loop by the records that actually exist so that no empty
    # batch is ever sent when `dataset` has fewer rows than `embeddings`.
    for start in tqdm(range(0, len(to_upsert), batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [58]:
# Open a handle on the index and push every embedding (with its source text
# as metadata) into it.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index=index, embeddings=embeddings, dataset=dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 6/6 [00:05<00:00,  1.06it/s]


In [59]:
# Show the index statistics (dimension, vector counts) to check the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 708}},
 'total_vector_count': 708}

# 3. RAG

In [60]:
# Evaluation questions that are put to both the plain and the
# retrieval-augmented LLM below.
queris = [
    "How many draws in total does PSV Eindhoven has in the Champions League?",
    "Did Maccabi Haifa won more games than lost in the UCL?",
    "What is the goal difference of Arsenal in their entire UCL history?",
]
query_1, query_2, query_3 = queris

### 3.1 - Classic LLM (no source knowledge)

In [61]:
import cohere

def query_classis_LLM(query):
  """Send `query` unchanged to Cohere's command-r-plus chat model and return the reply text.

  A new Cohere client is created on every call; no retrieved context is added.
  """
  client = cohere.Client(api_key=COHERE_API_KEY)
  reply = client.chat(model='command-r-plus', message=query)
  return reply.text

### 3.2 - improved LLM

In [62]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 8,
) -> "tuple[str, str]":
    """
    Augment the prompt with the `top_k` best matches from the knowledge base
    Args:
        query: The query to augment
        model: Embedding model used to encode the query; it must be the model
            the index was populated with. When omitted, 'all-MiniLM-L6-v2' is
            loaded on each call (slow), so pass a shared instance if you have one
        index: The vectorstore object (Pinecone index handle); required
        top_k: Number of matches to retrieve and place in the prompt
    Returns:
        tuple[str, str]: The augmented prompt, and the retrieved source
        knowledge (the matched texts joined by blank lines)
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires an index to retrieve contexts from")
    if model is None:
        # Loaded lazily: a model instance as default argument would be built
        # at function-definition time, even when callers supply their own.
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Pinecone expects a plain list of Python floats as the query vector.
    query_vector = np.asarray(model.encode(query), dtype=float).tolist()

    # get the top_k results from the knowledge base; only the metadata is
    # needed, so the stored vectors are not fetched
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['text'] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [63]:
# One Cohere client shared by every retrieval-augmented query.
co = cohere.Client(api_key=COHERE_API_KEY)

def query_improved_LLM(query):
  """Answer `query` with command-r-plus after adding contexts retrieved from the index.

  Uses the module-level `model`, `index` and `co` objects.
  """
  # Retrieval of relevant documents (the raw source knowledge is not needed here)
  prompt_with_context, _ = augment_prompt(query, model=model, index=index)
  # Generating answers
  reply = co.chat(model='command-r-plus', message=prompt_with_context)
  return reply.text


comparison

In [64]:
# Ask every question to both variants, in the same order each time: first the
# plain LLM, then the retrieval-augmented one.
answerers = (
    ("classic LLM answer:", query_classis_LLM),
    ("improved LLM answer:", query_improved_LLM),
)
for i, query in enumerate(queris):
  print(f"query {i}: ")
  print(f"\t{query}")
  for label, ask in answerers:
    print(label)
    print(f"\t{ask(query)}")
  print("="*50)

query 0: 
	How many draws in total does PSV Eindhoven has in the Champions League?
classic LLM answer:
	31
improved LLM answer:
	PSV Eindhoven has a total of 43 draws in the Champions League.
query 1: 
	Did Maccabi Haifa won more games than lost in the UCL?
classic LLM answer:
	Yes, Maccabi Haifa has won more games than they have lost in the UEFA Champions League (UCL). In the 2023/24 season, they participated in the group stage of the UCL for the first time since the 2009/10 season. They finished third in their group, winning three games, drawing one, and losing two. Overall, in their UCL history, Maccabi Haifa has played a total of 34 games, winning 12, drawing 6, and losing 16.
improved LLM answer:
	No, Maccabi Haifa won 3 games and lost 14, so they lost more games than they won.
query 2: 
	What is the goal difference of Arsenal in their entire UCL history?
classic LLM answer:
	The goal difference for Arsenal in their entire UEFA Champions League (UCL) history up until the 2022-2023