In [1]:
!pip install transformers
!pip install sentence_transformers
!pip install datasets
!pip install pinecone
!pip install cohere

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [2]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
import pandas as pd
from IPython.display import display
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Read the API keys from the environment instead of committing them to the notebook.
# SECURITY: the keys that used to be hardcoded in this cell were published together
# with the notebook and must be revoked/rotated in the Cohere and Pinecone consoles.
# Set them before starting the kernel (or via os.environ[...] = getpass.getpass()).
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

# print() is used on purpose: the imports cell silences the `warnings` module.
for _key_name, _key_value in (("COHERE_API_KEY", COHERE_API_KEY),
                              ("PINECONE_API_KEY", PINECONE_API_KEY)):
    if not _key_value:
        print(f"WARNING: environment variable {_key_name} is not set; "
              f"calls that need it will fail to authenticate.")

# 1. Embedding model

In [4]:
# Same import as in the imports cell at the top of the notebook (redundant here).
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used to embed both the generated
# sentences and, later on, the user queries.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Instantiating the model fetches its files when needed (see the progress bars in the output).
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# 2. VectorDB

### 2.1 - Embedding

In [21]:
def load_convert_embedd_data(csv_name, model, rec_num="all"):
  """Turn a Champions League stats table into sentences and embed them.

  Reads ``<csv_name>.csv``, writes two English sentences per row (one about
  games played / wins / draws / losses, one about goal difference / points)
  and encodes the sentences with ``model``.

  Args:
    csv_name: Path of the CSV file *without* the ``.csv`` extension. The file
      must contain the columns ``Team``, ``M.``, ``W``, ``D``, ``L``, ``Dif``
      and ``Pt.`` (a missing column raises ``KeyError``).
    model: Object exposing ``encode(list_of_str)``, e.g. a SentenceTransformer.
    rec_num: Number of sentences to keep and embed, or ``"all"`` / ``None``
      to use every sentence.

  Returns:
    A ``(dataset, embeddings)`` tuple. ``dataset`` is a DataFrame with a single
    ``text`` column and ``embeddings`` is whatever ``model.encode`` returns.
    Both cover the same sentences, so row ``i`` of ``dataset`` always
    corresponds to ``embeddings[i]`` (previously ``dataset`` was returned
    untruncated when ``rec_num`` was given).
  """
  df = pd.read_csv(csv_name + '.csv')

  # Pull every needed column once instead of doing seven .iloc lookups per row.
  columns = ('Team', 'M.', 'W', 'D', 'L', 'Dif', 'Pt.')
  rows = zip(*(df[column].tolist() for column in columns))

  converted_data = []
  for team, played, wins, draws, losses, goal_diff, tot_points in rows:
    converted_data.append(
        f"{team} played {played} games in total in the champions league. including {wins} wins, {draws} draws and {losses} losses.")
    converted_data.append(
        f"{team} has a goal difference of {goal_diff} and they collected {tot_points} points in total")

  dataset = pd.DataFrame({'text': converted_data})

  # Keep the texts and their embeddings aligned when only a prefix is requested.
  if rec_num is not None and rec_num != "all":
    dataset = dataset.iloc[:rec_num]

  # encode() is documented for str / list[str]; hand it a plain list, not a Series.
  embeddings = model.encode(dataset['text'].tolist())

  return dataset, embeddings

In [22]:
# Build the sentence dataset from UCL_AllTime_Performance_Table.csv and embed
# every sentence (rec_num is left at its default, "all").
dataset, embeddings = load_convert_embedd_data("UCL_AllTime_Performance_Table",
                                               model=model)
# Kept for later: shape[1] is passed as the index dimension when the Pinecone index is created.
shape = embeddings.shape

In [78]:
# Preview the first five generated sentences.
dataset.head(5)

Unnamed: 0,text
0,Real Madrid played 486 games in total in the c...
1,Real Madrid has a goal difference of 533 and t...
2,Bayern Munich played 388 games in total in the...
3,Bayern Munich has a goal difference of 427 and...
4,FC Barcelona played 341 games in total in the ...


### 2.2 - Creating the database

In [32]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        api_key: str = None,
        cloud: str = "aws",
        region: str = "us-east-1",
):
    """
    Create a serverless pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        api_key: The Pinecone API key; when None the module-level PINECONE_API_KEY is used
        cloud: The cloud provider passed to ServerlessSpec
        region: The cloud region passed to ServerlessSpec
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    # Pinecone / ServerlessSpec come from the notebook's imports cell; no need to
    # re-import them on every call.
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY if api_key is None else api_key)

    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [34]:
# Name of the Pinecone index that stores the sentence embeddings.
INDEX_NAME = 'ucl'
# shape[1] is the second dimension of the embeddings array computed earlier;
# it is passed as the index dimension.
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


In [37]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (as returned by ``pc.Index(name)``); only its
            ``upsert(vectors=...)`` method is used
        embeddings: The embeddings to upsert, one row per vector
        dataset: The dataset containing the metadata (a DataFrame or any mapping whose
            ``text_field`` entry is a sequence of texts aligned with ``embeddings``)
        text_field: The column/key of ``dataset`` holding the texts; also used as the metadata key
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If ``batch_size`` is not positive or ``dataset`` holds fewer texts than
            there are embeddings
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("Upserting the embeddings to the Pinecone index...")
    num_vectors = embeddings.shape[0]
    texts = list(dataset[text_field])

    # zip() would silently drop the embeddings that have no text and the loop below
    # would then send empty batches, so fail loudly instead. Extra texts are ignored.
    if len(texts) < num_vectors:
        raise ValueError(
            f"dataset[{text_field!r}] has {len(texts)} entries but there are "
            f"{num_vectors} embeddings"
        )

    # create list of (id, vector, metadata) tuples to be upserted;
    # vectors are sent as plain lists of Python floats rather than numpy rows
    to_upsert = [
        (str(position), np.asarray(vector).tolist(), {text_field: text})
        for position, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    for start in tqdm(range(0, num_vectors, batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [38]:
# Upsert the embeddings to the Pinecone index
# Get a handle to the index by name, then upload every embedding together with
# its source sentence as metadata.
index = pc.Index(INDEX_NAME)
# upsert_vectors returns the same index object it was given.
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 6/6 [00:02<00:00,  2.12it/s]


In [39]:
# Sanity check: display the index statistics (dimension and vector counts) after the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 708}},
 'total_vector_count': 708}

# 3. RAG

In [89]:
# Evaluation questions whose answers are in the embedded stats table.
query_1 = "how many draws in total does PSV Eindhoven has in the Champoins Leage?"
query_2 = "who won more games in total in the UCL - Crvena Zvezda or Celtic FC"
query_3 = "what is the goal difference of Arsenal in their entire ucl history?"

# Only three queries are defined; the list previously also referenced an undefined
# `query_4`, which raised NameError on a fresh kernel.
queris = [query_1, query_2, query_3]

### 3.1 - Classic LLM (no source knowledge)

In [46]:
import cohere

def query_classis_LLM(query):
  """Send ``query`` unchanged to Cohere's ``command-r-plus`` chat model.

  A new Cohere client is created on every call from the module-level
  ``COHERE_API_KEY``; no retrieved context is added to the message.

  Args:
    query: The question to send as the chat message.

  Returns:
    The ``text`` attribute of the chat response.
  """
  client = cohere.Client(api_key=COHERE_API_KEY)
  return client.chat(model='command-r-plus', message=query).text


# query = "how many draws in total does PSV Eindhoven has in the Champoins Leage?"


### 3.2 - Improved LLM (retrieval-augmented)

In [49]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer" = None,
        index=None,
        top_k: int = 3,
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The embedding model used to encode the query; when None an
            'all-MiniLM-L6-v2' SentenceTransformer is loaded on first use and cached
            (it is no longer instantiated when the function is defined)
        index: The vectorstore object (required)
        top_k: The number of matches to retrieve from the knowledge base
    Returns:
        tuple: (augmented prompt, source knowledge), where the source knowledge is the
        retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index: pass index=<pinecone index>")

    if model is None:
        # Load the default model lazily and remember it on the function object so
        # repeated calls do not reload it.
        model = getattr(augment_prompt, "_default_model", None)
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            augment_prompt._default_model = model

    # plain Python floats, as the query vector is sent to the index
    query_vector = [float(val) for val in model.encode(query)]

    # get top results from knowledge base; the stored vectors themselves are not
    # needed, only their metadata
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['text'] for match in query_results]  # text

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [50]:
# Shared Cohere client, used by query_improved_LLM defined in this cell.
co = cohere.Client(api_key=COHERE_API_KEY)

def query_improved_LLM(query):
  """Answer ``query`` with ``command-r-plus`` after adding retrieved context.

  The prompt is built by ``augment_prompt`` using the module-level ``model``
  and ``index``, then sent through the module-level Cohere client ``co``.

  Args:
    query: The question to answer.

  Returns:
    The ``text`` attribute of the chat response.
  """
  # augment_prompt also returns the raw source knowledge, which is not needed here.
  prompt, _ = augment_prompt(query, model=model, index=index)
  return co.chat(model='command-r-plus', message=prompt).text


### 3.3 - Comparison

In [88]:
# For every evaluation query, show the plain LLM answer next to the
# retrieval-augmented answer, followed by a separator line.
for query in queris:
  print("query: ")
  print(f"\t{query}")

  print("classic LLM answer:")
  classic_answer = query_classis_LLM(query)
  print(f"\t{classic_answer}")

  print("improved LLM answer:")
  improved_answer = query_improved_LLM(query)
  print(f"\t{improved_answer}")

  print("="*50)

query: 
	how many draws in total does PSV Eindhoven has in the Champoins Leage?
classic LLM answer:
	PSV Eindhoven has drawn a total of 32 times in the Champions League as of the 2023-24 season.
improved LLM answer:
	PSV Eindhoven has 43 draws in the Champions League.
query: 
	who won more games in total in the UCL - Crvena Zvezda or Celtic FC
classic LLM answer:
	Crvena Zvezda and Celtic FC have both won the UEFA Champions League (UCL) once. However, Celtic FC has a slightly better record in terms of total games won in the UCL. As of 2023, Celtic FC has won 103 games in the UCL, compared to Crvena Zvezda's 97 wins.
improved LLM answer:
	Celtic FC won more games in total in the UCL, with 61 wins compared to Crvena Zvezda's 48 wins.
query: 
	what is the goal difference of Arsenal in their entire ucl history?
classic LLM answer:
	As of my last update on July 2, 2024, Arsenal has a negative goal difference of -2 in their entire UEFA Champions League history. They have scored 253 goals and