<a href="https://colab.research.google.com/github/RicardoPoleo/DeepLearning_FactChecker/blob/main/notebooks/Agents/ModelAgentB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install Dependencies
!pip install sentence-transformers torch transformers datasets

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/224.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m194.6/224.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import logging

# Setup basic configuration for logging
logging.basicConfig(level=logging.INFO)

class InformationRetrievalAgent:
    def __init__(self, model_path, evidence_file, top_n=5):
        self.top_n = top_n  # Parameterize the number of top evidence pieces to retrieve

        try:
            # Load the model with an option to trust remote code which is necessary for some advanced models
            self.model = SentenceTransformer(model_path, trust_remote_code=True)
            logging.info(f"Model loaded successfully from {model_path}.")
        except Exception as e:
            logging.error(f"Failed to load the model from {model_path}: {e}")
            raise

        try:
            # Load the evidence from CSV file
            self.evidence_df = pd.read_csv(evidence_file)
            self.evidence_texts = self.evidence_df['evidence'].tolist()
            logging.info("Evidence data loaded successfully.")
        except Exception as e:
            logging.error(f"Failed to load evidence from {evidence_file}: {e}")
            raise

        try:
            # Encode the evidence texts
            self.evidence_embeddings = self.model.encode(self.evidence_texts, convert_to_tensor=True)
            logging.info("Evidence texts encoded successfully.")
        except Exception as e:
            logging.error("Failed to encode evidence texts: {e}")
            raise

    def retrieve_evidence(self, keywords):
        try:
            # Encode the keywords
            keywords_embedding = self.model.encode(keywords, convert_to_tensor=True)
        except Exception as e:
            logging.error(f"Failed to encode keywords: {e}")
            return []

        # Compute cosine similarities
        similarities = cos_sim(keywords_embedding, self.evidence_embeddings)

        # Get the top N most similar evidence
        top_n_indices = similarities[0].argsort(descending=True)[:self.top_n]

        # Retrieve the top N evidence texts
        top_evidence = [self.evidence_texts[idx] for idx in top_n_indices]

        return top_evidence


# Adding in this same cell just to make it easier, however, we should do it in another cell
# Using a public URL for easy access
evidence_pathfile = "https://github.com/RicardoPoleo/DeepLearning_FactChecker/raw/main/datasets/healthver_only_evidence.csv"
ir_agent = InformationRetrievalAgent(
    model_path='fine-tuned/NFCorpus-256-24-gpt-4o-2024-05-13-203779',
    evidence_file=evidence_pathfile,
    top_n=5
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fine-tuned/NFCorpus-256-24-gpt-4o-2024-05-13-203779:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fine-tuned/NFCorpus-256-24-gpt-4o-2024-05-13-203779:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/549M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [3]:
# Interacting with the agent
keywords = "Vitamin C cures COVID-19"
evidence = ir_agent.retrieve_evidence(keywords)
print("Retrieved evidence:", evidence)


Retrieved evidence: ['BCG may offer protection from COVID-19.', 'BCG may offer protection from COVID-19.', 'BCG may offer protection from COVID-19.', 'BCG may offer protection from COVID-19.', 'BCG may offer protection from COVID-19.']
