<a href="https://colab.research.google.com/github/RicardoPoleo/DeepLearning_FactChecker/blob/main/notebooks/Agents/WebService_Agent_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Dependencies
# Python packages for the embedding model and the FastAPI/uvicorn web service.
!pip install sentence-transformers torch transformers datasets fastapi uvicorn
# localtunnel is installed globally via npm; the last cell runs its `lt` command
# against port 8000.
!npm install -g localtunnel

In [None]:
#@title Define the model
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import logging

# Configure the root logger at INFO level so the agent's load/encode status
# messages (logged with logging.info) are actually emitted.
logging.basicConfig(level=logging.INFO)

class InformationRetrievalAgent:
    """Retrieve the evidence passages most similar to a query.

    On construction the agent loads a SentenceTransformer model, reads the
    ``evidence`` column of a CSV file and pre-computes one embedding per
    evidence text. ``retrieve_evidence`` then ranks those embeddings by
    cosine similarity against the encoded query.

    Args:
        model_path: Model name or path passed to ``SentenceTransformer``.
        evidence_file: Path or URL handed to ``pandas.read_csv``; the file
            must contain an ``evidence`` column.
        top_n: Maximum number of evidence texts returned per query.

    Raises:
        Exception: Any error raised while loading the model, reading the
            CSV or encoding the evidence is logged and re-raised unchanged.
    """

    def __init__(self, model_path, evidence_file, top_n=5):
        self.top_n = top_n  # Parameterize the number of top evidence pieces to retrieve

        try:
            # NOTE(review): trust_remote_code=True lets the model repository
            # run its own Python code at load time; only pass trusted models.
            self.model = SentenceTransformer(model_path, trust_remote_code=True)
            logging.info(f"Model loaded successfully from {model_path}.")
        except Exception as e:
            logging.error(f"Failed to load the model from {model_path}: {e}")
            raise

        try:
            # Load the evidence from CSV file
            self.evidence_df = pd.read_csv(evidence_file)
            # Missing cells are read as NaN floats, which are not text;
            # skip them so only real strings reach the encoder.
            self.evidence_texts = (
                self.evidence_df['evidence'].dropna().astype(str).tolist()
            )
            logging.info("Evidence data loaded successfully.")
        except Exception as e:
            logging.error(f"Failed to load evidence from {evidence_file}: {e}")
            raise

        try:
            # Encode the evidence texts once; every query reuses these embeddings
            self.evidence_embeddings = self.model.encode(self.evidence_texts, convert_to_tensor=True)
            logging.info("Evidence texts encoded successfully.")
        except Exception as e:
            # The original message lacked the f-prefix and logged a literal "{e}"
            logging.error(f"Failed to encode evidence texts: {e}")
            raise

    @staticmethod
    def _build_query(keywords):
        """Return *keywords* as one query string (iterables are space-joined)."""
        if isinstance(keywords, str):
            return keywords
        return " ".join(str(keyword) for keyword in keywords)

    def retrieve_evidence(self, keywords):
        """Return up to ``top_n`` evidence texts ranked by similarity.

        Args:
            keywords: The query, either as a single string or as an iterable
                of keyword strings. An iterable is joined into one query so
                that every keyword contributes to the ranking; previously a
                list produced one embedding per keyword and only the first
                keyword's similarities were used.

        Returns:
            A list of evidence texts, most similar first. An empty list is
            returned when the query is empty/blank or cannot be encoded.
        """
        try:
            query = self._build_query(keywords)
            if not query.strip():
                # Nothing to search for
                return []
            # Encode the whole query as a single embedding
            query_embedding = self.model.encode(query, convert_to_tensor=True)
        except Exception as e:
            logging.error(f"Failed to encode keywords: {e}")
            return []

        # Compute cosine similarities: one row (the query) against all evidence
        similarities = cos_sim(query_embedding, self.evidence_embeddings)

        # Indices of the top N most similar evidence texts, as plain ints
        top_n_indices = similarities[0].argsort(descending=True)[:self.top_n].tolist()

        # Retrieve the top N evidence texts
        return [self.evidence_texts[idx] for idx in top_n_indices]


# The agent is created in the same cell as its class for convenience; ideally
# this would live in a cell of its own.
# The evidence CSV is fetched from a public URL so no local upload is needed.
evidence_pathfile = "https://github.com/RicardoPoleo/DeepLearning_FactChecker/raw/main/datasets/healthver_only_evidence.csv"

# Positional arguments follow the constructor order: model_path, evidence_file.
ir_agent = InformationRetrievalAgent(
    'fine-tuned/NFCorpus-256-24-gpt-4o-2024-05-13-203779',
    evidence_pathfile,
    top_n=5,
)

In [None]:
#@title Start the Web Service
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import subprocess
import threading

# FastAPI application object; the route below registers itself on it and
# start_uvicorn() serves it.
app = FastAPI()

# Request body schema for POST /retrieve_evidence.
class RequestModel(BaseModel):
    # Free-text query; the endpoint splits it on whitespace into keywords.
    text: str

@app.post("/retrieve_evidence")
def retrieve_evidence(request: RequestModel):
    # Break the submitted text into whitespace-separated keywords, ask the
    # module-level agent for matching evidence, and wrap the resulting list
    # under the "evidence" key of the JSON response.
    return {"evidence": ir_agent.retrieve_evidence(request.text.split())}

def start_uvicorn():
    """Serve ``app`` with uvicorn on all interfaces, port 8000 (blocking call)."""
    uvicorn.run(app, port=8000, host="0.0.0.0")

# Free the port before starting the server
# (shell magic: kills whatever process currently holds TCP port 8000, e.g. a
# server left over from a previous run of this cell).
!fuser -k 8000/tcp

# Run uvicorn in a background thread so this cell can continue and open the
# tunnel while the server keeps serving.
thread = threading.Thread(target=start_uvicorn)
thread.start()

# Launch the localtunnel CLI against port 8000 and echo each line it writes to
# stdout. This loop runs until the `lt` process closes its stdout, so the cell
# stays busy for as long as the tunnel is up.
# NOTE(review): the printed output presumably includes the public tunnel URL — confirm.
process = subprocess.Popen(["lt", "--port", "8000"], stdout=subprocess.PIPE)
for line in process.stdout:
    print(line.decode().strip())