In [2]:
import os
import logging
import torch
import PyPDF2
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from rouge_score import rouge_scorer
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import Replicate
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.base import VectorStoreRetriever

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")

def extract_text_from_pdf(pdf_path):
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

try:
    # Load the PDF file
    pdf_path = "example.pdf"
    text_data = extract_text_from_pdf(pdf_path)
    logging.info("PDF file loaded and text extracted successfully.")
except Exception as e:
    logging.error(f"Error loading PDF file: {e}")
    raise

try:
    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    all_splits = text_splitter.split_text(text_data)
    logging.info("Text data split into chunks successfully.")
except Exception as e:
    logging.error(f"Error splitting text data: {e}")
    raise

try:
    # Initialize embeddings
    model_name = "sentence-transformers/all-mpnet-base-v2"
    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': False}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs)
    logging.info("Embeddings initialized successfully.")
except Exception as e:
    logging.error(f"Error initializing embeddings: {e}")
    raise

try:
    # Create the vector store
    vectorstore = FAISS.from_texts(texts=all_splits, embedding=hf)
    retriever = VectorStoreRetriever(vectorstore=vectorstore)
    logging.info("Vector store created successfully.")
except Exception as e:
    logging.error(f"Error creating vector store: {e}")
    raise

# Set Replicate API token
if not replicate_api_token:
    raise ValueError("Replicate API token is not set. Please set the API token.")
os.environ["REPLICATE_API_TOKEN"] = replicate_api_token
logging.info("Replicate API token set successfully.")
logging.info(f"Replicate API token: {os.environ.get('REPLICATE_API_TOKEN')}")

try:
    # Initialize the language model
    llm = Replicate(
        model="a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
        input={"temperature": 0.75, "max_length": 500, "top_p": 1}
    )
    logging.info("Language model initialized successfully.")
except Exception as e:
    logging.error(f"Error initializing language model: {e}")
    raise

try:
    # Define the prompt template
    template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, try to make up an answer that sounds very professional.
Use a maximum of three sentences and keep the answer as concise as possible.
Always say "Thank you for asking" at the end of the answer, and be as professional as you can.
{context}
Question: {question}
Helpful Answer:"""

    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # Create the RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
    )
    logging.info("RetrievalQA chain created successfully.")
except Exception as e:
    logging.error(f"Error creating RetrievalQA chain: {e}")
    raise

# Function to generate answers using RAG
def generate_answer(question):
    return qa_chain.invoke(question)['result']

# Function to align tokens with padding or trimming
def align_tokens(actual, generated):
    actual_tokens = actual.split()
    generated_tokens = generated.split()
    
    max_len = max(len(actual_tokens), len(generated_tokens))
    
    # Pad or trim tokens to the same length
    if len(actual_tokens) < max_len:
        actual_tokens.extend(['<PAD>'] * (max_len - len(actual_tokens)))
    elif len(actual_tokens) > max_len:
        actual_tokens = actual_tokens[:max_len]
    
    if len(generated_tokens) < max_len:
        generated_tokens.extend(['<PAD>'] * (max_len - len(generated_tokens)))
    elif len(generated_tokens) > max_len:
        generated_tokens = generated_tokens[:max_len]
    
    return actual_tokens, generated_tokens

# Function to compute evaluation metrics
def compute_metrics(actual_answers, generated_answers):
    em_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    rouge_scores = []
    disambig_f1_scores = []

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base")

    for actual, generated in zip(actual_answers, generated_answers):
        # Exact Match
        em_scores.append(actual == generated)
        
        # Token-level Precision, Recall, F1
        aligned_actual, aligned_generated = align_tokens(actual, generated)
        
        precision = precision_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        recall = recall_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        f1 = f1_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
        # ROUGE Scores
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge = scorer.score(actual, generated)
        rouge_scores.append(rouge)
        
        # Disambig-F1
        inputs = tokenizer.encode_plus(actual, generated, return_tensors="pt")
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        disambig_f1 = probs[0][1].item()
        disambig_f1_scores.append(disambig_f1)
    
    metrics = {
        "Exact Match (EM)": sum(em_scores) / len(em_scores),
        "Precision": sum(precision_scores) / len(precision_scores),
        "Recall": sum(recall_scores) / len(recall_scores),
        "F1 Score": sum(f1_scores) / len(f1_scores),
        "ROUGE": rouge_scores,
        "Disambig-F1": sum(disambig_f1_scores) / len(disambig_f1_scores)
    }
    
    return metrics

# Function to test RAG with questions from CSV and compute metrics
def test_rag_with_csv(csv_file_path):
    # Load the CSV file
    df = pd.read_csv(csv_file_path)
    
    # Ensure CSV has 'Question' and 'Answer' columns
    if 'Question' not in df.columns or 'Answer' not in df.columns:
        raise ValueError("CSV file must contain 'Question' and 'Answer' columns.")
    
    # Generate answers using RAG and compare with actual answers
    actual_answers = []
    generated_answers = []
    for index, row in df.iterrows():
        question = row['Question']
        actual_answer = row['Answer']
        
        generated_answer = generate_answer(question)
        
        actual_answers.append(actual_answer)
        generated_answers.append(generated_answer)
    
    # Compute metrics
    metrics = compute_metrics(actual_answers, generated_answers)
    return metrics

# Example usage
csv_file_path = "question_answers.csv"
metrics = test_rag_with_csv(csv_file_path)
print(metrics)



2024-06-13 19:01:32,609 - INFO - Using device: cuda


2024-06-13 19:01:33,192 - INFO - PDF file loaded and text extracted successfully.
2024-06-13 19:01:33,196 - INFO - Text data split into chunks successfully.
2024-06-13 19:01:33,197 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2024-06-13 19:01:34,010 - INFO - Embeddings initialized successfully.
2024-06-13 19:01:35,341 - INFO - Vector store created successfully.
2024-06-13 19:01:35,342 - INFO - Replicate API token set successfully.
2024-06-13 19:01:35,343 - INFO - Replicate API token: r8_5Igc3NjlWeNQeOIk5BBMfhaGp5vmzsl3LuZ3a
2024-06-13 19:01:35,344 - INFO - Language model initialized successfully.
2024-06-13 19:01:35,350 - INFO - RetrievalQA chain created successfully.
2024-06-13 19:01:35,941 - INFO - HTTP Request: GET https://api.replicate.com/v1/models/a16z-infra/llama13b-v2-chat "HTTP/1.1 200 OK"
2024-06-13 19:01:36,045 - INFO - HTTP Request: GET https://api.replicate.com/v1/models/meta/llama-2-13b-chat/versions/df7690f1994d94e96ad9d568eac121a

{'Exact Match (EM)': 0.0, 'Precision': 0.0030936582859621507, 'Recall': 0.0030936582859621507, 'F1 Score': 0.0030936582859621507, 'ROUGE': [{'rouge1': Score(precision=0.2727272727272727, recall=1.0, fmeasure=0.42857142857142855), 'rougeL': Score(precision=0.2727272727272727, recall=1.0, fmeasure=0.42857142857142855)}, {'rouge1': Score(precision=0.17647058823529413, recall=0.6363636363636364, fmeasure=0.27631578947368424), 'rougeL': Score(precision=0.1092436974789916, recall=0.3939393939393939, fmeasure=0.17105263157894737)}, {'rouge1': Score(precision=0.3508771929824561, recall=0.5, fmeasure=0.41237113402061853), 'rougeL': Score(precision=0.3157894736842105, recall=0.45, fmeasure=0.37113402061855666)}, {'rouge1': Score(precision=0.2159090909090909, recall=0.48717948717948717, fmeasure=0.2992125984251969), 'rougeL': Score(precision=0.13636363636363635, recall=0.3076923076923077, fmeasure=0.1889763779527559)}, {'rouge1': Score(precision=0.1308411214953271, recall=0.6666666666666666, fmea