In [12]:
import PyPDF2
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub

# Function to load PDF using PdfReader
def load_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Load the PDF
pdf_text = load_pdf('example.pdf')

# Convert the loaded text to Document objects
documents = [Document(page_content=pdf_text, metadata={"source": "PDF"})]

# Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the documents
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


In [16]:
# Function to generate answers using RAG
def generate_answer(question):
    return rag_chain.invoke(question)

# Function to align tokens with padding or trimming
def align_tokens(actual, generated):
    actual_tokens = actual.split()
    generated_tokens = generated.split()
    
    max_len = max(len(actual_tokens), len(generated_tokens))
    
    # Pad or trim tokens to the same length
    if len(actual_tokens) < max_len:
        actual_tokens.extend(['<PAD>'] * (max_len - len(actual_tokens)))
    elif len(actual_tokens) > max_len:
        actual_tokens = actual_tokens[:max_len]
    
    if len(generated_tokens) < max_len:
        generated_tokens.extend(['<PAD>'] * (max_len - len(generated_tokens)))
    elif len(generated_tokens) > max_len:
        generated_tokens = generated_tokens[:max_len]
    
    return actual_tokens, generated_tokens

# Function to compute evaluation metrics
def compute_metrics(actual_answers, generated_answers):
    em_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    rouge_scores = []
    disambig_f1_scores = []

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base")

    for actual, generated in zip(actual_answers, generated_answers):
        # Exact Match
        em_scores.append(actual == generated)
        
        # Token-level Precision, Recall, F1
        aligned_actual, aligned_generated = align_tokens(actual, generated)
        
        precision = precision_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        recall = recall_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        f1 = f1_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
        # ROUGE Scores
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge = scorer.score(actual, generated)
        rouge_scores.append(rouge)
        
        # Disambig-F1
        inputs = tokenizer.encode_plus(actual, generated, return_tensors="pt")
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        disambig_f1 = probs[0][1].item()
        disambig_f1_scores.append(disambig_f1)
    
    metrics = {
        "Exact Match (EM)": sum(em_scores) / len(em_scores),
        "Precision": sum(precision_scores) / len(precision_scores),
        "Recall": sum(recall_scores) / len(recall_scores),
        "F1 Score": sum(f1_scores) / len(f1_scores),
        "ROUGE": rouge_scores,
        "Disambig-F1": sum(disambig_f1_scores) / len(disambig_f1_scores)
    }
    
    return metrics

# Function to test RAG with questions from CSV and compute metrics
def test_rag_with_csv(csv_file_path):
    # Load the CSV file
    df = pd.read_csv(csv_file_path)
    
    # Ensure CSV has 'Question' and 'Answer' columns
    if 'Question' not in df.columns or 'Answer' not in df.columns:
        raise ValueError("CSV file must contain 'Question' and 'Answer' columns.")
    
    # Generate answers using RAG and compare with actual answers
    actual_answers = []
    generated_answers = []
    for index, row in df.iterrows():
        question = row['Question']
        actual_answer = row['Answer']
        
        generated_answer = generate_answer(question)
        
        actual_answers.append(actual_answer)
        generated_answers.append(generated_answer)
    
    # Compute metrics
    metrics = compute_metrics(actual_answers, generated_answers)
    return metrics



In [17]:

# Test the RAG model with questions and answers from the CSV file
csv_file_path = 'question_answers.csv'  # Replace with your CSV file path
metrics = test_rag_with_csv(csv_file_path)

# Display the metrics
print(metrics)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'Exact Match (EM)': 0.0, 'Precision': 0.02884253359436745, 'Recall': 0.02884253359436745, 'F1 Score': 0.02884253359436745, 'ROUGE': [{'rouge1': Score(precision=0.3125, recall=0.18518518518518517, fmeasure=0.2325581395348837), 'rougeL': Score(precision=0.25, recall=0.14814814814814814, fmeasure=0.18604651162790697)}, {'rouge1': Score(precision=0.25, recall=0.5454545454545454, fmeasure=0.34285714285714286), 'rougeL': Score(precision=0.1527777777777778, recall=0.3333333333333333, fmeasure=0.20952380952380956)}, {'rouge1': Score(precision=0.2597402597402597, recall=0.5, fmeasure=0.34188034188034183), 'rougeL': Score(precision=0.15584415584415584, recall=0.3, fmeasure=0.20512820512820512)}, {'rouge1': Score(precision=0.22950819672131148, recall=0.358974358974359, fmeasure=0.27999999999999997), 'rougeL': Score(precision=0.11475409836065574, recall=0.1794871794871795, fmeasure=0.13999999999999999)}, {'rouge1': Score(precision=0.4666666666666667, recall=0.6666666666666666, fmeasure=0.54901960