In [17]:
import PyPDF2
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub

# Function to load PDF using PdfReader
def load_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Load the PDF
pdf_text = load_pdf('example.pdf')

# Convert the loaded text to Document objects
documents = [Document(page_content=pdf_text, metadata={"source": "PDF"})]

# Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the documents
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Simulated FLARE mechanism
def flare_retrieve(question):
    initial_result = retriever.vectorstore.similarity_search(question)
    
    # Here, we assume a simplistic confidence mechanism where we re-query if the initial response is short or lacking context.
    if len(initial_result[0].page_content) < 100:  # This threshold can be adjusted
        follow_up_query = f"Can you provide more details about: {question}"
        detailed_query = llm(follow_up_query, context=initial_result)
        return retriever.vectorstore.similarity_search(detailed_query)
    return initial_result

# Function to generate answers using RAG with FLARE mechanism
def generate_answer(question):
    results = flare_retrieve(question)
    return " ".join([doc.page_content for doc in results])

# Function to align tokens with padding or trimming
def align_tokens(actual, generated):
    actual_tokens = actual.split()
    generated_tokens = generated.split()
    
    max_len = max(len(actual_tokens), len(generated_tokens))
    
    # Pad or trim tokens to the same length
    if len(actual_tokens) < max_len:
        actual_tokens.extend(['<PAD>'] * (max_len - len(actual_tokens)))
    elif len(actual_tokens) > max_len:
        actual_tokens = actual_tokens[:max_len]
    
    if len(generated_tokens) < max_len:
        generated_tokens.extend(['<PAD>'] * (max_len - len(generated_tokens)))
    elif len(generated_tokens) > max_len:
        generated_tokens = generated_tokens[:max_len]
    
    return actual_tokens, generated_tokens

# Function to compute evaluation metrics
def compute_metrics(actual_answers, generated_answers):
    em_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    rouge_scores = []
    disambig_f1_scores = []

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base")

    for actual, generated in zip(actual_answers, generated_answers):
        # Exact Match
        em_scores.append(actual == generated)
        
        # Token-level Precision, Recall, F1
        aligned_actual, aligned_generated = align_tokens(actual, generated)
        
        precision = precision_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        recall = recall_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        f1 = f1_score(aligned_actual, aligned_generated, average='micro', zero_division=1)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
        # ROUGE Scores
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge = scorer.score(actual, generated)
        rouge_scores.append(rouge)
        
        # Disambig-F1
        inputs = tokenizer.encode_plus(actual, generated, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        disambig_f1 = probs[0][1].item()
        disambig_f1_scores.append(disambig_f1)
    
    metrics = {
        "Exact Match (EM)": sum(em_scores) / len(em_scores),
        "Precision": sum(precision_scores) / len(precision_scores),
        "Recall": sum(recall_scores) / len(recall_scores),
        "F1 Score": sum(f1_scores) / len(f1_scores),
        "ROUGE": rouge_scores,
        "Disambig-F1": sum(disambig_f1_scores) / len(disambig_f1_scores)
    }
    
    return metrics

# Function to test RAG with questions from CSV and compute metrics
def test_rag_with_csv(csv_file_path):
    # Load the CSV file
    df = pd.read_csv(csv_file_path)
    
    # Ensure CSV has 'Question' and 'Answer' columns
    if 'Question' not in df.columns or 'Answer' not in df.columns:
        raise ValueError("CSV file must contain 'Question' and 'Answer' columns.")
    
    # Generate answers using RAG and compare with actual answers
    actual_answers = []
    generated_answers = []
    for index, row in df.iterrows():
        question = row['Question']
        actual_answer = row['Answer']
        
        generated_answer = generate_answer(question)
        
        actual_answers.append(actual_answer)
        generated_answers.append(generated_answer)
    
    # Compute metrics
    metrics = compute_metrics(actual_answers, generated_answers)
    return metrics

csv_file_path = "question_answers.csv"
metrics = test_rag_with_csv(csv_file_path)
print(metrics)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty 

{'Exact Match (EM)': 0.0, 'Precision': 0.0006016544280409443, 'Recall': 0.0006016544280409443, 'F1 Score': 0.0006016544280409443, 'ROUGE': [{'rouge1': Score(precision=0.024844720496894408, recall=0.5925925925925926, fmeasure=0.04769001490312965), 'rougeL': Score(precision=0.023291925465838508, recall=0.5555555555555556, fmeasure=0.044709388971684055)}, {'rouge1': Score(precision=0.026836158192090395, recall=0.5757575757575758, fmeasure=0.05128205128205128), 'rougeL': Score(precision=0.022598870056497175, recall=0.48484848484848486, fmeasure=0.043184885290148446)}, {'rouge1': Score(precision=0.04239766081871345, recall=0.725, fmeasure=0.08011049723756906), 'rougeL': Score(precision=0.03654970760233918, recall=0.625, fmeasure=0.06906077348066297)}, {'rouge1': Score(precision=0.029239766081871343, recall=0.5128205128205128, fmeasure=0.055325034578146616), 'rougeL': Score(precision=0.02631578947368421, recall=0.46153846153846156, fmeasure=0.049792531120331954)}, {'rouge1': Score(precision=