In [None]:
!pip install lancedb sentence-transformers openai PyPDF2 nltk transformers seaborn pandas matplotlib numpy renumics-spotlight


In [None]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize

import torch

# Path of the PDF that is chunked and embedded further down; replace the placeholder before running.
PATH_TO_SOURCE_PDF = "PUT YOUR PATH TO YOUR PDF HERE"

# Choose the device at runtime instead of hard-coding Apple's "mps" backend, so the
# notebook also runs on CUDA machines and on CPU-only machines.
if torch.backends.mps.is_available():
    EMBEDDING_DEVICE = "mps"
elif torch.cuda.is_available():
    EMBEDDING_DEVICE = "cuda"
else:
    EMBEDDING_DEVICE = "cpu"

# Initialize SentenceTransformer model
model = SentenceTransformer('BAAI/bge-m3').to(EMBEDDING_DEVICE)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted page texts joined with newlines, in page order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # `or ''` is a defensive guard in case a page yields no text at all.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    # Separate pages with a newline: without a separator the last word of a page is
    # fused with the first word of the next one, which corrupts sentence splitting.
    return '\n'.join(page_texts)

# Function to split text into chunks of approximately 10 sentences
def create_chunks(text, sentences_per_chunk=10):
    """Split ``text`` into sentences and group consecutive sentences into chunks.

    Args:
        text: The text to split.
        sentences_per_chunk: Number of sentences per chunk; the final chunk holds
            whatever sentences remain.

    Returns:
        A list of strings, each being the sentences of one chunk joined by a space.
    """
    all_sentences = sent_tokenize(text)
    # A chunk is flushed as soon as it holds at least `sentences_per_chunk` sentences,
    # so a non-positive size degenerates to one sentence per chunk.
    step = max(1, sentences_per_chunk)
    return [
        " ".join(all_sentences[start:start + step])
        for start in range(0, len(all_sentences), step)
    ]

# Function to create embeddings for chunks
def create_chunk_embeddings(chunks):
    """Embed every chunk with the module-level ``model``.

    Args:
        chunks: The chunk texts to embed.

    Returns:
        A list of ``(chunk, embedding)`` pairs in the order of ``chunks``.
    """
    vectors = model.encode(chunks, show_progress_bar=True)
    return [(chunk_text, vector) for chunk_text, vector in zip(chunks, vectors)]

# Connect to the LanceDB database stored under this relative path
LANCEDB_URI = "data/lancedb"
db = lancedb.connect(LANCEDB_URI)

# Define the schema
# Row type of the chunk table: one text chunk together with its embedding vector.
class ChunkEmbedding(LanceModel):
    # Text of the chunk
    chunk: str
    # Embedding of the chunk; the vector length is read from the loaded embedding model
    embedding: Vector(model.get_sentence_embedding_dimension())

# Create or open the table
table_name = "masterarbeit_chunks"
table_created = False
if table_name in db.table_names():
    table = db.open_table(table_name)
else:
    table = db.create_table(table_name, schema=ChunkEmbedding)
    table_created = True

# Ingest whenever the table holds no rows. Checking the row count (rather than whether the
# table was created in this run) also recovers from an earlier run that created the table
# but failed before inserting anything, e.g. because the PDF path was still the placeholder.
if table.count_rows() == 0:
    # sent_tokenize needs the NLTK punkt data; fetch it only when it is not installed yet.
    # Newer NLTK releases look for "punkt_tab", older ones for "punkt".
    for resource in ("punkt", "punkt_tab"):
        try:
            nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            nltk.download(resource, quiet=True)

    # Process the PDF
    text = extract_text_from_pdf(PATH_TO_SOURCE_PDF)
    chunks = create_chunks(text)
    if not chunks:
        # Nothing to embed or insert; fail with a clear message instead of an obscure downstream error
        raise ValueError(f"No text could be extracted from {PATH_TO_SOURCE_PDF!r}")
    chunk_embeddings = create_chunk_embeddings(chunks)

    print(len(chunk_embeddings))

    # Insert data into the table
    data = [{"chunk": chunk, "embedding": emb.tolist()} for chunk, emb in chunk_embeddings]
    table.add(data)
else:
    print("Chunk table already exists and is not empty. Skipping chunk creation.")

In [None]:
# Instantiate the reranker
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import torch

# Choose the device at runtime instead of hard-coding Apple's "mps" backend, so the
# reranker also loads on CUDA machines and on CPU-only machines.
if torch.backends.mps.is_available():
    RERANKER_DEVICE = "mps"
elif torch.cuda.is_available():
    RERANKER_DEVICE = "cuda"
else:
    RERANKER_DEVICE = "cpu"

reranker_tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
reranker_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3').to(RERANKER_DEVICE)
# The model is only used for scoring, so switch it to evaluation mode
reranker_model.eval()

In [3]:
# Evaluation set: category name -> list of question/answer pairs.
# Replace the placeholder entries with real questions and their ground-truth answers.
questions_with_answers = dict(
    factoid=[
        dict(
            question="Example question 1 for factoid category?",
            answer="Example answer 1 for factoid category.",
        ),
        dict(
            question="Example question 2 for factoid category?",
            answer="Example answer 2 for factoid category.",
        ),
    ],
    paraphrased_factoid=[
        dict(
            question="Example question 1 for paraphrased factoid category?",
            answer="Example answer 1 for paraphrased factoid category.",
        ),
        dict(
            question="Example question 2 for paraphrased factoid category?",
            answer="Example answer 2 for paraphrased factoid category.",
        ),
    ],
    multi_source_question=[
        dict(
            question="Example question 1 for multi-source question category?",
            answer="Example answer 1 for multi-source question category.",
        ),
        dict(
            question="Example question 2 for multi-source question category?",
            answer="Example answer 2 for multi-source question category.",
        ),
    ],
    summary_table_question=[
        dict(
            question="Example question 1 for summary table question category?",
            answer="Example answer 1 for summary table question category.",
        ),
        dict(
            question="Example question 2 for summary table question category?",
            answer="Example answer 2 for summary table question category.",
        ),
    ],
)

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np
from typing import Literal
from pydantic import BaseModel
from openai import OpenAI
from typing import List
import seaborn as sns

# Constants for configuration
INITIAL_RESULTS = 50  # Number of initially retrieved results (the candidates handed to the reranker)
TOP_RESULTS = 10  # Number of top results compared before and after reranking
PLOT_RESULTS = True  # Toggle for plotting the per-question rank-change chart

# Initialize OpenAI client
# NOTE(review): no credentials are passed here, so the client presumably reads them from
# the environment - confirm the API key is configured before running this cell.
client = OpenAI()

class ContextEvaluation(BaseModel):
    # Structured verdict comparing the original and the reranked top contexts of one question.
    # Used as the response format of the chat completion in evaluate_contexts.

    # Information Content Changes
    added_relevant_info: bool    # new relevant information appeared in the reranked context
    lost_relevant_info: bool     # relevant information from the original context was lost
    improved_coverage: bool      # the reranked context covers more aspects of the answer

    # Ranking Changes
    better_top_position: bool    # the most relevant information moved to earlier positions
    worse_top_position: bool     # the most relevant information moved to later positions

    # Quality Changes
    reduced_redundancy: bool     # repetitive information was reduced
    more_noise: bool            # more irrelevant information was introduced

    explanation: str            # brief free-text justification of the verdict

def evaluate_contexts(query: str, answer: str, original_contexts: List[str], reranked_contexts: List[str]) -> ContextEvaluation:
    """Ask the chat model to compare the original and the reranked top contexts.

    Args:
        query: The question that was used for retrieval.
        answer: The ground-truth answer to the question.
        original_contexts: Top chunks in vector-search order, best first.
        reranked_contexts: Top chunks in reranker order, best first.

    Returns:
        The parsed ``ContextEvaluation`` produced by the model.

    Raises:
        ValueError: If the response contains no parsed evaluation (e.g. a refusal).
    """
    def _numbered(contexts: List[str]) -> str:
        # Number every chunk and separate chunks by a blank line: the model is asked about
        # position changes, which it cannot judge when all chunks are fused into one string.
        return "\n\n".join(f"[{rank}] {chunk}" for rank, chunk in enumerate(contexts, start=1))

    prompt = f"""
    Question: {query}

    Ground Truth Answer: {answer}

    Original Top {len(original_contexts)} Results (numbered by rank, best first):
    {'-' * 80}
    {_numbered(original_contexts)}
    
    Reranked Top {len(reranked_contexts)} Results (numbered by rank, best first):
    {'-' * 80}
    {_numbered(reranked_contexts)}

    Compare these contexts and evaluate the following aspects (true/false):

    1. Information Changes:
    - added_relevant_info: Did new relevant information appear in the reranked context?
    - lost_relevant_info: Was relevant information lost from the original context?
    - improved_coverage: Does the reranked context cover more aspects of the answer?

    2. Position Changes:
    - better_top_position: Did the most relevant information move to earlier positions?
    - worse_top_position: Did the most relevant information move to later positions?

    3. Quality Changes:
    - reduced_redundancy: Was repetitive information reduced?
    - more_noise: Was more irrelevant information introduced?

    Provide a brief explanation of your evaluation.

    You must respond with valid JSON matching this schema:
    {{
        "type": "object",
        "properties": {{
            "added_relevant_info": {{"type": "boolean"}},
            "lost_relevant_info": {{"type": "boolean"}},
            "improved_coverage": {{"type": "boolean"}},
            "better_top_position": {{"type": "boolean"}},
            "worse_top_position": {{"type": "boolean"}},
            "reduced_redundancy": {{"type": "boolean"}},
            "more_noise": {{"type": "boolean"}},
            "explanation": {{"type": "string"}}
        }},
        "required": ["added_relevant_info", "lost_relevant_info", "improved_coverage", 
                    "better_top_position", "worse_top_position", "reduced_redundancy", 
                    "more_noise", "explanation"]
    }}
    """

    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        response_format=ContextEvaluation,
        messages=[
            {
                "role": "system", 
                "content": "You are a helpful assistant that evaluates text relevance. Always provide responses in the specified JSON format."
            },
            {"role": "user", "content": prompt}
        ]
    )

    message = response.choices[0].message
    if message.parsed is None:
        # Without this check the caller would fail later with an opaque AttributeError on None
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"The model returned no parsable evaluation (refusal: {refusal!r})")
    return message.parsed

def calculate_effectiveness_score(eval: ContextEvaluation) -> float:
    """Collapse the boolean verdicts of an evaluation into one signed score.

    Each flag that is true contributes its weight; improvements count positively and
    regressions negatively.

    Args:
        eval: The evaluation whose flags are scored.

    Returns:
        The sum of the weights of all flags that are set.
    """
    signed_weights = (
        ("added_relevant_info", 1.0),
        ("lost_relevant_info", -1.0),
        ("improved_coverage", 0.8),
        ("better_top_position", 0.6),
        ("worse_top_position", -0.6),
        ("reduced_redundancy", 0.4),
        ("more_noise", -0.4),
    )
    return sum(getattr(eval, flag) * weight for flag, weight in signed_weights)

# Initialize a list to accumulate all question data
# visualize_ranking appends one dict per evaluated question; re-running this cell starts
# again from an empty list because the assignment below is executed again.
all_questions_data = []

def visualize_ranking(query, category):
    """Retrieve, rerank and evaluate the contexts of one question, then report the outcome.

    The question is embedded and searched in the chunk table, the hits are rescored with
    the reranker, and the top ``TOP_RESULTS`` contexts of both orderings are compared via
    ``evaluate_contexts``. The metrics are appended to ``all_questions_data``, printed, and
    (when ``PLOT_RESULTS`` is set) shown as a rank-change chart.

    Args:
        query: The question text; must be listed under ``category`` in
            ``questions_with_answers``.
        category: The question category.

    Raises:
        KeyError: If ``category`` is not a key of ``questions_with_answers``.
        ValueError: If the question is not listed in the category, or the search
            returns no rows.
    """
    print(f"Category: {category}")
    print(f"Question: {query}")
    print()

    # Resolve the ground-truth answer first so an unknown question fails with a clear
    # message before any model is invoked (previously a bare StopIteration).
    answer = next(
        (item['answer'] for item in questions_with_answers[category] if item['question'] == query),
        None,
    )
    if answer is None:
        raise ValueError(f"No answer found for question {query!r} in category {category!r}")

    # Perform the query
    query_embedding = model.encode([query])[0]
    results = table.search(query_embedding).limit(INITIAL_RESULTS).to_pandas()
    if results.empty:
        raise ValueError("The vector search returned no results; is the chunk table populated?")

    # Add original rank
    results['original_rank'] = range(1, len(results) + 1)

    # Prepare pairs for reranking
    pairs = [[query, chunk] for chunk in results['chunk']]

    # Rerank the results; the inputs are moved to whatever device the reranker lives on
    # instead of a hard-coded "mps".
    with torch.no_grad():
        inputs = reranker_tokenizer(
            pairs, padding=True, truncation=True, return_tensors='pt', max_length=512
        ).to(reranker_model.device)
        scores = reranker_model(**inputs, return_dict=True).logits.view(-1,).float()

    # Add scores to the results DataFrame
    results['rerank_score'] = scores.tolist()

    # Sort results by rerank score and add new rank
    reranked_results = results.sort_values('rerank_score', ascending=False).reset_index(drop=True)
    reranked_results['reranked_rank'] = range(1, len(reranked_results) + 1)

    # Get top contexts from both rankings
    top_reranked = reranked_results.head(TOP_RESULTS)
    original_top = results.head(TOP_RESULTS)['chunk'].tolist()
    reranked_top = top_reranked['chunk'].tolist()

    # Contexts that entered the top results through reranking. The ranks are read from the
    # row itself (no per-chunk text lookup) and stored as plain ints so they serialize cleanly.
    original_top_texts = set(original_top)
    added_contexts_with_ranks = [
        {
            'chunk': chunk,
            'new_rank': int(new_rank),
            'original_rank': int(original_rank)
        }
        for chunk, new_rank, original_rank in zip(
            top_reranked['chunk'], top_reranked['reranked_rank'], top_reranked['original_rank']
        )
        if chunk not in original_top_texts
    ]

    # Compare contexts using GPT-4
    evaluation = evaluate_contexts(query, answer, original_top, reranked_top)
    effectiveness_score = calculate_effectiveness_score(evaluation)

    # Calculate standard metrics
    rank_changes = reranked_results['original_rank'] - reranked_results['reranked_rank']
    absolute_changes = np.abs(rank_changes)
    total_rank_change = np.sum(absolute_changes)
    average_rank_change = np.mean(absolute_changes)
    max_rank_change = np.max(absolute_changes)
    num_results_changed = np.sum(rank_changes != 0)

    # Calculate neglected results metric: top results that the vector search ranked below the cut-off
    neglected_results = np.sum(top_reranked['original_rank'] > TOP_RESULTS)

    # Append data for the current question
    question_data = {
        "category": category,
        "question": query,
        "total_rank_change": total_rank_change,
        "average_rank_change": average_rank_change,
        "max_rank_change": max_rank_change,
        "num_results_changed": num_results_changed,
        "neglected_results": neglected_results,
        "added_relevant_info": evaluation.added_relevant_info,
        "lost_relevant_info": evaluation.lost_relevant_info,
        "improved_coverage": evaluation.improved_coverage,
        "better_top_position": evaluation.better_top_position,
        "worse_top_position": evaluation.worse_top_position,
        "reduced_redundancy": evaluation.reduced_redundancy,
        "more_noise": evaluation.more_noise,
        "effectiveness_score": effectiveness_score,
        "explanation": evaluation.explanation,
        "added_contexts": added_contexts_with_ranks  # New field with detailed context information
    }
    all_questions_data.append(question_data)

    if PLOT_RESULTS:
        plt.figure(figsize=(8, 6))

        # Ranks inside the top window are drawn 1:1; ranks beyond it are compressed
        # linearly by a factor of 4 so the chart stays compact.
        def transform_rank(rank):
            if rank <= TOP_RESULTS:
                return rank
            return TOP_RESULTS + (rank - TOP_RESULTS) / 4

        # Plot lines connecting original and new ranks with transformed scale
        for _, row in top_reranked.iterrows():
            orig_rank_transformed = transform_rank(row['original_rank'])
            new_rank_transformed = transform_rank(row['reranked_rank'])
            plt.plot([1, 2], [orig_rank_transformed, new_rank_transformed], '-o',
                     alpha=0.5, linewidth=1.5)

        # Customize the plot
        plt.xlim(0.8, 2.2)
        max_y = transform_rank(INITIAL_RESULTS)
        plt.ylim(max_y + 1, -1)  # Reverse y-axis so rank 1 is at the top

        # One tick per rank inside the top window, then one tick per multiple of ten
        # up to the number of retrieved results.
        first_coarse_tick = (TOP_RESULTS // 10 + 1) * 10
        y_ticks = list(range(1, TOP_RESULTS + 1)) + list(range(first_coarse_tick, INITIAL_RESULTS + 1, 10))
        y_ticks_transformed = [transform_rank(y) for y in y_ticks]
        plt.yticks(y_ticks_transformed, y_ticks)

        plt.xticks([1, 2], ['Original Rank', 'Reranked'])
        plt.ylabel('Rank')
        plt.title(f'Rank Changes for Top {TOP_RESULTS} Results\n{category}: {query}')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    # Print results
    print("Context Evaluation:")
    print(f"Effectiveness Score: {effectiveness_score:.2f}")
    print("\nEvaluation Details:")
    for key, value in evaluation.dict().items():
        if key != 'explanation':
            print(f"- {key}: {value}")
    print(f"\nExplanation: {evaluation.explanation}")
    print()

    # Print information about added contexts
    if added_contexts_with_ranks:
        print("\nNew Contexts Added to Top Results:")
        for ctx in added_contexts_with_ranks:
            print(f"- Moved from rank {ctx['original_rank']} to {ctx['new_rank']}:")
            print(f"  {ctx['chunk'][:200]}...")  # Print first 200 chars of each chunk

    print("\nAggregate Information:")
    print(f"Total absolute rank change: {total_rank_change}")
    print(f"Average absolute rank change: {average_rank_change:.2f}")
    print(f"Maximum rank change: {max_rank_change}")
    print(f"Number of results that changed rank: {num_results_changed}")
    print(f"Number of results moved into top {TOP_RESULTS}: {neglected_results}")
    print()

# Evaluate every question of every category; a ruler line separates the reports
separator = "\n" + "="*80 + "\n"
for category, questions in questions_with_answers.items():
    for item in questions:
        visualize_ranking(item['question'], category)
        print(separator)

# Collect the per-question records in a DataFrame and persist them as CSV
all_questions_df = pd.DataFrame(all_questions_data)
all_questions_df.to_csv("all_questions_ranking_with_comparison.csv", index=False)

# Boolean verdict columns produced by the context evaluation
evaluation_metrics = ['added_relevant_info', 'lost_relevant_info', 'improved_coverage',
                     'better_top_position', 'worse_top_position', 'reduced_redundancy', 'more_noise']

# Per-category summary: mean and std for the numeric metrics, share of true values for the flags
summary_spec = {
    numeric_column: ['mean', 'std']
    for numeric_column in ('neglected_results', 'average_rank_change', 'effectiveness_score')
}
summary_spec.update({flag: 'mean' for flag in evaluation_metrics})
summary_stats = all_questions_df.groupby('category').agg(summary_spec).round(2)

print("\nSummary Statistics by Category:")
print(summary_stats)


# Bar chart: how often each verdict flag was set, averaged over all questions
metric_means = all_questions_df[evaluation_metrics].mean()

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=metric_means.index, y=metric_means.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Average Evaluation Metrics Across All Questions')
fig.tight_layout()
plt.show()

# Box plot: spread of the effectiveness score within each category
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=all_questions_df, x='category', y='effectiveness_score', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Distribution of Effectiveness Scores by Category')
fig.tight_layout()
plt.show()

In [None]:
from renumics import spotlight

# Open the per-question results DataFrame in Renumics Spotlight for interactive inspection
spotlight.show(all_questions_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One figure per rank metric, with one histogram panel per question category
metrics = ['neglected_results', 'average_rank_change', 'total_rank_change']
metric_titles = ['Neglected Results', 'Average Rank Change', 'Total Rank Change']

for metric, title in zip(metrics, metric_titles):
    plt.figure(figsize=(15, 5))

    # Split the data by category once and reuse the subsets for both passes
    categories = all_questions_df['category'].unique()
    num_categories = len(categories)
    per_category = [
        (name, all_questions_df[all_questions_df['category'] == name])
        for name in categories
    ]

    # Shared axis limits so the panels are directly comparable
    x_min, x_max = all_questions_df[metric].min(), all_questions_df[metric].max()
    y_max = max([0] + [np.histogram(subset[metric], bins=5)[0].max() for _, subset in per_category])

    # Draw one panel per category
    for idx, (category, subset) in enumerate(per_category, 1):
        plt.subplot(1, num_categories, idx)
        sns.histplot(data=subset, x=metric, bins=5)
        plt.title(f'{category}')
        plt.xlabel(title)
        plt.ylabel('Count')

        plt.xlim(x_min, x_max)
        plt.ylim(0, y_max + 1)  # One extra count of headroom above the tallest bar

    plt.suptitle(f'Distribution of {title} by Category', y=1.05)
    plt.tight_layout()
    plt.show()

# Per-category mean, std, min and max of each rank metric
summary_stats = (
    all_questions_df
    .groupby('category')
    .agg({column: ['mean', 'std', 'min', 'max'] for column in metrics})
    .round(2)
)

print("\nSummary Statistics by Category:")
print(summary_stats)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects all_questions_df with the columns
# 'category', 'average_rank_change' and 'neglected_results'

# Mean of both metrics per category
means = (
    all_questions_df
    .groupby('category')[['average_rank_change', 'neglected_results']]
    .mean()
    .reset_index()
)

plt.figure(figsize=(12, 6))

# Two bars per category, placed symmetrically around the category position
bar_width = 0.35
x = np.arange(len(means['category']))
bar_series = [
    ('average_rank_change', -bar_width/2, 'Average Rank Change', 'skyblue'),
    ('neglected_results', bar_width/2, 'Neglected Results', 'lightcoral'),
]

for column, offset, label, color in bar_series:
    plt.bar(x + offset, means[column], bar_width, label=label, color=color)

# Axis labels, title, category tick labels and legend
plt.xlabel('Question Category')
plt.ylabel('Value')
plt.title('Average Rank Change and Neglected Results by Category')
plt.xticks(x, means['category'], rotation=45, ha='right')
plt.legend()

# Write each bar's value just above it
for i in x:
    for column, offset, _label, _color in bar_series:
        bar_value = means[column].iloc[i]
        plt.text(i + offset, bar_value, f'{bar_value:.1f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()