In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from typing import List, Dict, Set, Tuple
import re
from collections import Counter

In [2]:
# Source CSV for the evaluation below; the usage cell reads its 'text' and
# 'LLM_cleaned_text' columns.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
DATA_PATH = "/home/snp2453/DBMS/train_v3_drcat_01_llm_cleaned_noisy.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,Noisy_text,LLM_cleaned_text
0,"Dear Principal,\n\nI think Policy 1 is the way...",1,Cell phones at school,llama_70b_v1,False,"Dear Principal,\n\nI think Policy 1 is the way...","Dear Principal,\n\nI think Policy 1 is the way..."
1,\n\nTitle: Imagining Alternative Transportatio...,1,Car-free cities,Intel-neural-chat-7b-v3-1_LLMEssays_v1,True,\n\nTitle: Imagining Alternative Transportatio...,Imagining Alternative Transportation for Futur...
2,Introduction:\nThe Face on Mars is a well-kno...,1,The Face on Mars,Mistral7B_CME_v7,True,Introduc5d@-alp3$@(tion: &lt;\nThe Face on Mar...,Introduction: The Face on Mars is a well-known...
3,"Dear State Senator,\n\nThe Electoral College h...",0,Does the electoral college work?,persuade_corpus,True,"Dear State Senator, &gt;\n\nThe Electoral Coll...","Dear State Senator,\n\nThe Electoral College h..."
4,Summer assignment is a term most students woul...,0,Summer projects,persuade_corpus,False,Summer assignment is a term most students woul...,Summer assignment is a term most students woul...
...,...,...,...,...,...,...,...
995,Title: Riding the Waves: A Testimonial from a...,1,"""A Cowboy Who Rode the Waves""",Mistral7B_CME_v7,True,Title: Riding the Waves: A Testimonial from a...,Title: Riding the Waves: A Testimonial from a ...
996,Distracted Drivers\n\nMany people claim having...,0,Phones and driving,persuade_corpus,False,Ã±Dîstracted ﻿Dri​vers </div>\n\nMany people ...,Distracted Drivers\n\nMany people claim having...
997,Unmasking the Face on Mars has always been an ...,0,The Face on Mars,persuade_corpus,True,Unmasking the Face on Mars has always been an ...,Unmasking the Face on Mars has always been an ...
998,Are you up tight and tense? Well this may be d...,0,Car-free cities,train_essays,True,â€™Are you up tight and tense? Well this may b...,Are you up tight and tense Well this may be du...


In [None]:
class TextEvaluator:
    """Scores generated text against a gold-standard text via set overlap.

    Both texts are lowercased, stripped of punctuation and split on
    whitespace. The resulting words (or word n-grams) are compared as
    *sets*, so a repeated word or n-gram counts only once.
    """

    # Keys present in every metrics dictionary returned by this class.
    _METRIC_KEYS = ('precision', 'recall', 'f1')

    @staticmethod
    def preprocess_text(text: str) -> List[str]:
        """
        Normalizes text into a list of lowercase, punctuation-free words.

        Args:
            text (str): Input text to preprocess

        Returns:
            List[str]: Words in their original order; empty list if the text
                contains no word characters
        """
        # Characters that are neither word characters (\w: letters, digits,
        # underscore) nor whitespace are deleted outright rather than replaced
        # by a space, so "it's" becomes "its".
        cleaned = re.sub(r'[^\w\s]', '', text.lower())

        # str.split() without a separator never yields empty strings, so no
        # extra filtering is needed.
        return cleaned.split()

    @staticmethod
    def get_ngrams(words: List[str], n: int) -> Set[Tuple[str, ...]]:
        """
        Builds the set of distinct contiguous n-grams in a list of words.

        Args:
            words (List[str]): List of words
            n (int): Size of n-grams; must be at least 1

        Returns:
            Set[Tuple[str, ...]]: Distinct n-grams; empty if there are fewer
                than ``n`` words

        Raises:
            ValueError: If ``n`` is smaller than 1
        """
        # n == 0 would yield {()} for every input (making any two texts look
        # identical) and negative n produces meaningless slices.
        if n < 1:
            raise ValueError("n must be a positive integer")

        return {tuple(words[i:i + n]) for i in range(len(words) - n + 1)}

    def calculate_metrics(self,
                         generated_text: str,
                         gold_text: str,
                         method: str = 'exact',
                         ngram_size: int = 1) -> Dict[str, float]:
        """
        Calculates precision, recall, and F1 score between generated and gold text.

        Args:
            generated_text (str): The text generated by the LLM
            gold_text (str): The gold standard reference text
            method (str): 'exact' compares the sets of unique words;
                'partial' compares the sets of unique word n-grams
            ngram_size (int): Size of n-grams for 'partial' matching
                (default: 1); ignored for 'exact'

        Returns:
            Dict[str, float]: Dictionary with keys 'precision', 'recall' and
                'f1'. A score whose denominator is zero (e.g. empty text) is
                reported as 0.0.

        Raises:
            ValueError: If ``method`` is neither 'exact' nor 'partial', or if
                ``ngram_size`` is smaller than 1 for 'partial' matching
        """
        # Reject an unknown method before doing any preprocessing work.
        if method not in ('exact', 'partial'):
            raise ValueError("Method must be either 'exact' or 'partial'")

        gen_words = self.preprocess_text(generated_text)
        gold_words = self.preprocess_text(gold_text)

        # Both methods reduce each text to a set of comparable items; only
        # the item type differs (single words vs. n-gram tuples).
        if method == 'exact':
            gen_items = set(gen_words)
            gold_items = set(gold_words)
        else:
            gen_items = self.get_ngrams(gen_words, ngram_size)
            gold_items = self.get_ngrams(gold_words, ngram_size)

        true_positives = len(gen_items & gold_items)
        false_positives = len(gen_items - gold_items)
        false_negatives = len(gold_items - gen_items)

        predicted = true_positives + false_positives
        expected = true_positives + false_negatives

        # Fall back to 0.0 (a float, matching the return annotation) whenever
        # a denominator would be zero.
        precision = true_positives / predicted if predicted > 0 else 0.0
        recall = true_positives / expected if expected > 0 else 0.0
        score_sum = precision + recall
        f1 = 2 * (precision * recall) / score_sum if score_sum > 0 else 0.0

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    def evaluate_corpus(self,
                       generated_texts: List[str],
                       gold_texts: List[str],
                       method: str = 'exact',
                       ngram_size: int = 1) -> Dict[str, float]:
        """
        Evaluates a corpus of generated texts against gold standard texts.

        Each generated/gold pair is scored with ``calculate_metrics`` and the
        per-pair scores are averaged with equal weight.

        Args:
            generated_texts (List[str]): List of generated texts
            gold_texts (List[str]): List of gold standard texts
            method (str): Evaluation method - 'exact' or 'partial'
            ngram_size (int): Size of n-grams for partial matching

        Returns:
            Dict[str, float]: Dictionary containing averaged metrics; all
                values are 0.0 when the corpus is empty

        Raises:
            ValueError: If the two lists differ in length, or if
                ``calculate_metrics`` rejects ``method``/``ngram_size``
        """
        if len(generated_texts) != len(gold_texts):
            raise ValueError("Number of generated and gold texts must match")

        # Calculate metrics for each pair
        all_metrics = [
            self.calculate_metrics(gen, gold, method, ngram_size)
            for gen, gold in zip(generated_texts, gold_texts)
        ]

        # An empty corpus has nothing to average; report zeros instead of
        # failing with a ZeroDivisionError.
        if not all_metrics:
            return {key: 0.0 for key in self._METRIC_KEYS}

        pair_count = len(all_metrics)
        return {
            key: sum(m[key] for m in all_metrics) / pair_count
            for key in self._METRIC_KEYS
        }

In [12]:
# Score the LLM-cleaned essays against the original essays.
evaluator = TextEvaluator()

# Every cell is coerced with str() so the evaluator only ever receives strings.
# NOTE(review): a missing value (NaN) would become the literal text "nan" here -
# confirm whether either column contains missing values.
generated_corpus = list(map(str, df['LLM_cleaned_text'].tolist()))
gold_corpus = list(map(str, df['text'].tolist()))

# Bigram set overlap, averaged over all essay pairs.
corpus_metrics = evaluator.evaluate_corpus(
    generated_corpus,
    gold_corpus,
    method='partial',
    ngram_size=2,
)
print("Corpus-level metrics:", corpus_metrics)

Corpus-level metrics: {'precision': 0.9725995868360269, 'recall': 0.9467114492701354, 'f1': 0.9533539401776795}
