In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [2]:
def generate_answer_dataframe(file_path: str) -> pd.DataFrame:
    """
    Reads a JSON file and generates a DataFrame with section information.
    
    Args:
        file_path (str): Path to the JSON file.
        
    Returns:
        pd.DataFrame: DataFrame containing section information.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    rows = []
    for name, sections in data.items():
        for number, section_name in sections.items():
            rows.append({'pdf_name': name, 'section_number': number, 'section_title': section_name, 'source_db': "manual"})

    return pd.DataFrame(rows)

In [3]:
def load_ground_truth_and_predictions(parquet_file, json_file):
    """Load your dataset and JSON predictions"""
    df = pd.read_parquet(parquet_file)
    
    manual_validation = generate_answer_dataframe(json_file)

    concatenated_pdf = pd.concat([df, manual_validation], ignore_index=True)
    
    return concatenated_pdf

In [4]:
manual_validation = generate_answer_dataframe("/home/pramos/Documents/AutoSLR/validations/regex_validation/answer_template.json")
manual_validation

Unnamed: 0,pdf_name,section_number,section_title,source_db
0,amraoui2022_splc,1,INTRODUCTION,manual
1,amraoui2022_splc,2,FROM PARTIAL KNOWLEDGE TO AN SPL,manual
2,amraoui2022_splc,3,DESIGNING AN EVOLVABLE SPL WITH PARTIAL KNOWLEDGE,manual
3,amraoui2022_splc,4,APPLICATION,manual
4,amraoui2022_splc,5,DISCUSSION,manual
...,...,...,...,...
684,Vitui2021ese,5,Case Study Results,manual
685,Vitui2021ese,6,Discussions,manual
686,Vitui2021ese,7,Threats to Validity,manual
687,Vitui2021ese,8,Related Work,manual


In [5]:
df = load_ground_truth_and_predictions("/home/pramos/Documents/AutoSLR/validations/regex_validation/results/section_grouped.parquet","/home/pramos/Documents/AutoSLR/validations/regex_validation/answer_template.json")
df

Unnamed: 0,pdf_name,section_number,section_title,source_db
0,Krishna2021,1,INTRODUCTION,specific_regex.db
1,Krishna2021,2,MOTIVATION,specific_regex.db
2,Krishna2021,3,DEFINITIONS AND PROBLEM STATEMENT,specific_regex.db
3,Krishna2021,4,BEETLE: BELLWETHER TRANSFER LEARNER,specific_regex.db
4,Krishna2021,5,OTHER TRANSFER LEARNING METHODS,specific_regex.db
...,...,...,...,...
7574,Vitui2021ese,5,Case Study Results,manual
7575,Vitui2021ese,6,Discussions,manual
7576,Vitui2021ese,7,Threats to Validity,manual
7577,Vitui2021ese,8,Related Work,manual


In [6]:
def remove_unwanted_sections_flexible(df):
    """
    Remove rows with section titles containing specific keywords
    """
    # Define keywords to remove (case-insensitive)
    keywords_to_remove = ['abstract', 'keywords', 'ccs concepts', 'references', 
                         'acm reference format']
    
    # Create a mask - keep rows that don't contain any of these keywords
    mask = True
    for keyword in keywords_to_remove:
        mask = mask & (~df['section_title'].str.lower().str.contains(keyword, na=False))
    
    # Add regex pattern for appendix variations
    # This matches: appendix, appendices, appendix a, appendix b, etc.
    appendix_pattern = r'appendi(x|ces)(\s+[a-z])?'
    mask = mask & (~df['section_title'].str.lower().str.contains(appendix_pattern, na=False, regex=True))
    
    # Filter the dataframe
    filtered_df = df[mask].copy()
    return filtered_df


In [7]:
df = remove_unwanted_sections_flexible(df)
df

  mask = mask & (~df['section_title'].str.lower().str.contains(appendix_pattern, na=False, regex=True))


Unnamed: 0,pdf_name,section_number,section_title,source_db
0,Krishna2021,1,INTRODUCTION,specific_regex.db
1,Krishna2021,2,MOTIVATION,specific_regex.db
2,Krishna2021,3,DEFINITIONS AND PROBLEM STATEMENT,specific_regex.db
3,Krishna2021,4,BEETLE: BELLWETHER TRANSFER LEARNER,specific_regex.db
4,Krishna2021,5,OTHER TRANSFER LEARNING METHODS,specific_regex.db
...,...,...,...,...
7574,Vitui2021ese,5,Case Study Results,manual
7575,Vitui2021ese,6,Discussions,manual
7576,Vitui2021ese,7,Threats to Validity,manual
7577,Vitui2021ese,8,Related Work,manual


In [16]:
def check_correct_predictions(concatenated_df):
    """
    Check how many automatic predictions are correct by comparing with manual validation
    """
    # Separate manual (ground truth) and automatic predictions
    manual_data = concatenated_df[concatenated_df['source_db'] == 'manual']
    auto_data = concatenated_df[concatenated_df['source_db'] != 'manual']
    
    # Create sets of (pdf_name, section_title) for easy comparison
    manual_sections = set(zip(manual_data['pdf_name'], manual_data['section_title']))
    auto_sections = set(zip(auto_data['pdf_name'], auto_data['section_title']))
    
    # Find correct predictions (sections that exist in both)
    correct_predictions = auto_sections.intersection(manual_sections)
    
    # Calculate metrics by source_db
    results = {}
    total_manual_sections = len(manual_sections)
    
    for source in auto_data['source_db'].unique():
        source_data = auto_data[auto_data['source_db'] == source]
        source_sections = set(zip(source_data['pdf_name'], source_data['section_title']))
        
        # True Positives: predictions that match manual validation
        true_positives = len(source_sections.intersection(manual_sections))
        
        # False Positives: predictions that don't exist in manual validation
        false_positives = len(source_sections - manual_sections)
        
        total_predictions = len(source_sections)
        
        # Precision: TP / (TP + FP) = TP / Total_Predictions
        precision = true_positives / total_predictions if total_predictions > 0 else 0
        
        # Coverage: TP / Total_Manual_Sections (how much of manual data was found)
        coverage = true_positives / total_manual_sections if total_manual_sections > 0 else 0
        
        results[source] = {
            'total_predictions': total_predictions,
            'total_manual_sections': total_manual_sections,
            'true_positives': true_positives,
            'false_positives': false_positives,
            'precision': precision,
            'coverage': coverage  # New metric!
        }
    
    return results, correct_predictions




In [17]:
results, correct_preds = check_correct_predictions(df)


In [19]:

for source, metrics in results.items():
    print(f"\nSource: {source}")
    print(f"  Total predictions: {metrics['total_predictions']}")
    print(f"  Total manual sections: {metrics['total_manual_sections']}")
    print(f"  True Positives: {metrics['true_positives']}")
    print(f"  False Positives: {metrics['false_positives']}")
    print(f"  Precision: {metrics['precision']:.2%}")
    print(f"  Coverage: {metrics['coverage']:.2%}")  # New metric!
    print("-" * 40)


Source: specific_regex.db
  Total predictions: 953
  Total manual sections: 689
  True Positives: 546
  False Positives: 407
  Precision: 57.29%
  Coverage: 79.25%
----------------------------------------

Source: voting_policy.db
  Total predictions: 688
  Total manual sections: 689
  True Positives: 479
  False Positives: 209
  Precision: 69.62%
  Coverage: 69.52%
----------------------------------------

Source: local_llms.db
  Total predictions: 535
  Total manual sections: 689
  True Positives: 378
  False Positives: 157
  Precision: 70.65%
  Coverage: 54.86%
----------------------------------------

Source: generic_regex.db
  Total predictions: 1367
  Total manual sections: 689
  True Positives: 555
  False Positives: 812
  Precision: 40.60%
  Coverage: 80.55%
----------------------------------------

Source: ORC_tag_extraction.db
  Total predictions: 750
  Total manual sections: 689
  True Positives: 605
  False Positives: 145
  Precision: 80.67%
  Coverage: 87.81%
------------

In [11]:
def detailed_comparison(concatenated_df):
    """
    Detailed comparison showing correct and incorrect predictions by PDF
    """
    manual_data = concatenated_df[concatenated_df['source_db'] == 'manual']
    auto_data = concatenated_df[concatenated_df['source_db'] != 'manual']
    
    # Group by PDF to compare
    results_by_pdf = {}
    
    for pdf_name in auto_data['pdf_name'].unique():
        # Get manual sections for this PDF
        manual_sections_for_pdf = set(
            manual_data[manual_data['pdf_name'] == pdf_name]['section_title']
        )
        
        # Get auto predictions for this PDF
        auto_sections_for_pdf = auto_data[auto_data['pdf_name'] == pdf_name]
        
        correct = []
        incorrect = []
        
        for _, row in auto_sections_for_pdf.iterrows():
            if row['section_title'] in manual_sections_for_pdf:
                correct.append((row['section_title'], row['source_db']))
            else:
                incorrect.append((row['section_title'], row['source_db']))
        
        results_by_pdf[pdf_name] = {
            'manual_sections': manual_sections_for_pdf,
            'correct_predictions': correct,
            'incorrect_predictions': incorrect
        }
    
    return results_by_pdf

# Usage
detailed_results = detailed_comparison(df)

# Print detailed results for each PDF
for pdf_name, results in detailed_results.items():
    print(f"\nPDF: {pdf_name}")
    print(f"Manual sections: {results['manual_sections']}")
    print(f"Correct predictions: {results['correct_predictions']}")
    print(f"Incorrect predictions: {results['incorrect_predictions']}")


PDF: Krishna2021
Manual sections: {'OTHER TRANSFER LEARNING METHODS', 'BEETLE: BELLWETHER TRANSFER LEARNER', 'DEFINITIONS AND PROBLEM STATEMENT', 'INTRODUCTION', 'MOTIVATION', 'EXPERIMENTAL SETUP', 'DISCUSSION', 'CONCLUSION', 'RESULTS', 'RELATED WORK', 'THREATS TO VALIDITY'}
Correct predictions: [('INTRODUCTION', 'specific_regex.db'), ('MOTIVATION', 'specific_regex.db'), ('DEFINITIONS AND PROBLEM STATEMENT', 'specific_regex.db'), ('BEETLE: BELLWETHER TRANSFER LEARNER', 'specific_regex.db'), ('OTHER TRANSFER LEARNING METHODS', 'specific_regex.db'), ('EXPERIMENTAL SETUP', 'specific_regex.db'), ('RESULTS', 'specific_regex.db'), ('DISCUSSION', 'specific_regex.db'), ('RELATED WORK', 'specific_regex.db'), ('CONCLUSION', 'specific_regex.db'), ('INTRODUCTION', 'voting_policy.db'), ('RELATED WORK', 'voting_policy.db'), ('CONCLUSION', 'voting_policy.db'), ('MOTIVATION', 'voting_policy.db'), ('DEFINITIONS AND PROBLEM STATEMENT', 'voting_policy.db'), ('BEETLE: BELLWETHER TRANSFER LEARNER', 'votin