In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt
from rapidfuzz import process, fuzz

from sklearn.metrics import f1_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')
import re

In [None]:
# Load the publication-level results
df = pd.read_csv('./irr_publication.csv')


def _format_float(value):
    """Render floats with exactly two decimals; return any other value unchanged.

    Note that this pads rather than trims (0.5 -> "0.50") and that NaN, being a
    float, is rendered as the string "nan".
    """
    return f"{value:.2f}" if isinstance(value, float) else value


# Apply the formatter element-wise, one column at a time. Series.map is used
# instead of DataFrame.applymap because applymap is deprecated since pandas 2.1
# (the warning is hidden by the notebook's blanket warnings filter).
df = df.apply(lambda column: column.map(_format_float))

latex_table = df.to_latex(index=False)
print(latex_table)

In [28]:
# Load the per-industry / per-size results that feed the LaTeX longtable
df = pd.read_csv('./irr_industry_size.csv')

# Sort by Industry, then by Size in the logical order small < medium < large.
# An ordered categorical keeps the sort from falling back to alphabetical order.
size_order = ['small', 'medium', 'large']
df = (
    df.assign(Size=pd.Categorical(df['Size'], categories=size_order, ordered=True))
      .sort_values(['Industry', 'Size'])
)

# Header row shared by the first-page and continuation-page headers.
# A bare `\\` inside a `c` column terminates the table row, so the two-line
# header cells are wrapped in \shortstack to keep the line break inside the cell.
header_row = (
    "Industry & Size & \\shortstack{Company \\\\ F1} & Company Accuracy & "
    "\\shortstack{Breach \\\\ F1} & Breach Accuracy & Sample Size \\\\ \n"
)

# Collect the table as a list of fragments and join once at the end.
latex_lines = [
    "\\begin{longtable}{|c|c|c|c|c|c|c|}\n",
    "\\hline\n",
    header_row,
    "\\hline\n",
    "\\endfirsthead\n",  # Header for the first page
    "\\hline\n",
    header_row,
    "\\hline\n",
    "\\endhead\n",  # Header for subsequent pages
    "\\hline\n",
    "\\endfoot\n",  # Footer
]

# Characters in industry names that must be escaped for LaTeX
latex_escapes = str.maketrans({"&": "\\&", "%": "\\%", "_": "\\_", "#": "\\#"})

# Columns printed after the Industry column, in table order
value_columns = ['Size', 'Company F1', 'Company Accuracy',
                 'Breach F1', 'Breach Accuracy', 'Sample Size']

# Rows are sorted by Industry, so a group starts where the industry differs from
# the previous row and ends where it differs from the next row.
industries = df['Industry']
starts_group = industries.ne(industries.shift())
ends_group = industries.ne(industries.shift(-1))

for (_, row), is_first, is_last in zip(df.iterrows(), starts_group, ends_group):
    # The industry name is shown only on the first row of its group
    industry_cell = row['Industry'].translate(latex_escapes) if is_first else ""
    cells = " & ".join(f"{row[col]}" for col in value_columns)
    latex_lines.append(f"{industry_cell} & {cells} \\\\ \n")

    # Full rule after the last row of an industry, partial rule (columns 2-7) otherwise
    latex_lines.append("\\hline\n" if is_last else "\\cline{2-7}\n")

latex_table = "".join(latex_lines) + "\\end{longtable}"

# Print or save the LaTeX table
print(latex_table)


\begin{longtable}{|c|c|c|c|c|c|c|}
\hline
Industry & Size & Company \\ F1 & Company Accuracy & Breach \\ F1 & Breach Accuracy & Sample Size \\ 
\hline
\endfirsthead
\hline
Industry & Size & Company \\ F1 & Company Accuracy & Breach \\ F1 & Breach Accuracy & Sample Size \\ 
\hline
\endhead
\hline
\endfoot
accounting & small & 0.67 & 0.5 & 0.0 & 1.0 & 2 \\ 
\cline{2-7}
 & medium & nan & nan & nan & nan & 0 \\ 
\cline{2-7}
 & large & 1.0 & 1.0 & 1.0 & 1.0 & 1 \\ 
\hline
airlines/aviation & small & 1.0 & 1.0 & 1.0 & 1.0 & 2 \\ 
\cline{2-7}
 & medium & nan & nan & nan & nan & 0 \\ 
\cline{2-7}
 & large & nan & nan & nan & nan & 0 \\ 
\hline
apparel \& fashion & small & 0.4 & 0.25 & 1.0 & 1.0 & 4 \\ 
\cline{2-7}
 & medium & nan & nan & nan & nan & 0 \\ 
\cline{2-7}
 & large & nan & nan & nan & nan & 0 \\ 
\hline
arts and crafts & small & 0.67 & 0.5 & 0.0 & 1.0 & 2 \\ 
\cline{2-7}
 & medium & nan & nan & nan & nan & 0 \\ 
\cline{2-7}
 & large & nan & nan & nan & nan & 0 \\ 
\hline
automotive 