# Epigrams Andreas Rhoby

In [None]:
import os
import re
import pandas as pd
import torch
from docx import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Language-identification checkpoint used to filter out German commentary.
model_name = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./.cache')
model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir='./.cache')

# Map class index -> language code using the table shipped in the checkpoint's
# own config instead of a hand-written one.  A hard-coded table can silently
# disagree with the classifier head (this checkpoint's model card lists 20
# languages, including Greek 'el'), which would mislabel predictions or raise
# KeyError on lookup.
label_mapping = dict(model.config.id2label)

def detect_language(text):
    """Return the language code the classifier assigns to ``text``.

    The text is tokenized (truncated to at most 512 tokens), passed through the
    module-level ``model`` with gradient tracking disabled, and the index of
    the highest-probability class is looked up in ``label_mapping``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=1)
    best_class = probabilities.argmax(dim=1).item()
    return label_mapping[best_class]

def is_german(text):
    """Report whether ``detect_language`` classifies ``text`` as German (``"de"``)."""
    return detect_language(text) == "de"

def extract_epigram_id(text):
    """Return the leading ``"Nr. <id>"`` label of ``text``, or ``None``.

    The label must sit at the very start of the string: ``Nr.``, at least one
    whitespace character, then a run of ASCII letters, digits or hyphens.  A
    closing parenthesis directly after the id is tolerated but not returned.
    """
    found = re.match(r'^(Nr\.\s+[A-Za-z0-9\-]+)\)?', text)
    if found is None:
        return None
    return found.group(1)

def remove_line_nrs(text):
    """Strip surrounding whitespace and a leading ``<digits><TAB>`` line number.

    Only a number at the very start of the stripped text is removed; anything
    after the first tab is returned untouched.
    """
    stripped = text.strip()
    prefix = re.match(r'^\d+\t', stripped)
    if prefix is None:
        return stripped
    return stripped[prefix.end():]

def is_greek(text):
    """Return ``True`` if ``text`` contains at least one Greek character.

    A character counts as Greek when it lies in U+0370-U+03FF or
    U+1F00-U+1FFF.
    """
    return re.search(r'[\u0370-\u03FF\u1F00-\u1FFF]', text) is not None

def clean_document_title(filename):
    """Turn a source file name into a bare document title.

    Removes a trailing ``.docx`` extension and a leading ``Rhoby_<digits>-``
    prefix.

    Args:
        filename: File name (not a path), e.g. ``"Rhoby_12-foo.docx"``.

    Returns:
        The cleaned title, e.g. ``"foo"``.
    """
    suffix = '.docx'
    # Strip only the extension at the end; a blanket str.replace would also
    # delete ".docx" occurring in the middle of the name.
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]
    return re.sub(r'^Rhoby_\d+-', '', filename)

def process_docx(file_path, work_title, doc_title):
    """Collect the relevant paragraphs of one .docx file into a DataFrame.

    Each non-empty paragraph has its leading line number removed.  A paragraph
    that starts with an ``Nr. <id>`` label sets the current epigram id; that id
    is attached to it and to every following paragraph until the next label.
    Before the first label only paragraphs containing Greek characters are
    kept (with an empty id).

    Returns a DataFrame with columns ``Text``, ``NR``, ``work_title`` and
    ``doc_title``; it has no rows when nothing was kept.
    """
    columns = ['Text', 'NR', 'work_title', 'doc_title']
    rows = []
    current_nr = None

    for paragraph in Document(file_path).paragraphs:
        line = remove_line_nrs(paragraph.text.strip())
        if not line:
            continue

        heading_nr = extract_epigram_id(line)
        if heading_nr:
            current_nr = heading_nr

        # A fresh heading always sets current_nr, so checking current_nr covers
        # both the "new heading" and the "inside an epigram" cases.
        if current_nr or is_greek(line):
            rows.append([line, current_nr, work_title, doc_title])

    if rows:
        return pd.DataFrame(rows, columns=columns)
    return pd.DataFrame(columns=columns)

# Walk the input tree, convert every .docx into one CSV per document, and
# write the result to the output folder.  Existing outputs are never
# overwritten, so the cell can be re-run to resume an interrupted conversion.
input_folder = "rhoby"
output_folder = "rhoby_processed"
os.makedirs(output_folder, exist_ok=True)

for root, dirs, files in sorted(os.walk(input_folder), key=lambda x: x[0]):
    # The containing directory name serves as the work (volume) title.
    work_title = os.path.basename(root)

    for file in sorted(files):
        if not file.endswith(".docx"):
            continue

        file_path = os.path.join(root, file)
        doc_title = clean_document_title(file)
        output_file = os.path.join(output_folder, f"{work_title}_{doc_title}.csv")

        # Check for an existing result *before* parsing: opening and scanning
        # the document is wasted work when the CSV is already there.
        if os.path.exists(output_file):
            continue

        try:
            df = process_docx(file_path, work_title, doc_title)

            if df.empty:
                continue

            # Drop paragraphs the language classifier labels as German.
            df = df[~df['Text'].apply(is_german)]

            # One row per epigram: join its paragraphs with newlines.
            # NOTE: groupby excludes rows whose NR is missing by default, so
            # text collected before the first "Nr." heading is not written.
            df_grouped = df.groupby(['NR', 'work_title', 'doc_title'], as_index=False).agg({'Text': lambda x: '\n'.join(x)})

            # Keep only the part of each text before the first "——" separator.
            df_grouped['Text'] = df_grouped['Text'].str.split("——").str[0]

            df_grouped.to_csv(output_file, index=False)

        except Exception as e:
            # Best-effort batch run: report the failing file and carry on.
            print(f"Error processing {file}: {e}")

print("Processing complete. All processed files saved to rhoby_processed folder.")


Rename the volumes and add epigram group/ID columns

In [None]:
# Load the combined epigram table and replace the raw folder names in
# `work_title` with readable volume names.
df = pd.read_csv('~/Downloads/final_epigrams_rhoby.csv')

replacements = {
    'A_steine_publikation': 'band3_stein',
    # Was 'ban4_...': typo, inconsistent with the other 'band<N>_' names.
    'B_epigramme_iv_rhoby': 'band4_illuminierten_handschriften',
    'C_frmo_varia_habilfwf': 'band1_fresken_mosaiken',
    'D_rhoby2_bandfertig': 'band2_ikonen_kleinkunst'
}

df['work_title'] = df['work_title'].replace(replacements)

# Discard rows in which every column is empty.
df = df.dropna(how='all')

def split_nr(nr):
    """Split an ``"Nr. <letters><digits>"`` label into ``(group, id)``.

    Returns a tuple of the letter prefix (``None`` when there is none) and the
    number as ``int``.  Non-string input, or a string in which the pattern is
    not found, yields ``(None, None)``.
    """
    if isinstance(nr, str):
        parsed = re.search(r'Nr\.\s*([A-Za-z]*)(\d+)', nr)
        if parsed is not None:
            prefix, digits = parsed.groups()
            # An empty prefix is reported as None rather than "".
            return (prefix or None), int(digits)
    return None, None

# Parse every NR label with split_nr; wrapping the returned 2-tuple in a
# pd.Series makes apply() expand it into two columns, which are stored as
# `epigram_group` and `epigram_id`.
df[['epigram_group', 'epigram_id']] = df['NR'].apply(lambda x: pd.Series(split_nr(x)))

# Write the renamed and extended table next to the input file (no index column).
df.to_csv('~/Downloads/final_epigrams_updated_volnames.csv',index=False)

Print missing epigram numbers

In [None]:
import pandas as pd
import re
from IPython.display import display

# Report, per (work_title, doc_title), which epigram ids are absent from the
# range spanned by the smallest and largest id found in that document.
df = pd.read_csv('~/Downloads/final_epigrams_updated_volnames.csv')

# Rows without a parsed numeric id cannot take part in the gap check; show
# them before they are removed.
invalid_rows = df.loc[df['epigram_id'].isna()]
if len(invalid_rows) > 0:
    print("\nRows to be dropped due to invalid NR values:")
    display(invalid_rows)

df = (
    df.dropna(subset=['epigram_id'])
      .astype({'epigram_id': int})
      .sort_values(by=['work_title', 'doc_title'])
)

missing_total = 0

for (work_title, doc_title), group in df.groupby(['work_title', 'doc_title']):
    present = set(group['epigram_id'].tolist())
    min_id, max_id = min(present), max(present)

    # Every id inside [min_id, max_id] that does not occur in this document.
    missing_numbers = [n for n in range(min_id, max_id + 1) if n not in present]
    if not missing_numbers:
        continue

    print(f"\nWork Title: {work_title}")
    print(f"Document Title: {doc_title}")
    print(f"Epigram ID Range: {min_id}-{max_id}")
    print(f"Numbers Missing: {', '.join(map(str, missing_numbers))}\n")
    missing_total += len(missing_numbers)

print(f"\nTotal Missing Numbers: {missing_total}")
print(f"Dataframe Shape: {df.shape}")
