In [184]:
from TexSoup import TexSoup
from TexSoup.data import TexNode, BraceGroup
import csv
import os
from typing import Literal, TypeAlias


In [185]:
test_file = '../../data/dot.csv'

In [None]:
from dataclasses import dataclass

@dataclass
class INBibEntry:
    id: str
    bibkey: str
    title: str
    notes: str
    crossref: str
    further_note: str

@dataclass
class ParsedBibEntry(INBibEntry):
    further_references_raw: list[str]
    depends_on_raw: list[str]

@dataclass
class ProcessedBibEntry(INBibEntry):
    further_references_good: list[str]
    further_references_bad: list[str]
    depends_on_good: list[str]
    depends_on_bad: list[str]


CitetField: TypeAlias = Literal[
    # For further_references
    "title",
    "notes",
    # for depends_on
    "crossref",
    "further_note"
]

In [None]:
def load_bibentries_csv(filename: str, encoding: str) -> list[INBibEntry]:

    if not os.path.exists(filename):
        raise FileNotFoundError(f"File '{filename}' not found")

    with open(filename, 'r', encoding=encoding) as f:
        csv_reader = csv.DictReader(f)
        rows = [
            INBibEntry(
                id=row['id'],
                bibkey=row['bibkey'],
                title=row['title'],
                notes=row['notes'],
                crossref=row['crossref'],
                further_note=row['further_note'],
            )
            for row in csv_reader
        ]

    return rows


In [188]:
def get_citet_bibkeys(row: INBibEntry, citet_field: CitetField) -> list[str]:
    data = getattr(row, citet_field)
    citet_l = TexSoup(data).find_all('citet')
    citets_raw_nested = [citet.args for citet in citet_l if isinstance(citet, TexNode)]
    citets_raw_flat = [item for sublist in citets_raw_nested for item in sublist]
    citets_s = [citet.string.split(",") for citet in citets_raw_flat if isinstance(citet, BraceGroup)]
    citets_s_flat = [item.strip() for sublist in citets_s for item in sublist]

    return citets_s_flat

In [189]:
rows = load_bibentries_csv(test_file, 'utf-16')
res = [(row.id, get_citet_bibkeys(row, "notes")) for row in rows]


In [191]:
[item for item in res if item[1] != []]

[('163608', ['russell_b:1905']),
 ('62997',
  ['frege:1882',
   'frege:1891',
   'frege:1892',
   'frege:1892a',
   'frege:1918',
   'frege:1923',
   'frege:1918a']),
 ('7445', ['austin_jl:1961']),
 ('172159', ['sen_a:1987']),
 ('72443', ['rodislewis:1987']),
 ('184232', ['strawson_pf:1971']),
 ('78074', ['apt:1985', 'bibkey:test123'])]

In [None]:
def parse_bibentry(row: INBibEntry) -> ParsedBibEntry:
    notes_bibkeys = get_citet_bibkeys(row, "notes")
    title_bibkeys = get_citet_bibkeys(row, "title")
    
    further_references_raw = notes_bibkeys + title_bibkeys

    further_notes_bibkeys = get_citet_bibkeys(row, "further_note")
    crossref_bibkeys = get_citet_bibkeys(row, "crossref")

    depends_on_raw = further_references_raw + further_notes_bibkeys + crossref_bibkeys

    return ParsedBibEntry(
        id=row.id,
        bibkey=row.bibkey,
        title=row.title,
        notes=row.notes,
        crossref=row.crossref,
        further_note=row.further_note,
        further_references_raw=further_references_raw,
        depends_on_raw=depends_on_raw
    )

In [None]:
def get_all_bibkeys(rows: list[INBibEntry]) -> list[str]:

    all_bibkeys = [
        row.bibkey
        for row in rows
    ]

    return all_bibkeys


In [None]:
def process_bibentry(parsed_bibentry: ParsedBibEntry, all_bibkeys_list: list[str]) -> ProcessedBibEntry:
    further_references_good = []
    further_references_bad = []
    depends_on_good = []
    depends_on_bad = []

    for bibkey in parsed_bibentry.further_references_raw:
        if bibkey in all_bibkeys_list:
            further_references_good.append(bibkey)
        else:
            further_references_bad.append(bibkey)

    for bibkey in parsed_bibentry.depends_on_raw:
        if bibkey in all_bibkeys_list:
            depends_on_good.append(bibkey)
        else:
            depends_on_bad.append(bibkey)

    return ProcessedBibEntry(
        id=parsed_bibentry.id,
        bibkey=parsed_bibentry.bibkey,
        title=parsed_bibentry.title,
        notes=parsed_bibentry.notes,
        crossref=parsed_bibentry.crossref,
        further_note=parsed_bibentry.further_note,
        further_references_good=further_references_good,
        further_references_bad=further_references_bad,
        depends_on_good=depends_on_good,
        depends_on_bad=depends_on_bad
    )

In [196]:
def main(filename: str, encoding: str, output_filename: str) -> None:
    rows = load_bibentries_csv(filename, encoding)
    parsed_rows = (parse_bibentry(row) for row in rows)
    all_bibkeys = get_all_bibkeys(rows)
    processed_rows = (process_bibentry(parsed_row, all_bibkeys) for parsed_row in parsed_rows)

    with open(output_filename, 'w', encoding=encoding) as f:
        writer = csv.DictWriter(f, fieldnames=ProcessedBibEntry.__annotations__.keys())
        writer.writeheader()
        writer.writerows([row.__dict__ for row in processed_rows])
    
    print(f"Processed {len(rows)} entries")

    return None


In [199]:
def process_bibentry_loop(parsed_bibentry: ParsedBibEntry, all_bibkeys_list: list[str]) -> ProcessedBibEntry:
    further_references_good = []
    further_references_bad = []
    depends_on_good = []
    depends_on_bad = []

    for bibkey in parsed_bibentry.further_references_raw:
        if bibkey in all_bibkeys_list:
            further_references_good.append(bibkey)
        else:
            further_references_bad.append(bibkey)

    for bibkey in parsed_bibentry.depends_on_raw:
        if bibkey in all_bibkeys_list:
            depends_on_good.append(bibkey)
        else:
            depends_on_bad.append(bibkey)

    return ProcessedBibEntry(
        id=parsed_bibentry.id,
        bibkey=parsed_bibentry.bibkey,
        title=parsed_bibentry.title,
        notes=parsed_bibentry.notes,
        crossref=parsed_bibentry.crossref,
        further_note=parsed_bibentry.further_note,
        further_references_good=further_references_good,
        further_references_bad=further_references_bad,
        depends_on_good=depends_on_good,
        depends_on_bad=depends_on_bad
    )

In [None]:
def process_bibentry_lc(parsed_bibentry: ParsedBibEntry, all_bibkeys_list: list[str]) -> ProcessedBibEntry:
    further_refs = ((bibkey, 0) if bibkey in all_bibkeys_list else (bibkey, 1) for bibkey in parsed_bibentry.further_references_raw)
    depends_on = ((bibkey, 0) if bibkey in all_bibkeys_list else (bibkey, 1) for bibkey in parsed_bibentry.depends_on_raw)

    further_references_good = [bibkey for bibkey, status in further_refs if status == 0]
    further_references_bad = [bibkey for bibkey, status in further_refs if status == 1]
    depends_on_good = [bibkey for bibkey, status in depends_on if status == 0]
    depends_on_bad = [bibkey for bibkey, status in depends_on if status == 1]


    return ProcessedBibEntry(
        id=parsed_bibentry.id,
        bibkey=parsed_bibentry.bibkey,
        title=parsed_bibentry.title,
        notes=parsed_bibentry.notes,
        crossref=parsed_bibentry.crossref,
        further_note=parsed_bibentry.further_note,
        further_references_good=further_references_good,
        further_references_bad=further_references_bad,
        depends_on_good=depends_on_good,
        depends_on_bad=depends_on_bad
    )

In [203]:
import timeit

# Define a large dataset for testing
parsed_bibentry = ParsedBibEntry(
    id="1",
    bibkey="key1",
    title="Title",
    notes="Notes",
    crossref="Crossref",
    further_note="Further Note",
    further_references_raw=["key" + str(i) for i in range(10000)],
    depends_on_raw=["key" + str(i) for i in range(10000)]
)
all_bibkeys_list = ["key" + str(i) for i in range(5000)]

# Measure the performance of the for loop approach
time_for_loops = timeit.timeit(
    lambda: process_bibentry(parsed_bibentry, all_bibkeys_list),
    number=100
)

# Measure the performance of the list comprehension approach
time_list_comprehensions = timeit.timeit(
    lambda: process_bibentry(parsed_bibentry, all_bibkeys_list),
    number=100
)

print(f"Time for loops: {time_for_loops}")
print(f"Time for list comprehensions: {time_list_comprehensions}")

Time for loops: 106.07913959399957
Time for list comprehensions: 76.9970980550006
