In [None]:
# Number of lines read into each chunk before the split loop below starts
# looking for the next line that consists solely of "}" to end the chunk on.
n_lines = 100_000
# Source file that gets split into the ./bibs/anth-NN.bib chunk files.
filename = './anthology+abstracts.bib'
# Alternative input, left disabled:
# filename = './test.bib'
# Running chunk counter; it is incremented before each write, so the first
# chunk file is numbered 01.
idx = 0

def write_buffer(buffer, idx):
    """Write one chunk of lines to ``./bibs/anth-<idx>.bib``.

    Args:
        buffer: Iterable of strings, written verbatim with ``writelines``
            (no line terminators are added).
        idx: Integer chunk number; zero-padded to two digits in the file
            name and in the progress message.
    """
    # Imported locally because nothing else in this cell imports os.
    import os

    # Create the output directory on demand so the first write does not fail
    # with FileNotFoundError when ./bibs does not exist yet.
    os.makedirs('./bibs', exist_ok=True)

    print(f'Writing file {idx:02}')
    with open(f'./bibs/anth-{idx:02}.bib', 'w', encoding='utf-8') as fp:
        fp.writelines(buffer)

# Split the source file into chunk files. Each chunk holds n_lines lines and
# is then extended up to and including the next line that consists solely of
# "}" (presumably the closing brace of a BibTeX entry), or to end of file.
with open(filename, 'r', encoding='utf-8') as f:
    line = f.readline()
    while line != "":
        buffer = []

        # Take up to n_lines lines. readline() returns "" at end of file, so
        # stop there instead of looping on empty reads for the last chunk.
        while line != "" and len(buffer) < n_lines:
            buffer.append(line)
            line = f.readline()

        # `line` has been read but not stored yet; keep consuming until the
        # closing "}" line or end of file.
        while line != "" and line.strip() != "}":
            buffer.append(line)
            line = f.readline()

        # Store the closing "}" line itself; at end of file nothing is left.
        if line != "":
            buffer.append(line)

        # write out to temp file
        idx += 1
        write_buffer(buffer, idx)
        line = f.readline()

Writing file 01
Writing file 02
Writing file 03
Writing file 04
Writing file 05
Writing file 06
Writing file 07
Writing file 08
Writing file 09
Writing file 10
Writing file 11
Writing file 12
Writing file 13
Writing file 14
Writing file 15
Writing file 16
Writing file 17
Writing file 18
Writing file 19
Writing file 20
Writing file 21


In [1]:
import bibtexparser as bp
import multiprocessing

In [2]:
def load_bib_file(filename, entity_list, wlock):
    """Parse one file with bibtexparser and collect its entries that have an abstract.

    Args:
        filename: Path of the file to parse; opened as UTF-8 text.
        entity_list: List-like sink that receives the kept entries. The
            driver cell passes a ``multiprocessing.Manager().list()`` proxy.
        wlock: Lock (usable as a context manager) held while ``entity_list``
            is being extended.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        bib = bp.load(fp)
    # Keep only the entries that carry an 'abstract' key.
    entities = [entry for entry in bib.entries if 'abstract' in entry]

    # A single extend() instead of one append() per entry: with a manager
    # list proxy every method call is a round trip to the manager process,
    # so this keeps the shared lock held for one call rather than thousands.
    with wlock:
        entity_list.extend(entities)

    print(f'Loaded {len(entities)} entries from {filename}')


def pull_file_and_process(file_list, entity_list, entity_lock, file_lock, done):
    """Worker loop: pop file names from a shared list and load each one.

    Runs until ``file_list`` is empty. Every popped name is handed to
    ``load_bib_file`` together with ``entity_list`` and ``entity_lock``.

    Args:
        file_list: Shared list of file names; names are taken from the end
            with ``pop()``.
        entity_list: Shared sink forwarded to ``load_bib_file``.
        entity_lock: Lock forwarded to ``load_bib_file``.
        file_lock: Lock guarding the emptiness check and ``pop()`` on
            ``file_list`` so the two happen as one step.
        done: Accepted to keep the call signature; not used by this function.
    """
    while True:
        # The context manager releases the lock on every exit path,
        # including an exception raised by the check or by pop().
        with file_lock:
            if not file_list:
                break
            filename = file_list.pop()

        # Parse outside the file lock so other workers can keep pulling names.
        load_bib_file(filename, entity_list, entity_lock)


In [None]:
# Load all chunk files in parallel: worker processes pull file names from a
# shared list and append the parsed entries to a shared list.
with multiprocessing.Manager() as manager:
    acl_bib = manager.list()
    f_lock = manager.Lock()
    e_lock = manager.Lock()
    done = manager.Event()

    # write_buffer() names the chunks ./bibs/anth-NN.bib, so the queued names
    # need the .bib extension too. 1..21 matches the 21 chunk files reported
    # by the split cell.
    files = manager.list(
        [f'./bibs/anth-{i:02}.bib' for i in range(1, 22)]
    )

    procs = []
    for proc_idx in range(multiprocessing.cpu_count()):
        proc = multiprocessing.Process(target=pull_file_and_process, args=(files, acl_bib, e_lock, f_lock, done, ))
        proc.start()
        procs.append(proc)

    for proc in procs:
        proc.join()

    # ejects from manager to allow access outside of manager context
    acl_bib = list(acl_bib)

# A worker that died (missing file, parse error, ...) exits with a non-zero
# code; report it instead of silently continuing with partial data.
# acl_bib still holds whatever was loaded, for inspection.
failed_workers = [proc.pid for proc in procs if proc.exitcode != 0]
if failed_workers:
    raise RuntimeError(
        f'{len(failed_workers)} worker process(es) failed (pids: {failed_workers}); '
        f'only {len(acl_bib)} entries were loaded'
    )

Loaded 279 entries from ./bibs/anth-21
Loaded 4496 entries from ./bibs/anth-11
Loaded 4244 entries from ./bibs/anth-10
Loaded 2933 entries from ./bibs/anth-14
Loaded 970 entries from ./bibs/anth-17
Loaded 1600 entries from ./bibs/anth-15
Loaded 4533 entries from ./bibs/anth-12
Loaded 3684 entries from ./bibs/anth-13
Loaded 991 entries from ./bibs/anth-16
Loaded 1613 entries from ./bibs/anth-18
Loaded 1132 entries from ./bibs/anth-19
Loaded 849 entries from ./bibs/anth-20
Loaded 4662 entries from ./bibs/anth-06
Loaded 4371 entries from ./bibs/anth-08
Loaded 4132 entries from ./bibs/anth-07
Loaded 3889 entries from ./bibs/anth-09
Loaded 4347 entries from ./bibs/anth-02
Loaded 4375 entries from ./bibs/anth-01
Loaded 4735 entries from ./bibs/anth-04
Loaded 4276 entries from ./bibs/anth-03
Loaded 4228 entries from ./bibs/anth-05


In [3]:
import json
# Load the entities from the JSON file; this (re)binds acl_bib.
with open('./data/acl-entities.json', mode='r', encoding='utf-8') as entities_fp:
    acl_bib = json.load(entities_fp)

In [4]:
# Report how many entities acl_bib currently holds.
print(f'Loaded {len(acl_bib)} entities')

Loaded 66172 entities


In [13]:
# For each inspected field, collect its value from every entity that has it
# (duplicates are kept in the lists).
inspected_fields = ('publisher', 'year', 'booktitle')
unique_vals_for_keys = {
    field: [entity[field] for entity in acl_bib if field in entity]
    for field in inspected_fields
}

# Per field: print the number of distinct values, then the values sorted.
for field, field_values in unique_vals_for_keys.items():
    distinct_values = set(field_values)
    print(f'{field}: {len(distinct_values)} unique values')
    for distinct_value in sorted(distinct_values):
        print(f'  - {distinct_value}')

publisher: 56 unique values
  - ACL
  - AFCP - ATALA
  - ATALA
  - ATALA and AFPC
  - ATALA et AFCP
  - Asia-Pacific Association for Machine Translation
  - Asian Federation of Natural Language Processing
  - Association for Computational Linguistics
  - Association for Computational Linguistics, Shoumen, Bulgaria
  - Association for Computational Lingustics
  - Association for Machine Translation in the Americas
  - Association of Computational Linguistics
  - Australasian Language Technology Association
  - CEUR Workshop Proceedings
  - CLUP, Centro de Lingu{\'i}stica da Universidade do Porto FLUP - Faculdade de Letras da Universidade do Porto
  - COLING
  - CSLI Publications
  - Carnegy Mellon University
  - Chinese Information Processing Society of China
  - Department of Computational Linguistics, IBL -- BAS
  - Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
  - ELRA and ICCL
  - European Association for Machine Translation

In [None]:
# Build cleaned copies of the entities. Each entry dict is copied first, so
# the dicts in acl_bib stay untouched and re-running this cell gives the same
# result (list(acl_bib) alone would share, and mutate, the original dicts).
cleaned_entities = []
for source_entry in acl_bib:
    entry = dict(source_entry)

    # Rename the 'ID' key to lower-case 'id'.
    if 'ID' in entry:
        entry['id'] = entry.pop('ID')

    # Turn an "A and B and C" string into a list of trimmed names.
    # NOTE(review): bibtexparser normally stores the BibTeX field under the
    # key 'author'; if these entities come straight from it, this branch
    # never runs -- confirm which key is intended.
    if 'authors' in entry:
        entry['authors'] = [a.strip() for a in entry['authors'].split(' and ')]

    cleaned_entities.append(entry)



In [6]:
# Write the cleaned entities out as JSON.
with open('./data/acl-entities-cleaned.json', mode='w', encoding='utf-8') as cleaned_fp:
    json.dump(cleaned_entities, cleaned_fp)