# Evaluating impacts and mitigation strategies of missing abstracts

In this analysis, we aim to understand the impact of missing abstracts and to evaluate our attempts to mitigate it by adding the missing abstracts.
We do so by drawing a random sample of 1M works and trying to retrieve abstracts for them.

In [None]:
import gzip
import logging
from pathlib import Path

import orjson as json
import tqdm
import httpx

## Setup

In [None]:
# All on-disk artefacts of this notebook live below one shared directory.
_openalex_dir = Path('/mnt/bulk/openalex')

# Base URL of an HTTP service (not used by any cell visible here;
# presumably the endpoint queried in step 3 -- TODO confirm).
oa_base = 'http://10.10.12.41:8984/'

# Root of the OpenAlex snapshot; step 1 globs `data/works/**/*.gz` below it.
snapshot = _openalex_dir / 'openalex-snapshot'
# Output of step 1: one OpenAlex ID per line.
target = _openalex_dir / 'oa_ids.txt'
# Location of the random sample of IDs (presumably the output of step 2).
sample = _openalex_dir / 'oa_ids_sample.txt'

## 1) Prepare list of OpenAlex IDs

In [None]:
# Collect all gzipped works partitions of the snapshot in a stable, sorted order.
partitions = sorted(snapshot.glob('data/works/**/*.gz'))
# NOTE(review): unless logging was configured for INFO elsewhere, this message
# is probably not shown -- confirm whether it should be a print instead.
logging.info(f'Looks like there are {len(partitions):,} partitions.')
logging.getLogger('root').setLevel(logging.WARNING)

# Only the part of the ID after this URL prefix is stored.
# Assumes every ID starts with the prefix; it is cut by length, not checked.
id_prefix_len = len('https://openalex.org/')

n_total = 0   # IDs written to `target` so far
n_failed = 0  # records skipped because they carry no usable ID

# The context manager closes the progress bar even if a partition fails to parse.
with tqdm.tqdm(total=len(partitions)) as progress, open(target, 'w') as f_out:
    for pi, partition in enumerate(partitions, 1):
        # Counts shown here cover all partitions processed before this one.
        progress.set_postfix_str(
            f'total={n_total:,}, '
            f'failed={n_failed:,}, '
            f'filesize={partition.stat().st_size / 1024 / 1024 / 1024:,.2f}GB, '
            f'partition={"/".join(partition.parts[-2:])}',
        )

        with gzip.open(partition, 'rb') as f_in:
            progress.set_description_str(f'READ ({pi:,})')
            ids = []
            for line in f_in:
                # `or ''` also covers an explicit null ID, which cannot be sliced.
                oa_id = (json.loads(line).get('id') or '')[id_prefix_len:]
                if oa_id:
                    ids.append(oa_id)
                else:
                    # Count instead of writing a blank line into the ID list.
                    n_failed += 1

        # Skip the write for empty partitions so no stray blank line is emitted.
        if ids:
            f_out.write('\n'.join(ids) + '\n')
            n_total += len(ids)
        progress.update()

    # Show the final counts, including the last partition.
    progress.set_postfix_str(f'total={n_total:,}, failed={n_failed:,}')

## 2) Select random sample

In [None]:
%%bash
# Randomly pick 1,000,000 lines (IDs) from the full ID list.
# Absolute paths match `target` and `sample` from the setup cell, so the
# result does not depend on the notebook's current working directory.
shuf -n 1000000 /mnt/bulk/openalex/oa_ids.txt > /mnt/bulk/openalex/oa_ids_sample.txt

## 3) Load extra info for samples

In [None]:
# NOTE(review): placeholder request with an empty URL. Presumably this is meant
# to query the service at `oa_base` for each sampled ID -- TODO confirm. httpx
# is expected to reject a URL without a scheme, so a real URL is needed here.
httpx.get('')