# Cord19 dataset overview
Let's take a quick look at the CORD-19 dataset, a text corpus of COVID-19-related research papers.

In [1]:
import os
from itertools import islice

from datasets import load_dataset
# Show the working directory: relative paths used below (e.g. cache_dir='./cache/') resolve against it.
print(os.getcwd())

  from .autonotebook import tqdm as notebook_tqdm


/home/onoo/projects/llm-finetune


In [2]:
# Load the train split of the 'fulltext' configuration of allenai/cord19.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script execute locally; only keep it for sources you trust.
dataset = load_dataset(
    'allenai/cord19',
    'fulltext',
    split='train',
    cache_dir='./cache/',
    trust_remote_code=True,
)

In [3]:
# list the fields available on a single record
dataset[0].keys()

dict_keys(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'abstract', 'publish_time', 'authors', 'journal', 'url', 'fulltext'])

In [4]:
# Preview the first 10 examples: title, then abstract, then a blank separator line.
for i in range(10):
    data = dataset[i]
    print(data['title'], data['abstract'], '', sep='\n')

Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia
OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35

In [5]:
# number of records in the loaded train split
len(dataset)

368618

In [6]:
# let's tokenize some text, using the pretrained 'gpt2' tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [7]:
# tokenize the abstract of the first record
text = dataset[0]['abstract']
inputs = tokenizer(text)
inputs  # bare expression so the notebook displays the tokenizer output

{'input_ids': [9864, 23680, 9306, 25, 770, 41432, 8262, 2423, 8477, 262, 24574, 12371, 290, 8668, 3033, 286, 2319, 3871, 351, 3968, 12, 42874, 2011, 66, 20106, 11797, 35647, 68, 16079, 379, 2677, 23547, 1031, 528, 2059, 9256, 11, 449, 6048, 993, 11, 7420, 9671, 13, 337, 36252, 50, 25, 28021, 351, 3967, 337, 13, 35647, 68, 13817, 422, 22949, 27569, 422, 3269, 8309, 832, 3426, 7795, 547, 5174, 832, 262, 4527, 43592, 4406, 13, 609, 5889, 286, 3871, 547, 11765, 13, 15731, 35342, 25, 2319, 3871, 547, 5174, 11, 4747, 357, 6469, 13, 20, 4407, 286, 4150, 2672, 13938, 13, 4042, 16079, 357, 5892, 13, 20, 4407, 547, 2055, 12, 43561, 1202, 13, 383, 10280, 5676, 477, 2479, 2628, 475, 373, 749, 2219, 287, 18775, 357, 2624, 13, 20, 4407, 290, 662, 12, 14347, 1751, 357, 1828, 13, 20, 18823, 632, 5091, 614, 12, 744, 475, 373, 749, 2219, 287, 262, 2121, 357, 2327, 4407, 290, 6076, 357, 1270, 18823, 3125, 621, 1115, 12, 8230, 286, 3871, 357, 3324, 13, 20, 4407, 550, 401, 273, 14065, 871, 13, 22381, 12, 1

In [8]:
# Wrap the tokenizer and the loaded dataset in the project's ConstantLengthDataset.
# NOTE(review): num_of_sequences presumably controls how many sequences are
# buffered/packed at a time; confirm against src/llm_finetune/dataset.py.
from src.llm_finetune.dataset import ConstantLengthDataset

custom_dataset = ConstantLengthDataset(
    tokenizer,
    dataset,
    seq_length=128,
    num_of_sequences=8,
)

In [9]:
# Peek at the first few items yielded by the custom dataset. islice stops
# after exactly `num_preview_batches` items, replacing the manual countdown
# counter and `break`, and never pulls an extra item from the iterator.
num_preview_batches = 10

for batch in islice(custom_dataset, num_preview_batches):
    tokens = batch['input_ids']
    # each item carries its token ids under 'input_ids'; print shape and contents
    print(tokens.shape, tokens)

torch.Size([128]) tensor([ 7085,  5745,   389,  1090,    83, 11608,   393,   772, 31309,  2854,
         4542,   780,   286, 13156, 15964,  2854,   290, 44365,   287,  2854,
           12,  3106,  1414,  2233,   284,   262,  7375, 11008,    12,  1129,
         4902,    13, 43607,   284,   428,  3957,   290, 20294,  5182,    11,
          356,  7267,   326,   340,   318,  2592,  1593,  1141,   262,  4902,
          284,   407,   691,  2555,   475,   635, 12160,  2854,  4542,   284,
        10996,   257,  4081,   338, 10039,  4571,    11,  2824,  8119,  1597,
         1366,    11,  2148,  4688,  7538,   284,  3925,   290,   670, 24432,
           11,  1805,  5745,   422,  2742,  7476,    11,   290, 12377,  1353,
         7401,    13,  1675,   466,   523,    11,   356,  2897,   257,  4610,
          284, 10980,   262,  6459,  3917,   351, 15964,  2854,  1141,   257,
         4902,    13, 22426,    11,   356,  9117,   290,  4292,  2402,   262,
          880,    12, 27718,  3433, 10335, 195

In [10]:
# Load the train split of the 'metadata' configuration of allenai/cord19.
# NOTE: this rebinds `dataset`, replacing the 'fulltext' configuration loaded
# earlier; re-running earlier cells after this one will operate on the
# metadata configuration instead.
dataset = load_dataset(
    'allenai/cord19',
    'metadata',
    split='train',
    cache_dir='./cache/',
    trust_remote_code=True,
)

Generating train split: 100%|██████████| 368618/368618 [00:22<00:00, 16134.41 examples/s]


In [11]:
# show the first record of the 'metadata' configuration loaded above
dataset[0]

{'cord_uid': 'ug7v899j',
 'sha': 'd1aafb70c066a2068b02786f8929fd9c900897fb',
 'source_x': 'PMC',
 'title': 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia',
 'doi': '10.1186/1471-2334-1-6',
 'abstract': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the f