In [3]:
import ir_datasets
import pandas as pd

dataset = ir_datasets.load("cord19")

# Metadata attributes copied from each document; the order here is the CSV column order.
fields = ("doc_id", "title", "doi", "date", "abstract")

# Only the first 20,000 documents of the collection are read.
data = [
    {field: getattr(entry, field) for field in fields}
    for entry in dataset.docs_iter()[:20000]
]

# Persist the sampled metadata so it can be reloaded without ir_datasets.
df = pd.DataFrame(data)
df.to_csv("covid_dataset.csv", index=False)

print("Dataset saved to covid_dataset.csv")

## Data Sources

### Description

The [COVID-19 Open Research Dataset (CORD-19)](https://github.com/allenai/cord19) is a freely available resource of over 1,000,000 scholarly articles about COVID-19, SARS-CoV-2, and related coronaviruses. The dataset was developed by the Allen Institute for AI in collaboration with several organizations, including the White House, NIH, and leading research groups. Its goal is to facilitate the development of new tools and technologies that help researchers find relevant information about the virus and its spread, and to support the global research community in the fight against the pandemic. It was first published in March 2020 and was updated weekly until May 2nd, 2022.

### Characteristics

The dataset can be obtained in various formats, including JSON and CSV. It contains metadata for each article, such as the document id, title, abstract, and publication date. The version used here (the release of 16-07-2020) has about 193,000 articles with a total size of 3.7GB. Some of these articles have missing abstracts or share the same title. This can happen because the dataset is a collection of articles from different sources, and some of them may have been published in more than one place.

## Characterization 

In [2]:
import nltk 
# Fetch the NLTK resources used later in the notebook (tokenizer models,
# stop-word lists, WordNet). `punkt_tab` is requested alongside `punkt`
# because recent NLTK releases load the tokenizer models from `punkt_tab`;
# without it `word_tokenize` raises LookupError on those versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

import spacy

### Text Analysis

In [3]:
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Keep only rows with a usable abstract (neither null nor empty). `.copy()`
# yields an independent frame, so adding the `lem_abstract` column below does
# not raise SettingWithCopyWarning.
df = df[df["abstract"].notna() & (df["abstract"] != "")].copy()

# A set gives constant-time membership tests in the per-token stop-word filter.
stop_words = set(stopwords.words('english'))

# Step 1: drop every character listed in `string.punctuation`.
# (The list is no longer called `np`, which collided with the numpy alias,
# and the loop variable no longer shadows the builtin `abs`.)
strip_punct = str.maketrans('', '', punctuation)
abs_nopunct = [text.translate(strip_punct) for text in df["abstract"]]
print(abs_nopunct[:5])

# Step 2: split each cleaned abstract into word tokens.
token_abs = [word_tokenize(text) for text in abs_nopunct]
print(token_abs[:5])

# Step 3: remove English stop words (compared case-insensitively) and
# re-join the surviving tokens with single spaces.
abs_nstp = [
    ' '.join([word for word in tokens if word.lower() not in stop_words])
    for tokens in token_abs
]
print(abs_nstp[:5])

# Step 4: lemmatize every remaining token with WordNetLemmatizer.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
abs_lem = [
    ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    for text in abs_nstp
]
print(abs_lem[:5])

# Attach the result; `abs_lem` has one entry per row of the filtered frame.
df['lem_abstract'] = abs_lem

df.to_csv("covid_dataset_preprocessed.csv", index=False)


### Word Cloud

In [4]:
import numpy as np
from wordcloud import WordCloud

import matplotlib.pyplot as plt

# Concatenate all lemmatized abstracts into a single text for the cloud.
corpus_text = ' '.join(df['lem_abstract'])

# Rendering options: at most 100 words on a black canvas, fonts capped at
# size 110, random_state pinned to 44.
cloud_options = dict(
    background_color='black',
    max_words=100,
    random_state=44,
    max_font_size=110,
)
wc = WordCloud(**cloud_options)
wc.generate(corpus_text)

# Draw the generated cloud on a wide (50 x 7) figure.
plt.figure(figsize=(50, 7))
plt.imshow(wc)
plt.show()





### Named Entity Recognition

In [1]:
# Load the spaCy pipeline named "en_core_web_sm"; `extract_entities` and the
# displacy rendering below both run text through this `nlp` object.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Run the module-level `nlp` pipeline on `text` and collect its entities.

    Returns a list of `(entity text, entity label)` tuples, one for each item
    in the processed document's `ents`, in the order they are reported.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Coerce each lemmatized abstract to `str`, then extract its entities.
lem_texts = df['lem_abstract'].map(str)
df['entities'] = lem_texts.map(extract_entities)
print(df[['entities']].head())

from spacy import displacy

# Visualise the entities found in the first lemmatized abstract.
first_doc = nlp(df['lem_abstract'].iloc[0])
displacy.render(first_doc, style='ent', jupyter=True)
## TODO: look into sci-spacy to improve named entity extraction