In [1]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# DOC_PATTERN = r'.*\.json'

# corpus = PlaintextCorpusReader(f'./data/documents_json/', DOC_PATTERN)

# # corpus.fileids()

In [2]:
from langdetect import detect
from langdetect import DetectorFactory

def get_lang(article):
    text = '\n'.join(article['body_text']).split(" ")
    # print(text[:50])

    lang = "en"
    try:
        if len(text) > 50:
            lang = detect(" ".join(text[:50]))
        elif len(text) > 0:
            lang = detect(" ".join(text[:]))
    # ught... beginning of the document was not in a good format
    except Exception as e:
        all_words = set(text)
        try:
            lang = detect(" ".join(all_words))
        # what!! :( let's see if we can find any text in abstract...
        except Exception as e:
            
            try:
                # let's try to label it through the abstract then
                lang = detect(' '.join(article['abstract_summary']))
            except Exception as e:
                lang = "unknown"
                pass
    
    return lang


In [22]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [3]:
import string
import spacy
import scispacy
import en_core_sci_lg
from spacy.lang.en.stop_words import STOP_WORDS

punctuations = string.punctuation
stopwords = list(STOP_WORDS)
stopwords[:10]

custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]

for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)

# Parser
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    # mytokens = " ".join([i for i in mytokens])
    return mytokens

In [17]:
import codecs
import json
import pandas as pd

from nltk import sent_tokenize

DOC_PATTERN = r'.*\.json'

class ArticleCorpusReader(PlaintextCorpusReader, CorpusReader):
    def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
        if any(key.startswith('metadata_path') for key in kwargs.keys()):
            self.metadata_path = kwargs['metadata_path']
            self.df_metadata = pd.read_csv(self.metadata_path, dtype={
                'pubmed_id': str,
                'Microsoft Academic Paper ID': str, 
                'doi': str
            })
            # print('has key')
        # print(self.df_metadata)

        PlaintextCorpusReader.__init__(self, root, fileids, kwargs)
        CorpusReader.__init__(self, root, fileids, kwargs)
    
    def docs(self, fileids=None):
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as file:
                content = json.load(file)
                dict_ = {'paper_id': None, 'abstract': None, 'body_text': None}

                dict_['paper_id'] = content['paper_id']
                dict_['abstract'] = []
                dict_['body_text'] = []
                # Abstract
                for entry in content['abstract']:
                    dict_['abstract'].append(entry['text'])
                # Body text
                for entry in content['body_text']:
                    dict_['body_text'].append(entry['text'])
                # dict_['abstract'] = '\n\n'.join(dict_['abstract'])
                # dict_['body_text'] = '\n\n'.join(dict_['body_text'])

                yield dict_

    def metadata(self, fileids=None):
        if fileids == None:
            # not working
            yield None
        else:
            for doc in self.docs(fileids):
                # get metadata infomation
                self.df_metadata = self.df_metadata.loc[self.df_metadata['sha'] == doc['paper_id']]

                if len(self.df_metadata) == 0:
                    yield None

                yield self.df_metadata

    def articles(self, fileids=None):
        if fileids == None:
            yield None
        else:
            for doc in self.docs(fileids):
                dict_ = {'paper_id': None, 'doi': None, 'abstract': None, 'body_text': None, 'authors': [], 'title': None, 'journal': None, 'abstract_summary': None,
                'abstract_word_count': 0, 'body_word_count': 0, 'body_unique_words': 0}

                dict_['abstract'] = doc['abstract']
                dict_['paper_id'] = doc['paper_id']
                dict_['body_text'] = doc['body_text']

                abstract_text = '\n'.join(doc['abstract'])

                # also create a column for the summary of abstract to be used in a plot
                if len(doc['abstract']) == 0:
                    # no abstract provided
                    dict_['abstract_summary'] = ["Not provided."]
                elif len(abstract_text.split(' ')) > 100:
                    # abstract provided is too long for plot, take first 100 words append with ...
                    info = abstract_text.split(' ')[:100]
                    summary = get_breaks(' '.join(info), 40)
                    dict_['abstract_summary'] = summary + "..."
                else:
                    # abstract is short enough
                    summary = get_breaks(abstract_text, 40)
                    dict_['abstract_summary'] = summary
                
                try:
                    # if more than one author
                    authors = self.df_metadata['authors'].values[0].split(';')
                    if len(authors) > 2:
                        # if more than 2 authors, take them all with html tag breaks in between
                        dict_['authors'].append(get_breaks('. '.join(authors), 40))
                    else:
                        # authors will fit in plot
                        dict_['authors'].append(". ".join(authors))
                except Exception as e:
                    # if only one author - or Null value
                    dict_['authors'].append(self.df_metadata['authors'].values[0])
                
                # get word counts
                dict_['abstract_word_count'] = len('\n'.join(dict_['abstract']).strip().split())
                dict_['body_word_count'] = len('\n'.join(dict_['body_text']).strip().split())
                dict_['body_unique_words'] = len(set('\n'.join(dict_['body_text']).split()))

                dict_['lang'] = get_lang(dict_);

                yield dict_

    def sizes(self, fileids=None):
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

    def paras(self, fileids=None):
        for article in self.articles(fileids):
            for paragraph in article['body_text']:
                yield paragraph
    
    def sents(self, fileids=None):
        for paragraph in self.paras(fileids):
            for sentence in sent_tokenize(paragraph):
                yield sentence

    def words(self, fileids=None):
        for sentence in self.sents(fileids):
            for word_tok in spacy_tokenizer(sentence):
                yield word_tok

In [20]:
article_reader = ArticleCorpusReader(f'./data/documents_json/', metadata_path=f'./data/metadata.csv')

In [25]:
# self.df_metadata = None
# [print(article) for article in article_reader.metadata()]
# [str(article) for article in article_reader.docs(article_reader.fileids()[0])]
# [str(article) for article in article_reader.metadata(article_reader.fileids()[0])]
# [str(article) for article in article_reader.articles(article_reader.fileids()[0])]
# [str(article) + ' kbs' for article in article_reader.sizes(article_reader.fileids()[0])]
# [str(article) for article in article_reader.paras(article_reader.fileids()[0])]
# [str(article) for article in article_reader.articles(article_reader.fileids()[0])]
# [str(article) for article in article_reader.sents(article_reader.fileids()[0])]
# [str(article) for article in article_reader.words(article_reader.fileids()[0])]
next(article_reader.articles(article_reader.fileids()[0]))

{'paper_id': '0000028b5cc154f68b8a269f6578f21e31f62977',
 'doi': None,
 'abstract': [],
 'body_text': ['According to current live statistics at the time of editing this letter, Russia has been the third country in the world to be affected by COVID-19 with both new cases and death rates rising. It remains in a position of advantage due to the later onset of the viral spread within the country since the worldwide disease outbreak.',
  'The first step in "fighting" the epidemic was nationwide lock down on March 30 th , 2020.',
  'Most of the multidisciplinary hospitals have been repurposed as dedicated COVID-19 centres, so the surgeons started working as infectious disease specialists. Such a reallocation of health care capacity results in the effective management of this epidemiological problem 1 . The staff has undergone on-line 36-hour training course to become qualified in coronavirus infection treatment.',
  'The surgeons of COVID-19 dedicated hospitals do rarely practice surgery. Wh

In [7]:
spacy_tokenizer('Just a simple sentence here.')

['simple', 'sentence']

In [18]:
import os
import pickle

class Preprocessor(object):
    def __init__(self, corpus, target=None, **kwargs):
        self.corpus = corpus
        self.target = target

    def fileids(self, fileids=None):
        if fileids != None:
            return fileids
        return self.corpus.fileids()
    
    def abspath(self, fileid):
        parent = os.path.relpath(os.path.dirname(article_reader.abspath(fileid)), article_reader.root)

        basename =  os.path.basename(fileid)
        name, ext = os.path.splitext(basename)

        basename = name + '.pkl'

        return os.path.normpath(os.path.join(self.target, parent, basename))
    
    def tokenize(self, fileid):
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [ spacy_tokenizer(sent) for sent in sent_tokenize(paragraph) ]

    def process(self, fileid):
        target = self.abspath(fileid)
        parent = os.path.dirname(target)

        if not os.path.exists(parent):
            os.makedirs(parent)
        
        if not os.path.isdir(parent):
            raise ValueError("document path in not a directory")

        document = list(self.tokenize(fileid))

        with open(target, 'wb') as f:
            pickle.dump(document, f, pickle.HIGHEST_PROTOCOL)

        # del document

        return target
    
    def transform(self, fileids=None):
        if not os.path.exists(self.target):
            os.makedirs(self.target)

        for fileid in self.fileids(fileids):
            yield self.process(fileid)

In [28]:
from tqdm import tqdm

preprocessor_ = Preprocessor(article_reader, f'./data/pickled/')

# print(article_reader.abspath('0000028b5cc154f68b8a269f6578f21e31f62977.json'))
# next(preprocessor_.transform(['0000028b5cc154f68b8a269f6578f21e31f62977.json']))
# os.path.relpath(os.path.dirname(article_reader.abspath(article_reader.fileids()[0])), article_reader.root)
# article_reader.abspath('0000028b5cc154f68b8a269f6578f21e31f62977.json')
# [str(article) for article in preprocessor_.transform(article_reader.fileids())]

all_fileids = article_reader.fileids()

for n in tqdm(preprocessor_.transform(all_fileids), total=len(all_fileids)):
    pass
# article_reader.fileids()[0]

  0%|          | 42/123105 [00:37<30:39:27,  1.12it/s]


KeyboardInterrupt: 