In [None]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# DOC_PATTERN = r'.*\.json'

# corpus = PlaintextCorpusReader(f'./data/documents_json/', DOC_PATTERN)

# # corpus.fileids()

In [None]:
from langdetect import detect
from langdetect import DetectorFactory

def detect_article_lang(article):
    """Best-effort language detection for one article.

    Several text samples are tried in turn and the first one that langdetect
    can classify wins:

    1. the first 50 space-separated words of the body text,
    2. the distinct words of the whole body (helps when the beginning of the
       document is badly formatted),
    3. the abstract summary.

    Args:
        article: mapping with a ``'body_text'`` list of strings and,
            optionally, an ``'abstract_summary'`` that is either a string or a
            list of strings.

    Returns:
        The language code returned by ``langdetect.detect``, or ``"unknown"``
        when none of the samples can be classified.
    """
    # langdetect is randomised unless the factory is seeded; pin the seed so
    # that the same article always receives the same label.
    DetectorFactory.seed = 0

    words = '\n'.join(article['body_text']).split(" ")

    summary = article.get('abstract_summary') or ""
    if not isinstance(summary, str):
        # a list of strings: join the entries (joining a *string* would
        # interleave spaces between its individual characters)
        summary = ' '.join(summary)
    # the summary may carry "<br>" line-break markup meant for plots
    summary = summary.replace("<br>", " ")

    # Built lazily so later samples cost nothing when an earlier one succeeds.
    samples = (
        lambda: " ".join(words[:50]),
        # dict.fromkeys de-duplicates while keeping a deterministic order
        lambda: " ".join(dict.fromkeys(words)),
        lambda: summary,
    )

    for build_sample in samples:
        try:
            return detect(build_sample())
        except Exception:
            # deliberate best effort: fall through to the next sample
            continue

    return "unknown"


In [None]:
def get_breaks(content, length):
    """Insert HTML ``<br>`` breaks so that lines hold roughly ``length`` characters.

    ``content`` is split on single spaces. Words are appended to the current
    line until the number of word characters on it (separators are not
    counted) would exceed ``length``; the offending word then starts a new
    line after a ``<br>``.

    Args:
        content: text to wrap.
        length: maximum number of word characters per line. A single word
            longer than ``length`` is never split.

    Returns:
        The wrapped text, without a leading separator.
    """
    pieces = []
    line_chars = 0

    for word in content.split(' '):
        line_chars += len(word)
        if pieces:
            if line_chars > length:
                pieces.append("<br>")
                # the word that opens the new line counts towards that line
                line_chars = len(word)
            else:
                pieces.append(" ")
        pieces.append(word)

    # join once instead of growing a string inside the loop
    return "".join(pieces)

In [None]:
import string
import spacy
import scispacy
import en_core_sci_lg
from spacy.lang.en.stop_words import STOP_WORDS

# Characters treated as punctuation when filtering tokens.
punctuations = string.punctuation
# Start from spaCy's English stop words; kept as a list so it can be sliced.
stopwords = list(STOP_WORDS)
print(stopwords[:10])

# Publishing boilerplate that carries no topical information.
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]

for w in custom_stop_words:
    # spacy_tokenizer lower-cases every token before the stop-word check, so
    # capitalised entries ('Elsevier', 'PMC', 'CZI') would never match.
    w = w.lower()
    if w not in stopwords:
        stopwords.append(w)

# Parser
# The tagger and NER pipeline components are switched off when loading.
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
# Raise the pipeline's input-length limit so very long texts are accepted.
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` into lower-cased lemmas without stop words or punctuation.

    Args:
        sentence: raw text to run through the module-level ``parser``.

    Returns:
        A list of strings. Each token is replaced by its lower-cased, stripped
        lemma (the lower-cased surface form when the lemma is the ``-PRON-``
        placeholder). Empty tokens, entries of ``stopwords`` and tokens that
        consist solely of characters from ``punctuations`` are dropped.
    """
    # Sets give constant-time lookups; ``stopwords`` itself is a list.
    stop_set = set(stopwords)
    punct_chars = set(punctuations)

    tokens = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()

        if not text or text in stop_set:
            continue
        # Test every character: a plain ``text in punctuations`` is a substring
        # test that lets multi-character tokens such as "..." or "--" through.
        if all(char in punct_chars for char in text):
            continue
        tokens.append(text)

    return tokens

In [None]:
import codecs
import json
import pandas as pd

from nltk import sent_tokenize

DOC_PATTERN = r'.*\.json'

class ArticleCorpusReader(PlaintextCorpusReader, CorpusReader):
    """Corpus reader for article JSON files with an optional metadata CSV.

    Each JSON file is expected to provide ``'paper_id'``, ``'body_text'`` (a
    list of entries with a ``'text'`` field) and optionally ``'abstract'``
    (same layout) and ``'metadata'['title']``. When a metadata CSV is given,
    its ``'sha'`` column is matched against ``paper_id`` to look up
    per-article information such as the authors.
    """

    def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
        """Create the reader.

        Args:
            root: corpus root directory.
            fileids: list of file ids or a regular expression selecting them.
            **kwargs: ``metadata_path`` (path of the metadata CSV, optional);
                any other keyword is forwarded to ``PlaintextCorpusReader``.
        """
        # Tolerate callers that bundle their options into a single
        # ``kwargs={...}`` keyword instead of spreading them.
        bundled = kwargs.pop('kwargs', None)
        if isinstance(bundled, dict):
            kwargs = {**bundled, **kwargs}

        self.metadata_path = kwargs.pop('metadata_path', None)
        # Always defined, so later lookups can test for None instead of
        # failing with an AttributeError.
        self.df_metadata = None
        if self.metadata_path is not None:
            self.df_metadata = pd.read_csv(self.metadata_path, dtype={
                'pubmed_id': str,
                'Microsoft Academic Paper ID': str, 
                'doi': str
            })

        # Forward the remaining options as keywords; passing the dict
        # positionally would bind it to the tokenizer/encoding parameters.
        PlaintextCorpusReader.__init__(self, root, fileids, **kwargs)

    def _metadata_rows(self, paper_id):
        """Return the metadata rows whose ``sha`` equals ``paper_id``, or None if there are none."""
        if self.df_metadata is None:
            return None
        # Filter into a local frame: the full table must stay intact for the
        # next lookup.
        rows = self.df_metadata.loc[self.df_metadata['sha'] == paper_id]
        return rows if len(rows) > 0 else None

    def _first_value(self, rows, column):
        """Return ``column`` of the first matching row, or None when missing or NaN."""
        if rows is None or column not in rows.columns:
            return None
        value = rows[column].values[0]
        return None if pd.isna(value) else value

    def _format_authors(self, rows):
        """Return the authors as a list holding one display string (empty when unknown)."""
        raw = self._first_value(rows, 'authors')
        if not isinstance(raw, str):
            return []
        authors = [name.strip() for name in raw.split(';') if name.strip()]
        if not authors:
            return []
        joined = '. '.join(authors)
        # more than two authors do not fit on one plot line: add html breaks
        return [get_breaks(joined, 40) if len(authors) > 2 else joined]

    def _summarise_abstract(self, paragraphs):
        """Return a plot-friendly abstract string: at most 100 words, wrapped with ``<br>``."""
        if len(paragraphs) == 0:
            # no abstract provided
            return "Not provided."
        abstract_text = '\n'.join(paragraphs)
        words = abstract_text.split(' ')
        if len(words) > 100:
            # too long for a plot: keep the first 100 words and mark the cut
            return get_breaks(' '.join(words[:100]), 40) + "..."
        return get_breaks(abstract_text, 40)

    def docs(self, fileids=None):
        """Yield one dict per file with ``paper_id``, ``title``, ``abstract`` and ``body_text``.

        ``abstract`` and ``body_text`` are lists of paragraph strings;
        ``title`` falls back to ``'Not provided.'`` when empty or missing.
        """
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as file:
                content = json.load(file)

            # Yield outside the ``with`` block so the file is not held open
            # while the generator is suspended.
            yield {
                'paper_id': content['paper_id'],
                'title': content.get('metadata', {}).get('title') or 'Not provided.',
                'abstract': [entry['text'] for entry in content.get('abstract', [])],
                'body_text': [entry['text'] for entry in content['body_text']],
            }

    def metadata(self, fileids=None):
        """Yield, per document, the matching metadata rows (a DataFrame) or None.

        With ``fileids=None`` every file of the corpus is visited.
        """
        for doc in self.docs(fileids):
            yield self._metadata_rows(doc['paper_id'])

    def articles(self, fileids=None):
        """Yield one enriched article dict per document.

        On top of the fields produced by ``docs`` each dict carries ``doi``,
        ``journal`` and ``authors`` (from the metadata, when available),
        ``abstract_summary``, the word counts and the detected ``lang``.
        With ``fileids=None`` every file of the corpus is visited.
        """
        for doc in self.docs(fileids):
            rows = self._metadata_rows(doc['paper_id'])
            abstract_words = '\n'.join(doc['abstract']).split()
            body_words = '\n'.join(doc['body_text']).split()

            article = {
                'paper_id': doc['paper_id'],
                'doi': self._first_value(rows, 'doi'),
                'abstract': doc['abstract'],
                'body_text': doc['body_text'],
                'authors': self._format_authors(rows),
                'title': doc['title'],
                'journal': self._first_value(rows, 'journal'),
                # summary of the abstract, to be used in a plot
                'abstract_summary': self._summarise_abstract(doc['abstract']),
                'abstract_word_count': len(abstract_words),
                'body_word_count': len(body_words),
                'body_unique_words': len(set(body_words)),
            }
            article['lang'] = detect_article_lang(article)

            yield article

    def sizes(self, fileids=None):
        """Yield the size in bytes of every selected file."""
        # imported locally so the method does not rely on a later cell
        import os

        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

    def paras(self, fileids=None):
        """Yield every body paragraph of the selected articles."""
        for article in self.articles(fileids):
            for paragraph in article['body_text']:
                yield paragraph
    
    def sents(self, fileids=None):
        """Yield every sentence of the selected articles (split with ``sent_tokenize``)."""
        for paragraph in self.paras(fileids):
            for sentence in sent_tokenize(paragraph):
                yield sentence

    def words(self, fileids=None):
        """Yield every token of the selected articles (tokenized with ``spacy_tokenizer``)."""
        for sentence in self.sents(fileids):
            for word_tok in spacy_tokenizer(sentence):
                yield word_tok

In [None]:
# Reader over the JSON documents; the metadata CSV is loaded in the constructor.
article_reader = ArticleCorpusReader(f'./data/documents_json/', metadata_path=f'./data/metadata.csv')

In [None]:
# self.df_metadata = None
# [print(article) for article in article_reader.metadata()]
# [str(article) for article in article_reader.docs(article_reader.fileids()[0])]
# [str(article) for article in article_reader.metadata(article_reader.fileids()[0])]
# [str(article) for article in article_reader.articles(article_reader.fileids()[0])]
# [str(article) + ' kbs' for article in article_reader.sizes(article_reader.fileids()[0])]
# [str(article) for article in article_reader.paras(article_reader.fileids()[0])]
# [str(article) for article in article_reader.articles(article_reader.fileids()[0])]
# [str(article) for article in article_reader.sents(article_reader.fileids()[0])]
# [str(article) for article in article_reader.words(article_reader.fileids()[0])]
# Smoke test: build and display the article record of the first file id in the corpus.
next(article_reader.articles(article_reader.fileids()[0]))

In [None]:
# Smoke test: show the tokens the tokenizer keeps for a short sentence.
spacy_tokenizer('Just a simple sentence here.')

In [None]:
import os
import pickle

class Preprocessor(object):
    """Tokenize the English articles of a corpus and pickle them under ``target``.

    Every processed article is stored as ``{'title': ..., 'doc': ...}`` where
    ``doc`` is a list of paragraphs, each a list of sentences, each a list of
    tokens. The directory layout below ``target`` mirrors the layout of the
    corpus below its root.
    """

    def __init__(self, corpus, target=None, **kwargs):
        """
        Args:
            corpus: reader exposing ``fileids()``, ``abspath()``, ``root`` and
                ``articles()``.
            target: directory that receives the pickled documents.
            **kwargs: accepted for compatibility; currently unused.
        """
        self.corpus = corpus
        self.target = target

    def fileids(self, fileids=None):
        """Return ``fileids`` when given, otherwise every file id of the corpus."""
        if fileids is not None:
            return fileids
        return self.corpus.fileids()
    
    def abspath(self, fileid):
        """Return the path of the pickle that corresponds to ``fileid``."""
        # Resolve against the corpus this instance wraps, not a global reader.
        source_dir = os.path.dirname(self.corpus.abspath(fileid))
        parent = os.path.relpath(source_dir, self.corpus.root)

        name, _ext = os.path.splitext(os.path.basename(fileid))

        return os.path.normpath(os.path.join(self.target, parent, name + '.pkl'))
    
    def tokenize_article(self, article):
        """Yield, per body paragraph, a list of tokenized sentences."""
        for paragraph in article['body_text']:
            yield [ spacy_tokenizer(sent) for sent in sent_tokenize(paragraph) ]

    def process(self, fileid):
        """Tokenize and pickle one article.

        Returns:
            The path of the written pickle, or None when the pickle already
            exists, the corpus yields no article for ``fileid`` or the article
            is not English.

        Raises:
            ValueError: when the destination's parent exists but is not a
                directory.
        """
        target = self.abspath(fileid)
        parent = os.path.dirname(target)

        # Validate before creating anything.
        if os.path.exists(parent) and not os.path.isdir(parent):
            raise ValueError("document path is not a directory")
        os.makedirs(parent, exist_ok=True)

        if os.path.exists(target):
            return None

        articles = self.corpus.articles(fileid)
        try:
            current_article = next(articles, None)
        finally:
            # Finalise the generator now rather than leaving it suspended
            # (and whatever resources it holds) until garbage collection.
            close = getattr(articles, 'close', None)
            if close is not None:
                close()

        if current_article is None:
            return None

        # Reuse the language label when the article already carries one.
        lang = current_article.get('lang') or detect_article_lang(current_article)
        if lang != 'en':
            return None

        document = { 'title': current_article['title'], 'doc': list(self.tokenize_article(current_article))}

        with open(target, 'wb') as f:
            pickle.dump(document, f, pickle.HIGHEST_PROTOCOL)

        return target

    def transform(self, fileids=None):
        """Process every selected file id, yielding the result of ``process`` for each."""
        os.makedirs(self.target, exist_ok=True)

        for fileid in self.fileids(fileids):
            yield self.process(fileid)

In [None]:
from tqdm import tqdm
from random import sample 

# instantiate preprocessor
preprocessor_ = Preprocessor(article_reader, './data/pickled/')

# sample files
all_fileids = article_reader.fileids()
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the corpus size
sampled_fileids = sample(all_fileids, min(3000, len(all_fileids)))

# apply preprocessor on sampled files using tqdm
# for n in tqdm(preprocessor_.transform(sampled_fileids), total=len(sampled_fileids)):
#     pass


In [None]:
# print(article_reader.abspath('0000028b5cc154f68b8a269f6578f21e31f62977.json'))
# next(preprocessor_.transform(['0000028b5cc154f68b8a269f6578f21e31f62977.json']))
# os.path.relpath(os.path.dirname(article_reader.abspath(article_reader.fileids()[0])), article_reader.root)
# article_reader.abspath('0000028b5cc154f68b8a269f6578f21e31f62977.json')
# [str(article) for article in preprocessor_.transform(article_reader.fileids())]
# article_reader.fileids()[0]

In [None]:
import pickle

PKL_PATTERN = r'.*\.pkl'

class PickledCorpusReader(ArticleCorpusReader):
    """Reader for pickled documents.

    Each pickle is expected to hold a dict with a ``'doc'`` entry: a list of
    paragraphs, each a list of sentences, each a list of tokens.
    """

    # constructor
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Args:
            root: directory holding the pickled documents.
            fileids: list of file ids or a regular expression selecting them.
            **kwargs: options forwarded to ``ArticleCorpusReader``.
        """
        # Spread the options; ``kwargs=kwargs`` would hide them all under a
        # single literal 'kwargs' key.
        ArticleCorpusReader.__init__(self, root, fileids, **kwargs)
        # Re-run the base initialiser with only root and fileids, as before.
        CorpusReader.__init__(self, root, fileids)

    def pickled_file(self, fileids):
        """Yield the unpickled content of every selected file."""
        for path in self.abspaths(fileids):
            with open(path, 'rb') as file:
                # SECURITY: unpickling can execute arbitrary code; only read
                # pickles produced by this pipeline.
                yield pickle.load(file)

    def docs(self, fileids=None):
        """Yield the ``'doc'`` entry of every selected pickle."""
        for pkl_file in self.pickled_file(fileids):
            yield pkl_file['doc']
    
    # yields paragraphs
    def paras(self, fileids=None):
        """Yield every paragraph of the selected documents."""
        for doc in self.docs(fileids):
            for para in doc:
                yield para

    # yields sentences
    def sents(self, fileids=None):
        """Yield every sentence of the selected documents."""
        for para in self.paras(fileids):
            for sent in para:
                yield sent
    
    # yields words
    def words(self, fileids=None):
        """Yield every token of the selected documents."""
        for sent in self.sents(fileids):
            for word in sent:
                yield word


In [None]:
import sys
sys.path.append(f'./../knn-cosine-similarity/')

import extended_similarities as sims
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from random import sample 

class CorpusProcessor(object):
    """TF-IDF -> PCA -> nearest-neighbour search over a pickled corpus."""

    def __init__(self, root, **kwargs):
        """
        Args:
            root: directory holding the pickled documents.
            **kwargs: ``n_samples`` (optional) - number of documents to draw
                at random; every document is used when omitted.

        Raises:
            ValueError: when ``n_samples`` exceeds the corpus size.
        """
        self.pkl_corpus_reader = PickledCorpusReader(root)

        # Fitted lazily by vectorize / pca_vectors / knn_vectors.
        self.vectorizer = None
        self.pca = None
        self.knn = None
        self.cos_knn = None

        all_fileids = self.pkl_corpus_reader.fileids()

        if 'n_samples' in kwargs:
            n_samples = int(kwargs['n_samples'])
            corpus_size = len(all_fileids)

            if n_samples > corpus_size:
                raise ValueError("Number of samples is higher than corpus size: " + str(corpus_size))
            self.corpus_fileids = sample(all_fileids, n_samples)
        else:
            # no sample size requested: work on the whole corpus
            self.corpus_fileids = list(all_fileids)

    def fileids(self):
        """Return every file id known to the underlying pickled reader."""
        return self.pkl_corpus_reader.fileids()

    def iloc(self, idx):
        """Return the pickled document at position ``idx`` of the selected file ids."""
        return next(self.pkl_corpus_reader.pickled_file([self.corpus_fileids[idx]]))

    def vectorize(self):
        """Fit the TF-IDF vectorizer on the selected documents and return the matrix."""
        self.vectorizer = TfidfVectorizer(max_features=2 ** 14)

        # one space-joined string of tokens per document
        docs_text = [
            ' '.join(self.pkl_corpus_reader.words(current_fileid))
            for current_fileid in self.corpus_fileids
        ]

        return self.vectorizer.fit_transform(docs_text)

    def pca_vectors(self):
        """Fit PCA (95% explained variance) on the TF-IDF vectors and return the reduced vectors."""
        self.pca = PCA(n_components=0.95, random_state=42)
        X = self.vectorize()
        # PCA is given a dense array here
        return self.pca.fit_transform(X.toarray())

    def knn_vectors(self):
        """Fit the nearest-neighbour index on the reduced vectors.

        Returns:
            Whatever ``NearestNeighbors.fit`` returns.
        """
        self.knn = NearestNeighbors(n_neighbors=2, algorithm='ball_tree', metric='minkowski')
        X_reduced = self.pca_vectors()
        return self.knn.fit(X_reduced)
    
    def cosine_knn_vectors(self):
        """Fit the distributed cosine k-NN and return ``(indices, distances)``."""
        self.cos_knn = sims.DistributedCosineKnn(k=3)
        X_reduced = self.pca_vectors()
        indices, distances = self.cos_knn.fit(input_data=X_reduced, n_bucket=7)

        return (indices, distances)

    def search(self, query):
        """Print and return the documents closest to ``query``.

        The models are fitted on first use when ``knn_vectors`` has not been
        called yet.

        Returns:
            ``(distances, indices)`` as returned by ``kneighbors``; at most
            five neighbours are requested.
        """
        if self.vectorizer is None or self.pca is None or self.knn is None:
            self.knn_vectors()

        # tokenize
        query_tok = spacy_tokenizer(query)
        # vectorize: the vectorizer treats every list item as a separate
        # document, so join the tokens into one string - the same shape the
        # corpus documents had when the vectorizer was fitted
        query_x = self.vectorizer.transform([' '.join(query_tok)])
        # reduce dimensionality
        query_reduced_x = self.pca.transform(query_x.toarray())
        # get nearest neighbors, never asking for more than the corpus holds
        n_neighbors = min(5, len(self.corpus_fileids))
        distances, indices = self.knn.kneighbors(query_reduced_x, n_neighbors=n_neighbors)

        # show results
        print("Tokens: ")
        print(query_tok)
        print(query_x)
        print('\nResults: ')
        for idx in indices[0]:
            print('\t' + self.iloc(idx)['title'] + '\n')

        return distances, indices


In [None]:
# Build the processor on a random sample of 6000 pickled documents.
# NOTE(review): the constructor raises ValueError when the pickled corpus holds
# fewer than 6000 files - confirm the corpus is large enough.
corpus_processor = CorpusProcessor(f'./data/pickled/', n_samples=6000)

# Fits TF-IDF, PCA and the k-NN index. The value stored in `indices` is what
# `NearestNeighbors.fit` returned - presumably the fitted estimator rather than
# neighbour indices; confirm before using it as indices.
indices = corpus_processor.knn_vectors()
# distances, indices = corpus_processor.cosine_knn_vectors()

In [None]:
# search_query = "neurological impact of the disease."
# search_query = "symptoms of covid."
# search_query = "politics of the lockdown during the pandemic."
# Free-text query; it is tokenized with spacy_tokenizer inside search().
search_query = "infection vectors of the virus."

# Prints the query tokens and the titles of the closest documents, and returns
# the k-NN distances and indices (rebinding `indices`).
distances, indices = corpus_processor.search(search_query)

In [None]:
# X_reduced = corpus_processor.pca_vectors()

# print(X_reduced[258])
# query_tok = spacy_tokenizer(search_query)
# query_x = corpus_processor.vectorizer.transform(query_tok)
# query_reduced_x = corpus_processor.pca.transform(query_x.toarray())
# print(query_tok)
# print(query_x)
# print(indices)