##### This notebook consists of all the steps needed to preprocess the Nature dataset for frequent itemset mining of the citations

In [1]:
import scispacy
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import zipfile
import os
import xmltodict
import json
import zlib
import seaborn as sns
import resource
import sys
import regex as re
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
import nltk.data
import time
import json
from multiprocessing import Pool
from nltk.tokenize import word_tokenize
from numpy.testing import assert_array_equal
import threading
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from mlxtend.preprocessing import TransactionEncoder
from datasketch import MinHash, MinHashLSH
#nltk.download('punkt')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
def get_reference_mapping(filename, content):
    """
    Extract article metadata and the bibliography from one article XML file.

    The result is written as JSON to
    ``data/intermediate/references/<filename>.json`` in the form
    ``{'metadata': {...}, 'references': {bib_id: {...}}}``.

    Parameters
    ----------
    filename : str
        Name of the XML file; used to build the output file name.
    content : bytes
        UTF-8 encoded XML of the article.

    Returns
    -------
    bool
        True once the JSON file has been written.

    Raises
    ------
    KeyError
        If the parsed XML has no ``article`` root.
    TypeError
        If the extracted data cannot be serialised to JSON.
    """

    extracted  = {}
    references = {}

    parsed  = xmltodict.parse(content.decode('UTF-8'))
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed (lxml when it is available).
    soup    = BeautifulSoup(content, 'lxml')
    article = parsed['article']

    # Attributes of the <article> element itself.
    for attr in ('id', 'language', 'publish', 'relation'):
        if '@' + attr in article:
            extracted[attr] = str(article['@' + attr])

    # Publication front matter. Each field is guarded by its OWN presence
    # (the 'doi' field used to be guarded by 'vol', which raised KeyError for
    # articles that have a volume but no DOI and dropped the DOI otherwise).
    pubfm = article['pubfm'] if 'pubfm' in article else None
    if isinstance(pubfm, dict):
        for field in ('jtl', 'vol', 'issue', 'doi'):
            if field in pubfm:
                extracted[field] = str(pubfm[field])

    fm = article['fm'] if 'fm' in article else None
    if isinstance(fm, dict) and 'atl' in fm:
        extracted['title'] = str(fm['atl'])

    # The xmltodict tree is no longer needed; release it before walking the soup.
    del parsed, article, pubfm, fm

    # DEBUG is an optional module-level switch; treat it as off when it has
    # not been defined so the handler below cannot fail with a NameError.
    debug = globals().get('DEBUG', False)

    for bib in soup.find_all("bib"):

        try:
            reference_attr = {
                'title':   str(bib.atl.contents[0]),
                'snm':     str([i.contents[0] for i in bib.find_all('snm')]),
                'fnm':     str([i.contents[0] for i in bib.find_all('fnm')]),
                'journal': str(bib.jtl.contents[0]),
                'year':    str(bib.find_all('cd')[0].contents[0]),
            }
            references[bib.attrs['id']] = reference_attr

        except Exception:
            # Best effort: a <bib> entry missing any of the expected tags is
            # skipped rather than failing the whole article.
            if debug:
                print('='*50)
                print('Something is wrong with BeatifulSoup Tags: %s' % str(bib))
                # 'cd' is the tag the year is read from above.
                for i in ['snm', 'fnm', 'jtl', 'cd', 'atl']:
                    if len(bib.find_all(i)) == 0:
                        print('%s attribute is missing.' % i)

    # Serialise before opening the file so that a serialisation error does
    # not leave an empty output file behind.
    payload = json.dumps({'metadata': extracted, 'references': references})

    with open('data/intermediate/references/%s' % (filename + '.json'), 'w') as f:
        f.write(payload)

    return True
        
def get_reference_text(filename, content):
    """
    Pair every citation marker in an article with the text fragment it occurs in.

    The decoded XML is split into sentences with ``sent_detector``; sentences
    containing a ``<bibr rid`` marker are further split on ``</p>``. Every
    fragment is repeated once per marker it contains, so that the fragments
    line up positionally with the ``rid`` values found in the document.

    The result is written to
    ``data/intermediate/tex_ref_mappings/<filename>.json`` as a list of
    ``{fragment: rid.split(' ')}`` dictionaries.

    Parameters
    ----------
    filename : str
        Name of the XML file; used to build the output file name.
    content : bytes
        UTF-8 encoded XML of the article.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the number of extracted rids differs from the number of fragments.
    """

    content = content.decode('UTF-8')
    bibid   = re.findall(r'<bibr\srid=\"(.*?)\"\s*\/>', content)

    # Compiled once: the same pattern is applied to every sentence and fragment.
    marker = re.compile(r'<bibr\srid')

    fragments = []
    for sentence in sent_detector.tokenize(content):
        if marker.search(sentence):
            fragments.extend(sentence.split('</p>'))

    # One copy of the fragment per marker in it; fragments without a marker
    # contribute nothing.
    bibtext = []
    for fragment in fragments:
        bibtext.extend([fragment] * len(marker.findall(fragment)))

    # Raised explicitly instead of via `assert`, so the check is not stripped
    # when Python runs with -O.
    if len(bibid) != len(bibtext):
        raise AssertionError("The bibid's and preceeding text don't match: " +
                             "for article %s\n" % filename +
                             "bibid: %s\n" % bibid +
                             "bibtext: %s\n" % bibtext)

    # Serialise first so a failure does not leave an empty output file.
    payload = json.dumps([{text: rid.split(' ')} for rid, text in zip(bibid, bibtext)])

    with open('data/intermediate/tex_ref_mappings/%s' % (filename + '.json'), 'w') as f:
        f.write(payload)

    return


def get_zips(directory='data/nature/raw xml/'):
    """
    Process every article in the zip archives of ``directory`` sequentially.

    Archives whose name contains ``supp_xml`` are ignored, and only archive
    members whose name contains ``nature`` are processed. For each member,
    ``get_reference_text`` and ``get_reference_mapping`` are called and their
    return values recorded.

    Parameters
    ----------
    directory : str
        Folder holding the zip archives. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    tuple(dict, list)
        ``citations`` maps member file name to
        ``{'tex_ref_map': ..., 'ref_id_ref_map': ...}``; ``errored`` is a list
        of ``{'Filename', 'Zip', 'Content'}`` dictionaries for members that
        raised.
    """

    citations = {}
    errored   = []
    zip_list  = [i for i in os.listdir(directory) if 'supp_xml' not in i]

    for zip_name in tqdm(zip_list):

        # The context manager closes the archive once its members are processed.
        with zipfile.ZipFile(os.path.join(directory, zip_name)) as zfile:

            for finfo in zfile.infolist():

                if 'nature' not in finfo.filename:
                    continue

                # Pre-bind so the handler below can always refer to it, even
                # when opening or reading the member is what failed.
                content = None

                try:
                    with zfile.open(finfo) as ifile:
                        content = ifile.read()

                    tex_ref_map = get_reference_text(finfo.filename, content)
                    ref_id_ref_map = get_reference_mapping(finfo.filename, content)

                    citations[finfo.filename] = {'tex_ref_map': tex_ref_map,
                                                 'ref_id_ref_map': ref_id_ref_map}
                except Exception:
                    # Best effort: record the failure and move on to the next member.
                    print('Filename: %s, Zip:%s, Content:%s' % (finfo.filename,
                                                                zfile,
                                                                content))
                    errored.append({'Filename': finfo.filename,
                                    'Zip': zfile,
                                    'Content': content})

    return citations, errored
        
    
def get_zips_parallel_mapper(directory):
    """
    Collect the article files of every zip archive in ``directory``.

    Archives whose name contains ``supp_xml`` are ignored, and only archive
    members whose name contains ``nature`` are collected. Member contents are
    zlib-compressed before being stored.

    An archive that cannot be read is reported and skipped; members collected
    before the failure are kept. The function therefore always returns a
    2-tuple, which callers unpack directly.

    Parameters
    ----------
    directory : str
        Folder holding the zip archives.

    Returns
    -------
    tuple(list, list)
        ``finfos`` (``zipfile.ZipInfo`` objects) and ``contents``
        (zlib-compressed bytes), aligned by position.
    """

    finfos   = []
    contents = []
    zip_list = [i for i in os.listdir(directory) if 'supp_xml' not in i]

    for zip_name in tqdm(zip_list):

        zip_path = os.path.join(directory, zip_name)
        member   = None  # name of the member being read, for the error message

        try:
            with zipfile.ZipFile(zip_path) as zfile:

                for finfo in zfile.infolist():

                    if 'nature' not in finfo.filename:
                        continue

                    member = finfo.filename

                    with zfile.open(finfo) as ifile:
                        compressed = zlib.compress(ifile.read())

                    # Append both only after the read succeeded so the two
                    # lists can never get out of step.
                    finfos.append(finfo)
                    contents.append(compressed)

        except Exception as e:
            print('Could not read file from zip %s, file %s: %s' % (zip_path, member, e))

    return finfos, contents

        
        
def get_zips_parallel_reducer(arg):
    """
    Extract citation text and reference metadata for one collected article.

    Parameters
    ----------
    arg : tuple
        ``(finfo, compressed_content)``: a ``zipfile.ZipInfo`` and the
        zlib-compressed bytes of the file.

    Returns
    -------
    True or str
        True on success; otherwise a ``'<ExceptionType>: <message>'`` string
        describing the failure, so that one bad file does not abort a
        ``Pool.map`` run.
    """

    try:
        finfo   = arg[0]
        content = zlib.decompress(arg[1])
        del arg  # drop the reference to the compressed payload early

        get_reference_text(finfo.filename, content)
        get_reference_mapping(finfo.filename, content)

        return True

    except Exception as e:
        # Report the actual error; str(Exception) only gave the class repr.
        return '%s: %s' % (type(e).__name__, e)
    
    

## Step 1:
    Process and map the texts in an article to their references.
    
    Input: The Nature Dataset of XML Files.
    Output:
        - tex_ref_mappings: This file consists of the sentences associated with any given citation.
        - references: A list of JSONs, each item consisting of the references and reference details of a paper.
    

In [None]:
# NOTE(review): `directory` is not defined in the visible cells; it must be
# set beforehand (get_zips uses 'data/nature/raw xml/') — confirm.
finfos, contents = get_zips_parallel_mapper(directory)

# The context manager shuts the worker processes down once the map is done.
with Pool(6) as p:
    success = p.map(get_zips_parallel_reducer, zip(finfos, contents))

# Despite its name, `error_rate` holds the fraction of files that SUCCEEDED
# (the reducer returns True on success and an error string otherwise).
error_rate = sum(1 for i in success if i is True) / len(success) if success else 0.0
print ('%f of all processed files Succeeded' % error_rate)

## Step 2.1: 
    
    
    	Groups all referring sentences together
    	under the paper that they refer to,
    	in order to make downstream analysis easier.

In [2]:
def build_ref_index(tex_ref_mappings, reference):
    """
    Invert a list of ``{text: [ref_id, ...]}`` mappings.

    Only the first item of each mapping is used. ``reference`` is accepted
    but not used.

    Returns a dict mapping every ref id to the list of texts it appears with,
    in encounter order.
    """

    texts_by_ref = {}

    for mapping in tex_ref_mappings:
        sentence, ref_ids = list(mapping.items())[0]

        for ref_id in ref_ids:
            texts_by_ref.setdefault(ref_id, []).append(sentence)

    return texts_by_ref
    
def read_files():
    """
    Load the intermediate JSON files produced by Step 1.

    Every file in ``data/intermediate/tex_ref_mappings`` is paired with the
    file of the same name in ``data/intermediate/references``. Files without
    a references counterpart are reported and skipped.

    Returns
    -------
    tuple(list, list)
        ``tex_ref_mappings`` and ``references``, aligned by position.
    """

    directory        = 'data/intermediate/'
    tex_ref_dir      = os.path.join(directory, 'tex_ref_mappings')
    references_dir   = os.path.join(directory, 'references')
    tex_ref_mappings = []
    references       = []

    tex_ref_files    = os.listdir(tex_ref_dir)
    references_files = os.listdir(references_dir)

    if len(tex_ref_files) != len(references_files):
        print('Not all text files have a references counterpart.\n'
              'Continuing with the files that do have a mapping')

    for file in tqdm(tex_ref_files):

        with open(os.path.join(tex_ref_dir, file), 'r', encoding = 'UTF-8') as f0:

            try:
                with open(os.path.join(references_dir, file), 'r', encoding = 'UTF-8') as f1:
                    # json.loads() no longer accepts `encoding` (removed in
                    # Python 3.9); the files are already decoded by open().
                    tex_ref_mapping = json.load(f0)
                    reference       = json.load(f1)

            except IOError:
                # %s, not %f: `file` is a string and %f raised TypeError here.
                print('References file %s not found.' % file)
                continue

        # Append both together so the two lists stay aligned.
        tex_ref_mappings.append(tex_ref_mapping)
        references.append(reference)

    return tex_ref_mappings, references


def get_reference_map_0(nature2references, nature_references_info, doi):
    """
    Look up the references cited by the article with the given DOI.

    The article id is read from the ``ArticleID`` column of the row of
    ``nature_references_info`` whose index equals ``doi``. The rows of
    ``nature2references`` citing that article are then left-joined with
    ``nature_references_info`` on ``CitedArticleID == ArticleID``.

    On any error the DOI and the error are printed and an empty DataFrame
    is returned.
    """

    try:
        doi_rows   = nature_references_info[nature_references_info.index == doi]
        article_id = doi_rows['ArticleID'].values[0]

        is_citing  = nature2references['CitingArticleID'] == article_id
        references = nature2references[is_citing].merge(nature_references_info,
                                                        how = 'left',
                                                        left_on = 'CitedArticleID',
                                                        right_on = 'ArticleID')

    except Exception as e:

        print('doi: %s' % doi)
        print(e)
        references = pd.DataFrame()

    return references

def get_reference_map_1(get_reference_map_1_input):

    """
    Match the references parsed from an article's XML against the reference
    table, by TF-IDF similarity of their titles.

    Reads the module-level ``nature2references`` and
    ``nature_references_info`` frames, so they must exist before this
    function is called (or before worker processes are started).

    NOTE: Improvements Todo: Force 1-1 mapping, add year and journal to increase map accuracy

    Parameters
    ----------
    get_reference_map_1_input : tuple
        ``(reference, tex_ref_mapping)``: the parsed references JSON of one
        article and its text-to-reference mappings.

    Returns
    -------
    pandas.DataFrame
        One row per reference from the table, joined with the best-matching
        XML reference (columns ``title0`` / ``title1`` hold the two titles).
        An empty DataFrame when the article has no DOI or when either side
        has no references (this case used to return None implicitly).
    """

    reference, tex_ref_mapping = get_reference_map_1_input

    if 'doi' not in reference['metadata']:
        return pd.DataFrame()

    doi = reference['metadata']['doi']

    references_0 = get_reference_map_0(nature2references, nature_references_info, doi).fillna('')
    references_1 = pd.DataFrame.from_dict(reference['references'], orient = 'index').fillna('')

    if len(references_0) == 0 or len(references_1) == 0:
        return pd.DataFrame()

    # Attach to every XML reference the texts that cite it.
    references_1['text'] = references_1.index.map(build_ref_index(tex_ref_mapping, reference))

    titles_0 = list(references_0['Title'])
    titles_1 = list(references_1['title'])

    # Fit on both title sets so they share one vocabulary.
    vect = TfidfVectorizer(min_df=1, stop_words="english")
    vect.fit(titles_0 + titles_1)

    vect_0 = vect.transform(titles_0)
    vect_1 = vect.transform(titles_1)

    pairwise_similarity = vect_0 * vect_1.T

    # For each table reference, the position of the most similar XML reference.
    references_0['match_index'] = [i[0] for i in np.argmax(pairwise_similarity, axis = 1).tolist()]

    # reset_index() gives references_1 a positional index to join on.
    test_match = pd.merge(references_0,
                          references_1.reset_index(),
                          left_on = 'match_index',
                          right_index = True, how = 'left').rename(columns = {'Title': 'title0',
                                                                             'title': 'title1'})

    return test_match
    
def flatten(l):
    """
    Flatten one level of nesting.

    Only elements whose type is exactly ``list`` contribute their items;
    every other element is dropped.
    """
    return [item for sublist in l if type(sublist) is list for item in sublist]



In [None]:
tex_ref_mappings, references = read_files()

# get_reference_map_1 reads these two frames as globals, so they are loaded
# before the worker pool is created.
nature_references_info = pd.read_hdf('data/nature/preprocessed/NatureReferencesInformation.hdf').set_index('DOI')
nature2references      = pd.read_hdf('data/nature/preprocessed/Nature2References.hdf')

mp_input = zip(references, tex_ref_mappings)

# The context manager shuts the worker processes down once the map is done.
with Pool(8) as p:
    matched_dfs = p.map(get_reference_map_1, mp_input)

matched_dfs = pd.concat(matched_dfs)

# One row per cited article, holding the list of 'text' values of its matches.
matched_dfs = pd.DataFrame(matched_dfs.groupby('ArticleID')\
                                 .apply(lambda x: list(x['text'].values))).rename(columns = {0: 'text'})

# flatten() keeps only list entries; the remaining markup is then stripped.
matched_dfs['clean_text'] = matched_dfs['text'].apply(flatten)
matched_dfs['clean_text'] = matched_dfs['clean_text'].apply(lambda x: [BeautifulSoup(i, "lxml").text for i in x])
matched_dfs['count'] = matched_dfs['clean_text'].apply(len)

# Drop articles left without any citing text.
matched_dfs = matched_dfs[matched_dfs['count'] != 0]
matched_dfs.to_csv('data/intermediate/matched_dfs.csv')

## Step 2.2:
    
    Create a file of newline-separated JSONs, with each JSON consisting of all the parsed
	information of a given paper.
    
    NOTE: This module overrides some of the functions described above

In [None]:
def get_text_mapping(filename, content):
    """
    Extract the body text of one article and store it with its ``papers`` entry.

    The article's DOI is read from the XML and looked up in the module-level
    ``papers`` dict. The result is written as JSON to
    ``data/intermediate/full_text_mappings/<filename>.json`` in the form
    ``{'metadata': papers[doi], 'text': <concatenated <bdy> text>}``.

    Parameters
    ----------
    filename : str
        Name of the XML file; used to build the output file name.
    content : bytes
        UTF-8 encoded XML of the article.

    Returns
    -------
    bool
        True once the JSON file has been written.

    Raises
    ------
    KeyError
        If the article has no DOI or its DOI is not a key of ``papers``.
    TypeError
        If the data cannot be serialised to JSON.
    """

    extracted = {}

    parsed  = xmltodict.parse(content.decode('UTF-8'))
    article = parsed['article']

    # Attributes of the <article> element itself.
    for attr in ('id', 'language', 'publish', 'relation'):
        if '@' + attr in article:
            extracted[attr] = str(article['@' + attr])

    # Publication front matter. Each field is guarded by its OWN presence
    # (the 'doi' field used to be guarded by 'vol').
    pubfm = article['pubfm'] if 'pubfm' in article else None
    if isinstance(pubfm, dict):
        for field in ('jtl', 'vol', 'issue', 'doi'):
            if field in pubfm:
                extracted[field] = str(pubfm[field])

    fm = article['fm'] if 'fm' in article else None
    if isinstance(fm, dict) and 'atl' in fm:
        extracted['title'] = str(fm['atl'])

    del parsed, article, pubfm, fm

    # Look the paper up BEFORE building the soup: articles that are not in
    # `papers` fail here (KeyError, as before) without paying for a full
    # parse of the document.
    paper = papers[extracted['doi']]

    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed (lxml when it is available).
    soup = BeautifulSoup(content, 'lxml')
    text = ''.join(body.text for body in soup.find_all("bdy"))

    # Serialise before opening the file so that a serialisation error does
    # not leave an empty output file behind.
    payload = json.dumps({'metadata': paper, 'text': text})

    with open('data/intermediate/full_text_mappings/%s' % (filename + '.json'), 'w') as f:
        f.write(payload)

    return True
        
        
def get_zips_parallel_reducer(arg):
    """
    Extract the full text of one collected article via ``get_text_mapping``.

    Parameters
    ----------
    arg : tuple
        ``(finfo, compressed_content)``: a ``zipfile.ZipInfo`` and the
        zlib-compressed bytes of the file.

    Returns
    -------
    True or str
        True on success; otherwise a ``'<ExceptionType>: <message>'`` string
        describing the failure, so that one bad file does not abort a
        ``Pool.map`` run.
    """

    # Bound before the try block so the handler can always refer to it
    # (it used to be unbound when decompression was what failed).
    finfo = arg[0]

    try:
        content = zlib.decompress(arg[1])
        del arg  # drop the reference to the compressed payload early

        get_text_mapping(finfo.filename, content)

        return True

    except Exception as e:

        print('Could not extract references from %s' % (finfo,))
        # Report the actual error; str(Exception) only gave the class repr.
        return '%s: %s' % (type(e).__name__, e)

    
def get_zips_parallel_mapper(directory):
    """
    Collect the article files of every zip archive in ``directory``.

    Archives whose name contains ``supp_xml`` are ignored, and only archive
    members whose name contains ``nature`` are collected. Member contents are
    zlib-compressed before being stored.

    An archive that cannot be read is reported and skipped; members collected
    before the failure are kept. The function therefore always returns a
    2-tuple, which callers unpack directly.

    Parameters
    ----------
    directory : str
        Folder holding the zip archives.

    Returns
    -------
    tuple(list, list)
        ``finfos`` (``zipfile.ZipInfo`` objects) and ``contents``
        (zlib-compressed bytes), aligned by position.
    """

    finfos   = []
    contents = []
    zip_list = [i for i in os.listdir(directory) if 'supp_xml' not in i]

    for zip_name in tqdm(zip_list):

        zip_path = os.path.join(directory, zip_name)
        member   = None  # name of the member being read, for the error message

        try:
            with zipfile.ZipFile(zip_path) as zfile:

                for finfo in zfile.infolist():

                    if 'nature' not in finfo.filename:
                        continue

                    member = finfo.filename

                    with zfile.open(finfo) as ifile:
                        compressed = zlib.compress(ifile.read())

                    # Append both only after the read succeeded so the two
                    # lists can never get out of step.
                    finfos.append(finfo)
                    contents.append(compressed)

        except Exception as e:
            print('Could not read file from zip %s, file %s: %s' % (zip_path, member, e))

    return finfos, contents


def onur_json(tex_ref_mappings, references):
    """
    Attach citing text to the reference entries of every paper.

    ``tex_ref_mappings[i]`` is the list of ``{text: [ref_id, ...]}`` mappings
    of the paper described by ``references[i]``. For every ref id, the
    ``'text'`` key of ``references[i]['references'][ref_id]`` is set to the
    text (the entry is created when missing). When a ref id occurs in several
    mappings, the last text wins. Papers without a ``'references'`` key are
    left untouched.

    Note that only the outer list is copied: the per-paper dictionaries are
    shared with ``references`` and are therefore updated in place.

    Returns
    -------
    list
        The (shallow) copy of ``references`` whose entries were updated.
    """

    references_w_text = references.copy()

    for i, paper_entry in enumerate(tqdm(references_w_text)):

        if 'references' not in paper_entry:
            continue

        paper_refs = paper_entry['references']

        for tex_ref_mapping in tex_ref_mappings[i]:

            # Each mapping holds a single {text: [ref_id, ...]} item.
            text, ref_ids = list(tex_ref_mapping.items())[0]

            for ref_id in ref_ids:
                paper_refs.setdefault(ref_id, {})['text'] = text

    # Return the structure that was built (previously the input list was
    # returned, which only worked because the copy above is shallow).
    return references_w_text

def get_zips_parallel_reducer(arg):
    """
    Extract citation text and reference metadata for one collected article.

    Parameters
    ----------
    arg : tuple
        ``(finfo, compressed_content)``: a ``zipfile.ZipInfo`` and the
        zlib-compressed bytes of the file.

    Returns
    -------
    True or str
        True on success; otherwise a ``'<ExceptionType>: <message>'`` string
        describing the failure, so that one bad file does not abort a
        ``Pool.map`` run.
    """

    try:
        finfo   = arg[0]
        content = zlib.decompress(arg[1])
        del arg  # drop the reference to the compressed payload early

        get_reference_text(finfo.filename, content)
        get_reference_mapping(finfo.filename, content)

        return True

    except Exception as e:
        # Report the actual error; str(Exception) only gave the class repr.
        return '%s: %s' % (type(e).__name__, e)
    
    
def flatten(l):
    """
    Concatenate the elements of ``l`` that are exactly of type ``list``.

    Elements of any other type are ignored.
    """
    collected = []
    for candidate in l:
        if type(candidate) != list:
            continue
        collected.extend(candidate)
    return collected


def combine_jsons(directory = 'data/intermediate/full_text_mappings',
                  target = 'data/intermediate/matched_full_text.jsons'):
    """
    Concatenate every file in ``directory`` into ``target``.

    The content of each file is written followed by a newline, in the order
    returned by ``os.listdir``. Returns True.
    """

    with open(target, 'w') as f_to:

        for file in tqdm(os.listdir(directory)):
            source_path = os.path.join(directory, file)

            with open(source_path, 'r') as f_from:
                record = f_from.read()

            f_to.write(record + '\n')

    return True

In [None]:
# Expose the index of nature_references_info (set to the 'DOI' column when
# the frame was loaded) as a regular 'DOI' column as well.
nature_references_info['DOI'] = nature_references_info.index
nature_references_info.head()

# Group matched_dfs by the first level of its index and store, per index
# value, the group's rows as a list of record dicts.
# NOTE(review): if matched_dfs still comes from the Step 2.1 cell, its index
# is ArticleID, while later cells look `papers` up by DOI — confirm the keys.
papers = {k: g.to_dict(orient='records') for k, g in matched_dfs.groupby(level=0)}

In [None]:
# For every parsed article whose DOI is a key of `papers`, wrap the existing
# entry as {'metadata': <article metadata>, 'references': <previous entry>}.
# The entry is replaced in place, so running this cell a second time nests
# the previous result one level deeper.
for reference in references: 
    
    if 'doi' in reference['metadata']:
        
        if reference['metadata']['doi'] in papers:
            
            papers[reference['metadata']['doi']] = {'metadata' : reference['metadata'],
                                                'references': papers[reference['metadata']['doi']]}

In [None]:
%%time

mp_input = zip(references, tex_ref_mappings)

# NOTE(review): get_reference_map_onur is not defined in the visible part of
# the notebook — confirm it exists before running this cell.
# The context manager shuts the worker processes down once the map is done.
with Pool(8) as p:
    matched_dfs = p.map(get_reference_map_onur, mp_input)

In [None]:
# Write one {key: paper} JSON object per line. The newline follows each
# record (as in combine_jsons), so the file does not start with an empty
# line and ends with a newline.
with open('data/intermediate/matched.jsons', 'w') as f:

    for index, paper in papers.items():

        f.write(json.dumps({index: paper}))
        f.write('\n')

In [None]:
%time
# NOTE(review): `directory` is not defined in the visible cells; it must be
# set beforehand (get_zips uses 'data/nature/raw xml/') — confirm.
finfos, contents = get_zips_parallel_mapper(directory)

# The context manager shuts the worker processes down once the map is done.
with Pool(7) as p:
    success = p.map(get_zips_parallel_reducer, zip(finfos, contents))

# Despite its name, `error_rate` holds the fraction of files that SUCCEEDED
# (the reducer returns True on success and an error string otherwise).
error_rate = sum(1 for i in success if i is True) / len(success) if success else 0.0
print ('%f of all processed files Succeeded' % error_rate)

In [None]:
# Concatenate the per-article files using combine_jsons' default source
# directory and target file.
combine_jsons()