## Setup requirements

In [3]:
import pubmed_parser as pp
import pandas as pd
import nltk
nltk.download('punkt')
import numpy as np
from csv import DictWriter
import json
import math

[nltk_data] Downloading package punkt to /home/gitpod/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load the data

In [4]:
path_xml = pp.list_xml_path('../../data/raw') # list all xml paths under directory

Get the first set of data from the paths

In [5]:
pubmed_dict = pp.parse_medline_xml(path_xml[0]) # dictionary output

## Methods for analysis

In [6]:
def get_bag_of_words_from_corpus(corpus, stop_words=[], stemming=False):
    sno_stemmer = nltk.stem.SnowballStemmer('english')
    bag_of_words = []
    for doc in corpus:
        docWords = []
        for term in nltk.word_tokenize(doc['abstract']):
            if term.lower() not in stop_words:
                if stemming:
                    docWords.append(sno_stemmer.stem(term))
                else:
                    docWords.append(term)
        bag_of_words.append(
            {
                'nlm_unique_id': doc['nlm_unique_id'],
                'bag_of_words': docWords
            }
        )
    return bag_of_words

def get_all_terms_from_corpus(bag_of_words_corpus):
    all_terms = set()

    for doc in bag_of_words_corpus:
        all_terms.update(doc['bag_of_words'])
    return list(all_terms)

# This conversion to a dataframe causes some issues as the array isn't going through properly.
def write_dict_to_json(dict, file_name):
    f = open('../../data/processed/' + file_name + '.json', "w")
    json.dump(dict, f)
    f.close()

def read_dict_from_json(file_name):
    with open('../../data/processed/' + file_name + '.json') as json_file:
        return json.load(json_file)

def create_frequency_dict(all_terms, values):
    return dict(zip(all_terms, [values for _ in all_terms]))

def get_doc_frequencies(bag_of_words_corpus, all_terms):
    documents_frequencies = create_frequency_dict(all_terms, 0)

    for doc in bag_of_words_corpus:
        uniq_tokens = set(doc['bag_of_words'])
        for uniq_token in list(uniq_tokens):
            documents_frequencies[uniq_token] += 1

    return documents_frequencies

def get_term_frequencies(bag_of_words_corpus, terms):
    terms_frequencies = create_frequency_dict(terms, [])

    for doc in bag_of_words_corpus:
        uniq_tokens = set(doc['bag_of_words'])
        for uniq_token in uniq_tokens:
            frequency = { 'nlm_unique_id': doc['nlm_unique_id'], 'freq': doc['bag_of_words'].count(uniq_token) }
            if terms_frequencies[uniq_token]:
                terms_frequencies[uniq_token].append(frequency)
            else:
                terms_frequencies[uniq_token] = [frequency]

    return terms_frequencies

# Calculate the inverse document frequency
def get_idf(df, total_document_count):
    return math.log10(total_document_count/df)

# Calculate the tf-idf weighting
def get_tf_idf(tf, idf):
    return tf * idf

def get_doc_vector_lengths(abstract_bag_of_words, doc_frequencies, term_frequencies):
    vectorDocLengths = dict(zip([_['nlm_unique_id'] for _ in abstract_bag_of_words], [0 for _ in abstract_bag_of_words]))
    totalCorpusDocCount = len(vectorDocLengths)

    for doc in abstract_bag_of_words:
        selected_doc_id = doc['nlm_unique_id']
        uniq_tokens = set(doc['bag_of_words'])
        totalWf2 = 0
        for uniq_token in list(uniq_tokens):
            idf = get_idf(doc_frequencies[uniq_token], totalCorpusDocCount)
            tf = [termFreq for termFreq in term_frequencies[uniq_token] if termFreq['nlm_unique_id'] == selected_doc_id][0]['freq']

            weightingScheme = get_tf_idf(tf, idf)
            wf2 = weightingScheme ** 2
            totalWf2 += wf2

        # Add document vector length to it's document
        vectorDocLengths[selected_doc_id] = math.sqrt(totalWf2)

    print ("Generated document vector lengths for " + str(len(vectorDocLengths)) + " documents")

    return vectorDocLengths

## Run analysis and output to json files as data is processed

In [7]:
write_dict_to_json(get_bag_of_words_from_corpus(pubmed_dict), 'abstract_bag_of_words')

In [8]:
write_dict_to_json(get_all_terms_from_corpus(read_dict_from_json('abstract_bag_of_words')), 'all_terms')

In [9]:
write_dict_to_json(get_doc_frequencies(read_dict_from_json('abstract_bag_of_words'), read_dict_from_json('all_terms')), 'doc_frequencies')

In [10]:
write_dict_to_json(get_term_frequencies(read_dict_from_json('abstract_bag_of_words'), read_dict_from_json('all_terms')), 'term_frequencies')

In [11]:
write_dict_to_json(
    get_doc_vector_lengths(
        read_dict_from_json('abstract_bag_of_words'), 
        read_dict_from_json('doc_frequencies'),
        read_dict_from_json('term_frequencies')
    ), 
    'doc_vector_lengths')

Generated document vector lengths for 2408 documents


In [18]:
# Below is what the equivalent inputs are in our case
# term_frequencies, doc_frequencies, abstract_bag_of_words, query, doc_vector_lengths, false
def cosineScore(tfPostingList, dfPostingList, terms, query, docVectorLengths, alternativeWeighting=False):
    scores = dict(zip([_['nlm_unique_id'] for _ in terms], [0 for _ in terms]))
    totalCorpusDocCount = len(scores)

    modifiedQuery = [{'nlm_unique_id': 0, 'doc': query}]
    modifiedQueryTerms = get_bag_of_words_from_corpus(modifiedQuery)
    tf_tqs = get_term_frequencies(modifiedQuery, modifiedQueryTerms)

    for queryTerm in list(modifiedQueryTerms):
        if queryTerm in terms:
            tf_tq = tf_tqs[queryTerm][0]['freq']
            if alternativeWeighting:
                w_tq = tf_tq
            else:
                w_tq = getTfIdf(tf_tq, getIdf(dfPostingList[queryTerm], totalCorpusDocCount))
            for termDoc in tfPostingList[queryTerm]:
                if alternativeWeighting:
                    wf_td = termDoc['freq']
                else:
                    wf_td = getTfIdf(termDoc['freq'], getIdf(dfPostingList[queryTerm], totalCorpusDocCount))
                scores[termDoc['nlm_unique_id']] += wf_td * w_tq

    for documentID in list(scores.keys()):
        scores[documentID] = scores[documentID] / docVectorLengths[documentID]

    sorted_score = dict(sorted(scores.items(), key=lambda elem: elem[1], reverse=True))

    return sorted_score

# Below is what the equivalent inputs are in our case
# term_frequencies, doc_frequencies, abstract_bag_of_words, query, doc_vector_lengths, false
def getDocumentResultsForQueries(tfPostingList, dfPostingList, terms, queries, docVectorLengths, alternativeWeighting=False):
    documentResultsForQueries = []
    for query in queries:
        queryCosineScore = cosineScore(tfPostingList, dfPostingList, terms, query['query'], docVectorLengths, alternativeWeighting)
        documentIDs = list(queryCosineScore.keys())

        queryDocIDs = {
            'queryID': query['queryID'],
            'docIDs': documentIDs
        }
        documentResultsForQueries.append(queryDocIDs)

        # Use for debugging or seeing the actual query results
        # writeTopKResultsToFiles(query, queryCosineScore)

    return documentResultsForQueries

In [19]:

getDocumentResultsForQueries(
    read_dict_from_json('doc_frequencies'),
    read_dict_from_json('term_frequencies'),
    read_dict_from_json('abstract_bag_of_words'), 
    [{"query": "radiotherapy"}],
    read_dict_from_json('doc_vector_lengths')
)

KeyError: 'abstract'