In [None]:
import copy
import json
from collections import Counter
from csv import DictWriter

import nltk
import numpy as np
import pandas as pd
import pubmed_parser as pp

In [None]:
# Input discovery: later cells index this result (path_xml[0]) and iterate over it.
path_xml = pp.list_xml_path('../data/raw') # list all xml paths under directory

In [None]:
# Parse only the first XML file found under ../data/raw.
# NOTE(review): despite the name, later cells iterate this value and read
# doc['abstract'] / doc['nlm_unique_id'] from each item, so it is presumably
# a list of per-article dicts rather than a single dict — confirm.
pubmed_dict = pp.parse_medline_xml(path_xml[0])

In [None]:
# Preview only the first few parsed records; displaying the whole parse
# result floods the cell output and bloats the saved notebook.
# NOTE(review): assumes the parse result supports slicing (list-like) — confirm.
pubmed_dict[:3]

In [None]:
def get_bag_of_words_from_corpus(corpus, stop_words=None, stemming=False):
    """Tokenize each document's abstract into a list of terms.

    Args:
        corpus: Iterable of dict-like documents; each must provide the
            'abstract' and 'nlm_unique_id' keys.
        stop_words: Optional iterable of terms to drop. A token is dropped
            when its lower-cased form is in this collection, so entries
            should be lower-case. Defaults to no stop words.
        stemming: When True, store the Snowball ('english') stem of each
            kept token instead of the token itself.

    Returns:
        A list with one dict per document, in corpus order:
        {'nlm_unique_id': <id>, 'bag_of_words': [<term>, ...]}.
    """
    # A frozenset gives O(1) membership tests and replaces the mutable
    # default argument the function used to declare.
    excluded = frozenset(stop_words) if stop_words is not None else frozenset()
    # Only build the stemmer when it is actually going to be used.
    stemmer = nltk.stem.SnowballStemmer('english') if stemming else None

    bag_of_words = []
    for doc in corpus:
        kept_terms = [
            term
            for term in nltk.word_tokenize(doc['abstract'])
            if term.lower() not in excluded
        ]
        if stemmer is not None:
            kept_terms = [stemmer.stem(term) for term in kept_terms]
        bag_of_words.append(
            {
                'nlm_unique_id': doc['nlm_unique_id'],
                'bag_of_words': kept_terms,
            }
        )
    return bag_of_words

def get_all_terms_from_corpus(bag_of_words_corpus):
    """Collect the distinct terms that occur anywhere in the corpus.

    Args:
        bag_of_words_corpus: Iterable of dicts, each with a 'bag_of_words'
            key holding that document's list of terms.

    Returns:
        A sorted list of the unique terms across all documents.
    """
    all_terms = set()

    for doc in bag_of_words_corpus:
        all_terms.update(doc['bag_of_words'])
    # Sort so the result (and the JSON written from it) is reproducible;
    # raw set order for strings can change from one interpreter run to the next.
    return sorted(all_terms)

# NOTE: converting this data to a DataFrame causes issues because the array does not come through properly.
def write_dict_to_json(dict, file_name):
    """Serialize an object as JSON to ../data/processed/<file_name>.json.

    Args:
        dict: The JSON-serializable object to write. The parameter name
            shadows the builtin inside this function; it is kept so that
            existing keyword callers continue to work.
        file_name: Base name of the output file, without the '.json' suffix.

    Raises:
        TypeError: If the object contains values json cannot serialize.
        OSError: If the output file cannot be opened for writing.
    """
    # The context manager closes the file even when json.dump raises.
    with open('../data/processed/' + file_name + '.json', "w") as f:
        json.dump(dict, f)

def read_dict_from_json(file_name):
    """Load and return the parsed contents of ../data/processed/<file_name>.json.

    Args:
        file_name: Base name of the file to read, without the '.json' suffix.

    Returns:
        Whatever object the JSON document decodes to.
    """
    source_path = '../data/processed/' + file_name + '.json'
    with open(source_path) as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
# Tokenize every parsed abstract with the function's defaults (no stop-word
# removal, no stemming) and checkpoint the result as
# ../data/processed/abstract_bag_of_words.json.
write_dict_to_json(get_bag_of_words_from_corpus(pubmed_dict), 'abstract_bag_of_words')

In [None]:
# Reload the saved bags of words from disk, collect the distinct terms, and
# checkpoint them as ../data/processed/all_terms.json.
write_dict_to_json(get_all_terms_from_corpus(read_dict_from_json('abstract_bag_of_words')), 'all_terms')

In [None]:
def create_frequency_dict(all_terms, values):
    """Build a dict that maps every term to its own copy of an initial value.

    Args:
        all_terms: Iterable of hashable terms to use as keys. One-shot
            iterators are supported.
        values: Initial value for every key. Each key receives an
            independent deep copy, so a mutable seed such as [] is not
            shared between keys.

    Returns:
        A dict {term: copy of values} with one entry per distinct term.
    """
    # The comprehension walks all_terms exactly once, so an iterator is not
    # exhausted before its keys are read.
    return {term: copy.deepcopy(values) for term in all_terms}

def get_doc_frequencies(bag_of_words_corpus, all_terms):
    """Count, for every term, how many documents contain it at least once.

    Args:
        bag_of_words_corpus: Iterable of dicts, each with a 'bag_of_words'
            list of terms.
        all_terms: The terms to report on; every term found in the corpus
            must be present here, otherwise a KeyError is raised.

    Returns:
        A dict {term: number of documents containing the term}.
    """
    doc_counts = create_frequency_dict(all_terms, 0)

    for entry in bag_of_words_corpus:
        # De-duplicate first so a document adds at most 1 per term.
        for token in set(entry['bag_of_words']):
            doc_counts[token] = doc_counts[token] + 1

    return doc_counts

def get_term_frequencies(bag_of_words_corpus, terms):
    """List, for every term, its occurrence count in each document containing it.

    Args:
        bag_of_words_corpus: Iterable of dicts, each with 'nlm_unique_id'
            and a 'bag_of_words' list of terms.
        terms: The terms to report on; every term found in the corpus must
            be present here, otherwise a KeyError is raised.

    Returns:
        A dict {term: [{'nlm_unique_id': <id>, 'freq': <count>}, ...]} with
        the per-document entries in corpus order. Terms that occur in no
        document map to an empty list.
    """
    # Give every term its own list up front so appending to one term can
    # never leak into another.
    terms_frequencies = {term: [] for term in terms}

    for doc in bag_of_words_corpus:
        # Counter tallies the whole document in one pass; calling list.count
        # once per unique token was quadratic in document length.
        for token, freq in Counter(doc['bag_of_words']).items():
            terms_frequencies[token].append(
                {'nlm_unique_id': doc['nlm_unique_id'], 'freq': freq}
            )

    return terms_frequencies

In [None]:
# Reload the saved bags of words and term list from disk, count how many
# documents contain each term, and checkpoint the counts as
# ../data/processed/doc_frequencies.json.
write_dict_to_json(get_doc_frequencies(read_dict_from_json('abstract_bag_of_words'), read_dict_from_json('all_terms')), 'doc_frequencies')

In [None]:
# Reload the same two checkpoints, build the per-term list of
# {'nlm_unique_id', 'freq'} entries, and checkpoint it as
# ../data/processed/term_frequencies.json.
write_dict_to_json(get_term_frequencies(read_dict_from_json('abstract_bag_of_words'), read_dict_from_json('all_terms')), 'term_frequencies')

In [None]:
# TODO: Implement document vector lengths.

# Let's find which files are missing abstracts.

In [None]:

# For every XML file, record how many non-blank values each parsed column has
# and append one summary row per file to 00_available_data.csv.
# NOTE(review): get_total_words_from_column is not defined in any visible cell
# of this notebook; it must be defined or imported before this cell runs — confirm.
for path in path_xml:
    # Use a loop-local name so the `pubmed_dict` parsed from path_xml[0] in an
    # earlier cell is not silently replaced by the last file processed here.
    path_records = pp.parse_medline_xml(path)
    df = pd.DataFrame(path_records)
    total = len(df)
    # Blank / whitespace-only cells become NaN so that count() skips them.
    availableDf = df.replace(r'^\s*$', np.nan, regex=True).count().to_dict()
    availableDf["total words"] = get_total_words_from_column(df["abstract"])
    availableDf["path"] = path
    availableDf["total"] = total
    print(availableDf)

    # newline='' is what the csv module expects of files it writes to;
    # without it, extra blank rows can appear on some platforms.
    with open('00_available_data.csv', 'a', newline='') as file_obj:
        dw_obj = DictWriter(file_obj, fieldnames=list(availableDf))
        # Only a brand-new (still empty) file gets the header row.
        if file_obj.tell() == 0:
            dw_obj.writeheader()
        dw_obj.writerow(availableDf)