Original raw data repo: https://github.com/Franck-Dernoncourt/pubmed-rct

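First, download the raw data from [here](https://github.com/Franck-Dernoncourt/pubmed-rct/raw/master/PubMed_200k_RCT/train.7z) and extract the archive into the same folder as this notebook. The loading cell below opens `train.txt` through an absolute path, so adjust that path to wherever you extracted the file.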

# Input and preproc

In [None]:
# Parse the PubMed RCT training file.
#   abstracts: PMID (str) -> list of sentences, each sentence a list of tokens
#   sentences: flat list of every tokenized sentence, in file order
abstracts = dict()
roles = ['OBJECTIVE', 'BACKGROUND', 'METHODS', 'CONCLUSIONS', 'RESULTS', 'METHODS/DESIGN', 'DISCUSSION',
         'TRIAL REGISTRATION', 'SUMMARY']
# Set copy of `roles` so the per-token filter below is an O(1) lookup.
roleSet = set(roles)

sentences = list()

# NOTE(review): machine-specific absolute path; point it at your local train.txt.
# The encoding is given explicitly so the result does not depend on the
# platform default (assumes the corpus file is UTF-8 -- TODO confirm).
with open(r'D:\nlpHW\project\train.txt', 'r', encoding='utf-8') as fh:

    tmpSents = list()
    tmpId = ''

    # Iterate the file lazily instead of loading every line with readlines().
    for line in fh:

        if line.startswith('###'):
            # A '###<pmid>' header starts a new abstract: store the one that
            # was being collected (if any) and start a fresh sentence list.
            if tmpId:
                abstracts[tmpId] = tmpSents
            tmpId = line.strip().replace('###', '')
            tmpSents = list()
            continue

        # Drop tokens that are section-role labels; str.split() already
        # removes surrounding whitespace from every token.
        tokens = [token for token in line.split() if token not in roleSet]
        if not tokens:
            # Blank separator lines would otherwise become empty sentences.
            continue

        sentences.append(tokens)
        # Independent copy, so editing one collection never affects the other.
        tmpSents.append(list(tokens))

    # The final abstract has no following header to trigger the store above.
    if tmpId:
        abstracts[tmpId] = tmpSents

In [None]:
# All PMIDs, in the order the abstracts were inserted into the dict.
pmids = list(abstracts)

In [None]:
# Peek at the first PMID in the list.
pmids[0]

In [None]:
# Total number of tokenized sentence lists collected while parsing.
len(sentences)

In [None]:
# Number of distinct PMIDs stored in the abstracts dict.
len(abstracts)

In [None]:
# Inspect the tokenized sentences stored for one specific PMID.
abstracts['24491034']

Visualize the distribution of abstract lengths (number of sentences per abstract).

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Number of sentences in each abstract (the PMID keys are not needed here).
absLengths = [len(sentList) for sentList in abstracts.values()]

In [None]:
# Histogram of sentences-per-abstract, using matplotlib's default binning.
plt.hist(absLengths)
plt.show()

In [None]:
import numpy as np

# Average number of sentences per abstract.
np.mean(absLengths)

# Save abstracts

In [None]:
import pickle

# The cells below refer to the parsed corpus as `abstractsCorpus`.
# NOTE(review): that name is not assigned anywhere in the visible notebook;
# it is aliased to `abstracts` here because the later cells treat it as a
# PMID -> list-of-token-lists mapping -- confirm this is the intended object.
abstractsCorpus = abstracts

with open(r'abstractsCorpus.pkl', 'wb') as fh:
    # pickle.dump needs the destination file object as its second argument;
    # calling it with only the object raises TypeError.
    pickle.dump(abstractsCorpus, fh)

# Get the longest training data instance

In [None]:
from itertools import chain

# Token count of the longest abstract: sum the sentence lengths per abstract
# and take the maximum over the whole corpus.
max(sum(len(sent) for sent in abstractsCorpus[pmid]) for pmid in abstractsCorpus)

# Create a word2vec model from only a subset of the corpus (the first 10,000 abstracts); the full corpus can't fit in memory because it has too many unique words

In [None]:
# Flatten the first 10,000 abstracts into a single list of tokenized sentences.
sentences = [
    sent
    for abstract in list(abstractsCorpus.values())[:10000]
    for sent in abstract
]

In [None]:
# Number of sentences in the 10,000-abstract subset.
len(sentences)

In [None]:
# Progress-bar wrapper used for the long-running loops below.
# NOTE(review): newer tqdm releases deprecate `tqdm_notebook` in favour of
# `tqdm.notebook.tqdm` -- confirm against the installed version.
from tqdm import tqdm_notebook
from itertools import chain

import nltk
# Download the WordNet data that WordNetLemmatizer relies on.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

# Build the word2vec training data: one normalised token list per sentence.
# Tokens are stripped of surrounding punctuation and lower-cased BEFORE
# lemmatisation, because the lemmatizer looks up the string it is given; a
# capitalised or punctuated form would not match its dictionary entry and
# would pass through unlemmatised. Tokens that are pure punctuation end up
# empty after stripping and are dropped instead of being kept as ''.
sents = list()
bar = tqdm_notebook(sentences)
for sentence in bar:
    cleaned = list()
    for word in sentence:
        token = word.strip(r'.,:-?_!])').lower()
        if token:
            cleaned.append(wordnet_lemmatizer.lemmatize(token))
    # Appended even when empty so that len(sents) == len(sentences).
    sents.append(cleaned)

In [None]:
# Number of normalised sentences prepared for training.
len(sents)

# Longest abstract

In [None]:
# Token count of the longest abstract among the first 10,000 PMIDs.
max(sum(len(sent) for sent in abstractsCorpus[pmid]) for pmid in list(abstractsCorpus)[:10000])

# The w2v on corpus (abstracts)

In [None]:
import gensim

In [None]:
# Train word2vec on the normalised sentences. min_count=1 keeps every token
# in the vocabulary, including tokens that occur only once.
model = gensim.models.Word2Vec(sents, min_count=1)
# Persist the trained model to the working directory.
model.save('PubMed_200k_RCT_model_10000')
print('PubMed 200k RCT corpus model saved.')

In [None]:
# Reload the model from the file written by the training cell.
model = gensim.models.Word2Vec.load('PubMed_200k_RCT_model_10000')

In [None]:
# Sanity check: nearest neighbours of 'result' in the embedding space.
# Similarity queries live on the model's KeyedVectors (`model.wv`); calling
# `most_similar` directly on the Word2Vec object is deprecated and no longer
# available in gensim 4.
print(model.wv.most_similar('result'))

# Fetching the titles for the abstracts
## (based on the pmids)

In [None]:
from Bio import Entrez

In [None]:
# Contact address sent to NCBI with every Entrez request (Biopython warns when
# it is unset). Replace it with your own address before running.
Entrez.email = 'd.dejan.djukic@gmail.com'

In [None]:
def chunks(l, n):
    """Lazily split the sequence ``l`` into consecutive slices of length ``n``.

    Every slice has ``n`` items except possibly the last, which holds the
    remainder. Nothing is yielded for an empty sequence.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [None]:
from tqdm import tqdm_notebook

# Fetch article titles from PubMed for every PMID, one batch per request.
#   titles:   PMID (str) -> article title as returned by Entrez
#   notFound: PMIDs for which no title could be obtained
titles = dict()
notFound = list()

bar = tqdm_notebook(list(chunks(pmids, 10000)))

for each in bar:

    try:
        # The request itself is inside the try so that a network or parsing
        # failure affects only this batch instead of aborting the whole run.
        handle = Entrez.efetch(db="pubmed", id=each, retmode = 'xml')
        try:
            rec = Entrez.read(handle)
        finally:
            handle.close()
        articles = rec['PubmedArticle']
    except Exception as err:
        # Best-effort: report the failure and mark every PMID of the batch
        # as unresolved (previously a stale or undefined `pmid` was recorded).
        print(f'fetch failed for a batch of {len(each)} pmids: {err!r}')
        notFound.extend(each)
    else:
        bar2 = tqdm_notebook(articles)
        for record in bar2:

            try:
                citation = record["MedlineCitation"]
                # Key by the PMID carried in the record itself rather than by
                # position in the request: if any requested id has no record
                # in the response, positional pairing would attach titles to
                # the wrong PMIDs.
                pmid = str(citation["PMID"])
                title = citation["Article"]["ArticleTitle"]
            except (KeyError, TypeError):
                # Record lacks the expected fields; its PMID is picked up by
                # the notFound computation below.
                continue

            if title:
                titles[pmid] = title

        # Anything requested in this batch that did not yield a title.
        notFound.extend(pmid for pmid in each if pmid not in titles)

    print(f'current size of the titles dict: {len(titles)}')
    print(f'titles not found for: {notFound}')

# Save titles

In [None]:
# Persist the PMID -> title mapping next to the notebook.
with open('titlesAbstracts.pkl', 'wb') as fh:
    # pickle.dump needs the destination file object as its second argument;
    # calling it with only the object raises TypeError.
    pickle.dump(titles, fh)