Original raw data repo: https://github.com/Franck-Dernoncourt/pubmed-rct

The raw data (from [here](https://github.com/Franck-Dernoncourt/pubmed-rct/raw/master/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt)) is already included, as it is only 30 MB.

# Input and preprocessing

In [None]:
import string

# Parse the PubMed RCT training file into two structures:
#   abstracts: abstract id (the text after '###') -> list of tokenised sentences
#   sentences: flat list of every tokenised sentence in the file
abstracts = dict()
roles = ['OBJECTIVE', 'BACKGROUND', 'METHODS', 'CONCLUSIONS', 'RESULTS', 'METHODS/DESIGN', 'DISCUSSION',
         'TRIAL REGISTRATION', 'SUMMARY']

sentences = list()

# Punctuation tokens to drop; '.' and ',' are deliberately kept.
# Built once here instead of once per token inside the loop.
droppedPunctuation = set(string.punctuation).difference(set('.,'))

with open(r'train_20k_ATsigns.txt', 'r') as fh:

    tmpSents = list()
    tmpId = ''

    for line in fh:

        if line.startswith('###'):
            # A '###<id>' header opens a new abstract: store the one just finished.
            if tmpId:
                abstracts[tmpId] = tmpSents
            tmpId = line.strip().replace('###', '')
            tmpSents = list()
            continue

        if not line.strip():
            # Blank separator lines carry no sentence; recording them would add
            # an empty sentence to every abstract and inflate the length stats.
            continue

        tokens = [token.strip() for token in line.split()
                  if token not in roles and token not in droppedPunctuation]
        # Separate list objects, so editing one collection never affects the other.
        sentences.append(tokens)
        tmpSents.append(list(tokens))

    # The file does not end with a header, so the final abstract has to be
    # stored explicitly; previously it was silently dropped.
    if tmpId:
        abstracts[tmpId] = tmpSents

In [None]:
# Abstract ids in the order they were read from the file.
pmids = list(abstracts)

In [None]:
# Display the first abstract id as a quick sanity check of the parse.
pmids[0]

In [None]:
# Total number of sentences collected across all abstracts.
len(sentences)

In [None]:
# Number of abstracts stored in the dict.
len(abstracts)

Visualize the distribution of abstract lengths (number of sentences per abstract).

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Number of sentences in each abstract, in dict order.
absLengths = [len(sentList) for sentList in abstracts.values()]

In [None]:
# Histogram of the per-abstract sentence counts (matplotlib's default binning).
plt.hist(absLengths)
plt.show()

In [None]:
import numpy as np

# Mean abstract length, in the same unit as absLengths.
np.mean(absLengths)

In [None]:
import gensim

In [None]:
from itertools import chain

# Get the longest training data instance

In [None]:
# Token count of the longest abstract in the corpus.
# NOTE(review): abstractsCorpus is not defined in any visible cell - presumably
# the id -> sentences dict (cf. `abstracts`); confirm before running top to bottom.
max(sum(1 for _ in chain.from_iterable(sentList)) for sentList in abstractsCorpus.values())

# Create a word2vec model from only half the corpus (the first 10,000 abstracts); it can't fit in memory otherwise because there are too many unique words

In [None]:
# Flatten the sentences of the first 10000 abstracts into one list.
sentences = [sent
             for sentList in list(abstractsCorpus.values())[:10000]
             for sent in sentList]

In [None]:
# Number of sentences in the reduced (first 10000 abstracts) training set.
len(sentences)

In [None]:
# Progress bar for the lemmatisation loop.
# NOTE(review): recent tqdm releases deprecate tqdm_notebook in favour of
# tqdm.notebook.tqdm - consider migrating.
from tqdm import tqdm_notebook
from itertools import chain

import nltk
# Fetch the 'wordnet' resource (presumably what WordNetLemmatizer relies on).
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

# Normalise and lemmatise every token to build the word2vec training input.
# Each token is stripped and lower-cased BEFORE lemmatising: the WordNet lookup
# is case-sensitive and misses tokens with attached punctuation, so the previous
# order (lemmatise first) left e.g. capitalised sentence-initial words unchanged.
# NOTE(review): punctuation-only tokens such as '.' or ',' still collapse to ''
# and reach the model as an empty-string word - confirm that this is wanted.
sents = list()
bar = tqdm_notebook(sentences)
for sentence in bar:
    sents.append([wordnet_lemmatizer.lemmatize(word.strip(r'.,:-?_!])').lower())
                  for word in sentence])

In [None]:
# Number of lemmatised sentences; one is produced per entry of `sentences`.
len(sents)

# Longest abstract

In [None]:
# Token count of the longest abstract among the first 10000 in the corpus.
max(sum(1 for _ in chain.from_iterable(sentList))
    for sentList in list(abstractsCorpus.values())[:10000])

# Train word2vec on the corpus (abstracts)

In [None]:
import gensim

In [None]:
# Train word2vec on the lemmatised sentences, passing min_count=1, then save
# the model to disk.
# NOTE(review): the file name and message say "200k", but the input cell reads
# 'train_20k_ATsigns.txt' - confirm the naming.
model = gensim.models.Word2Vec(sents, min_count=1)
model.save('PubMed_200k_RCT_model_ATsigns')
print('PubMed 200k RCT corpus model saved.')

In [None]:
# Load the model stored under 'PubMed_200k_RCT_model_ATsigns'.
model = gensim.models.Word2Vec.load('PubMed_200k_RCT_model_ATsigns')

In [None]:
# Nearest neighbours of 'ptsd' in the embedding space.
# The query goes through model.wv (the KeyedVectors): calling most_similar
# directly on the Word2Vec model was deprecated and is gone in gensim 4.
print(model.wv.most_similar('ptsd'))

# Fetching the titles for the abstracts
## (based on the pmids)

In [None]:
# Imported in this cell because the notebook's only other `import pickle` sits
# in a later cell, so running top to bottom raised NameError here.
import pickle

# Load the titles object from disk (later cells read it via titles.keys() and titles[pmid]).
# NOTE: unpickling can execute arbitrary code - only load a file you produced yourself.
with open('titlesAbstracts.pkl', 'rb') as fh:
    titles = pickle.load(fh)

In [None]:
# Key sets of both dicts, used to find the ids they have in common.
tit = set(titles)
abst = set(abstractsCorpus)

In [None]:
# Ids that have both a title and an abstract.
gotThese = tit & abst

In [None]:
# Rebuild both dicts restricted to the shared ids, inserted in sorted order so
# that the two end up with identical key order.
sharedPmids = sorted(gotThese)
abstractsCorpUpdate = {key: abstractsCorpus[key] for key in sharedPmids}
titlesUpdate = {key: titles[key] for key in sharedPmids}

In [None]:
# Sanity check: both dicts must cover exactly the same ids (expected: True).
titlesUpdate.keys() == abstractsCorpUpdate.keys()

# Save corpora

In [None]:
import pickle

# Write the aligned title and abstract dicts to disk, one pickle file each.
for outPath, payload in (('titlesAbstracts_AT.pkl', titlesUpdate),
                         ('abstractsCorpus_ATsigns.pkl', abstractsCorpUpdate)):
    with open(outPath, 'wb') as fh:
        pickle.dump(payload, fh)