In [18]:
import pubmed_parser as pp
import pandas as pd
import nltk
nltk.download('punkt')
import numpy as np
from csv import DictWriter
import json
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



#NLP 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
import string

from tqdm import tqdm

tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vagishvela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Get a sample of the raw data by listing the XML files and parsing only the first one for analysis, because this computer isn't fast enough or large enough to do the whole analysis at once.

In [19]:
# Location of the raw PubMed/MEDLINE XML dump, relative to this notebook.
RAW_DATA_DIR = '../../data/raw'

# Collect every XML path under the raw-data directory.
# NOTE(review): the order of the returned paths is not visible here; confirm it
# is stable if "the first file" must be the same file on every run.
path_xml = pp.list_xml_path(RAW_DATA_DIR)

# Parse only the first file so the sample stays small enough for this machine.
first_xml_path = path_xml[0]
pubmed_dict = pp.parse_medline_xml(first_xml_path)

Let's check out what the data looks like.

In [20]:
# Turn the parsed MEDLINE records into a DataFrame (one row per record)
# and preview the first five rows.
df = pd.DataFrame(pubmed_dict)
df.head()

Unnamed: 0,title,issue,pages,abstract,journal,authors,pubdate,pmid,mesh_terms,publication_types,...,doi,references,delete,affiliations,pmc,other_id,medline_ta,nlm_unique_id,issn_linking,country
0,[Beta-blockers and arterial hypertension in th...,7(31),2807-9,,La Nouvelle presse medicale,Tcherdakoff|P|P|,1978,30970,D000319:Adrenergic beta-Antagonists; D005260:F...,D016428:Journal Article,...,,,False,,,,Nouv Presse Med,312552,0301-1518,France
1,[beta-blockers and high risk pregnancies. View...,7(31),2811-2,,La Nouvelle presse medicale,Dubois|D|D|;Petitcolas|J|J|,1978,30971,D000070:Acebutolol; D000319:Adrenergic beta-An...,D016428:Journal Article,...,,,False,,,,Nouv Presse Med,312552,0301-1518,France
2,[Surface defense mechanisms of the nasal mucosa].,33(35),1391-3,,"Polski tygodnik lekarski (Warsaw, Poland : 1960)",Makowska|W|W|;Zawisza|E|E|,1978,30972,D001424:Bacterial Infections; D002633:Chemotax...,D016428:Journal Article; D016454:Review,...,,,False,,,,Pol Tyg Lek,9705468,0032-3756,Poland
3,[Pharmacological treatment of coronary disease].,33(44),1721-4,,"Polski tygodnik lekarski (Warsaw, Poland : 1960)",Krotkiewski|A|A|,1978,30973,D000319:Adrenergic beta-Antagonists; D003327:C...,D016428:Journal Article,...,,,False,,,,Pol Tyg Lek,9705468,0032-3756,Poland
4,Treating the outpatient schizophrenic.,64(5),48-56,,Postgraduate medicine,Gelenberg|A J|AJ|,1978,30974,D000208:Acute Disease; D000553:Ambulatory Care...,D016428:Journal Article,...,10.1080/00325481.1978.11714969,,False,,,,Postgrad Med,401147,0032-5481,England


Looks like all the data is there, or is it?

In [21]:
# Summarize each column's dtype and non-null count to look for missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              30000 non-null  object
 1   issue              30000 non-null  object
 2   pages              30000 non-null  object
 3   abstract           30000 non-null  object
 4   journal            30000 non-null  object
 5   authors            30000 non-null  object
 6   pubdate            30000 non-null  object
 7   pmid               30000 non-null  object
 8   mesh_terms         30000 non-null  object
 9   publication_types  30000 non-null  object
 10  chemical_list      30000 non-null  object
 11  keywords           30000 non-null  object
 12  doi                30000 non-null  object
 13  references         30000 non-null  object
 14  delete             30000 non-null  bool  
 15  affiliations       30000 non-null  object
 16  pmc                30000 non-null  objec

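Let's drop the rows whose abstract is an empty string. Every column reports 30000 non-null values because missing abstracts are stored as `''` rather than `NaN`, so the `.isnull` and `.isna` methods don't detect them.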

In [22]:
# Keep only the records whose abstract is not the empty string. The explicit
# copy gives an independent frame, so adding columns to it later does not
# touch (or warn about) the original `df`.
has_abstract = df['abstract'] != ''
abstractDf = df.loc[has_abstract].copy()

Now we have data that has abstracts, and we can start to do some analysis.

In [23]:
# Display the records that remain after removing empty abstracts.
abstractDf

Unnamed: 0,title,issue,pages,abstract,journal,authors,pubdate,pmid,mesh_terms,publication_types,...,doi,references,delete,affiliations,pmc,other_id,medline_ta,nlm_unique_id,issn_linking,country
10,[Isolation and properties of preparations the ...,14(5),683-9,The paper describes a modified method of isola...,Prikladnaia biokhimiia i mikrobiologiia,Shvedova|T A|TA|;Petrova|A N|AN|,1978,30978,"D015061:1,4-alpha-Glucan Branching Enzyme; D00...",D003160:Comparative Study; D004740:English Abs...,...,,,False,,,,Prikl Biokhim Mikrobiol,0023416,0555-1099,Russia (Federation)
11,Potentiation of apomorphine action in rats by ...,9(3),375-8,Although the antiparkinsonian activity of 1-pr...,"Pharmacology, biochemistry, and behavior",Kostrzewa|R M|RM|;Kastin|A J|AJ|;Sobrian|S K|SK|,1978,30981,D000818:Animals; D001058:Apomorphine; D001522:...,D003160:Comparative Study; D016428:Journal Art...,...,10.1016/0091-3057(78)90299-x,,False,,,,Pharmacol Biochem Behav,0367050,0091-3057,United States
14,Specific immune recognition by lymphocytes: an...,53(3),225-41,"In this review, we analyze data pertinent to t...",The Quarterly review of biology,Warr|G W|GW|;Marchalonis|J J|JJ|,1978,30984,"D000220:Adaptation, Biological; D000818:Animal...",D016428:Journal Article; D013487:Research Supp...,...,10.1086/410621,,False,,,,Q Rev Biol,0376515,0033-5770,United States
19,An analysis of measured and calculated calcium...,38(7),659-67,Potentiometrically measured ionized calcium (C...,Scandinavian journal of clinical and laborator...,Pedersen|K O|KO|,1978,30989,D000293:Adolescent; D000328:Adult; D000367:Age...,D016428:Journal Article,...,10.3109/00365517809102433,,False,,,,Scand J Clin Lab Invest,0404375,0036-5513,England
20,The value of immunoelectroosmophoresis (IEOP) ...,10(3),173-6,Immunoelectroosmophoresis (IEOP) has been used...,Scandinavian journal of infectious diseases,Wiernik|A|A|;Jarstrand|C|C|;Tunevall|G|G|,1978,30990,D000900:Anti-Bacterial Agents; D000942:Antigen...,D016428:Journal Article,...,10.3109/inf.1978.10.issue-3.03,,False,,,,Scand J Infect Dis,0215333,0036-5548,England
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29964,Pregnancy hepatitis in Libya.,2(7990),827-9,The death-rate from hepatitis in pregnant wome...,"Lancet (London, England)",Christie|A B|AB|;Allam|A A|AA|;Aref|M K|MK|;Mu...,1976,61499,D000293:Adolescent; D000328:Adult; D005260:Fem...,D003160:Comparative Study; D016428:Journal Art...,...,10.1016/s0140-6736(76)91210-1,,False,,,054851; 00198778,Lancet,2985213R,0140-6736,England
29965,Oral methionine in the treatment of severe par...,2(7990),829-30,30 patients at risk of hepatic damage from par...,"Lancet (London, England)",Crome|P|P|;Vale|J A|JA|;Volans|G N|GN|;Widdop|...,1976,61500,"D000082:Acetaminophen; D000284:Administration,...",D003160:Comparative Study; D016428:Journal Art...,...,10.1016/s0140-6736(76)91211-3,,False,,,,Lancet,2985213R,0140-6736,England
29966,Integrated concentrations of catecholamines in...,2(7990),830-1,The measurement of integrated concentrations o...,"Lancet (London, England)",Daggett|P|P|,1976,61501,D000310:Adrenal Gland Neoplasms; D001794:Blood...,D016428:Journal Article,...,10.1016/s0140-6736(76)91212-5,,False,,,,Lancet,2985213R,0140-6736,England
29967,The neurology of vitamin B12 deficiency. Metab...,2(7990),832-3,The widely held view that the neurological and...,"Lancet (London, England)",Reynolds|E H|EH|,1976,61502,"D000752:Anemia, Pernicious; D005492:Folic Acid...",D016428:Journal Article,...,10.1016/s0140-6736(76)91213-7,,False,,,,Lancet,2985213R,0140-6736,England


In [24]:
# Materialize spaCy's built-in English stop words as a list. Punctuation needs
# no setup here: `string.punctuation` is a constant that is read directly
# wherever it is needed.
stopwords = [word for word in STOP_WORDS]

In [25]:
# Show the stop-word list built from spaCy's STOP_WORDS.
stopwords

['except',
 'hence',
 'keep',
 'for',
 'both',
 'everywhere',
 'formerly',
 'whom',
 'became',
 'twenty',
 "'ve",
 'against',
 'under',
 'hundred',
 'below',
 'top',
 'becoming',
 'serious',
 'was',
 'seemed',
 '’re',
 'them',
 'bottom',
 'its',
 'we',
 'am',
 'too',
 'until',
 'always',
 'noone',
 'such',
 'sometime',
 'during',
 'latter',
 'thence',
 'becomes',
 'our',
 'thereupon',
 'all',
 'about',
 'n’t',
 'also',
 'fifty',
 'without',
 'ever',
 'namely',
 'were',
 'very',
 'wherever',
 'up',
 'each',
 'n‘t',
 'which',
 'first',
 'one',
 'various',
 'toward',
 'my',
 'most',
 'sixty',
 'after',
 'it',
 'may',
 'above',
 'since',
 'if',
 'used',
 'twelve',
 'regarding',
 'therein',
 'sometimes',
 'nor',
 'him',
 'from',
 'himself',
 '‘m',
 'upon',
 'either',
 'a',
 'become',
 'almost',
 'two',
 'down',
 'they',
 'me',
 'he',
 'per',
 'made',
 'you',
 'others',
 "'s",
 'through',
 'unless',
 'however',
 'wherein',
 'only',
 'four',
 'why',
 'side',
 'otherwise',
 'mostly',
 'get',
 

In [26]:
# Cache for the loaded language model, so it is read from disk only once per
# kernel session instead of once per call.
_PARSER_CACHE = {}


def _get_parser():
    """Return the shared `en_core_sci_lg` pipeline, loading it on first use."""
    if 'nlp' not in _PARSER_CACHE:
        _PARSER_CACHE['nlp'] = en_core_sci_lg.load()
    return _PARSER_CACHE['nlp']


def tokenizer(text, parser=None, stop_words=None):
    """Lemmatize `text` and drop pronoun placeholders, stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw text to tokenize (e.g. one abstract).
    parser : callable, optional
        Callable that maps text to an iterable of tokens exposing a `lemma_`
        attribute. Defaults to the `en_core_sci_lg` model, which is loaded once
        and then reused across calls (loading it on every call made applying
        this function to a whole column impractically slow).
    stop_words : iterable of str, optional
        Lower-cased words to discard. Defaults to the notebook-level
        `stopwords` list.

    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped lemmas in their original order.
    """
    if parser is None:
        parser = _get_parser()
    if stop_words is None:
        stop_words = stopwords
    # A set makes each membership test O(1) instead of scanning a list.
    stop_words = set(stop_words)

    tokens = []
    for token in parser(text):
        # NOTE(review): '-PRON-' looks like the pronoun placeholder emitted by
        # older spaCy lemmatizers; confirm it still applies to the installed
        # version.
        if token.lemma_ == '-PRON-':
            continue
        lemma = token.lemma_.lower().strip()
        if lemma in stop_words:
            continue
        # Substring test against the punctuation string: this drops single
        # punctuation marks and also lemmas that are empty after stripping.
        if lemma in string.punctuation:
            continue
        tokens.append(lemma)
    return tokens

In [27]:
# Tokenize every abstract. This is a long-running step, so use the pandas
# hook registered by `tqdm.pandas()` in the imports cell to show a progress
# bar instead of running silently.
abstractDf['processed_abstract'] = abstractDf['abstract'].progress_apply(tokenizer)

KeyboardInterrupt: 

In [None]:
# Inspect the frame, which now includes the `processed_abstract` column.
abstractDf