In [1]:
import pubmed_parser as pp
import pandas as pd
import nltk
nltk.download('punkt')
import numpy as np
from csv import DictWriter
import json
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



#NLP 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
import string

from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vagishvela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Get a sample of data from raw, and then ready the first file for analysis. Mostly because this computer isn't fast enough or large enough to do the whole analysis at once.

In [2]:
path_xml = pp.list_xml_path('../../data/raw') # list all xml paths under directory
pubmed_dict = pp.parse_medline_xml(path_xml[0]) # dictionary output

Let's checkout out what the data looks like.

In [3]:
df = pd.DataFrame(pubmed_dict)
df.head()

Unnamed: 0,title,issue,pages,abstract,journal,authors,pubdate,pmid,mesh_terms,publication_types,...,doi,references,delete,affiliations,pmc,other_id,medline_ta,nlm_unique_id,issn_linking,country
0,[Beta-blockers and arterial hypertension in th...,7(31),2807-9,,La Nouvelle presse medicale,Tcherdakoff|P|P|,1978,30970,D000319:Adrenergic beta-Antagonists; D005260:F...,D016428:Journal Article,...,,,False,,,,Nouv Presse Med,312552,0301-1518,France
1,[beta-blockers and high risk pregnancies. View...,7(31),2811-2,,La Nouvelle presse medicale,Dubois|D|D|;Petitcolas|J|J|,1978,30971,D000070:Acebutolol; D000319:Adrenergic beta-An...,D016428:Journal Article,...,,,False,,,,Nouv Presse Med,312552,0301-1518,France
2,[Surface defense mechanisms of the nasal mucosa].,33(35),1391-3,,"Polski tygodnik lekarski (Warsaw, Poland : 1960)",Makowska|W|W|;Zawisza|E|E|,1978,30972,D001424:Bacterial Infections; D002633:Chemotax...,D016428:Journal Article; D016454:Review,...,,,False,,,,Pol Tyg Lek,9705468,0032-3756,Poland
3,[Pharmacological treatment of coronary disease].,33(44),1721-4,,"Polski tygodnik lekarski (Warsaw, Poland : 1960)",Krotkiewski|A|A|,1978,30973,D000319:Adrenergic beta-Antagonists; D003327:C...,D016428:Journal Article,...,,,False,,,,Pol Tyg Lek,9705468,0032-3756,Poland
4,Treating the outpatient schizophrenic.,64(5),48-56,,Postgraduate medicine,Gelenberg|A J|AJ|,1978,30974,D000208:Acute Disease; D000553:Ambulatory Care...,D016428:Journal Article,...,10.1080/00325481.1978.11714969,,False,,,,Postgrad Med,401147,0032-5481,England


Looks like all the data is there, or is it?

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              30000 non-null  object
 1   issue              30000 non-null  object
 2   pages              30000 non-null  object
 3   abstract           30000 non-null  object
 4   journal            30000 non-null  object
 5   authors            30000 non-null  object
 6   pubdate            30000 non-null  object
 7   pmid               30000 non-null  object
 8   mesh_terms         30000 non-null  object
 9   publication_types  30000 non-null  object
 10  chemical_list      30000 non-null  object
 11  keywords           30000 non-null  object
 12  doi                30000 non-null  object
 13  references         30000 non-null  object
 14  delete             30000 non-null  bool  
 15  affiliations       30000 non-null  object
 16  pmc                30000 non-null  objec

Let's drop the empty strings, as this data doesn't respond to `.isnull` or `.isna` methods.

In [6]:
abstractDf = df.drop(df[df['abstract'] == ''].index)

Now we have data that has abstracts, and we can start to do some analysis.

In [47]:
abstractDf['abstract'].to_list()[0]

'The paper describes a modified method of isolating the branching enzyme of amylose isomerase from muscles and a study of the enzyme activity at different stages of purification. By enzyme fractionation on biogel R-150 and Sepharose 6B the fractions containing different RNA amounts have been isolated. The activity of fractions has been shown to depend on their content of RNA. The paper presents a procedure used to isolate a highly purified fraction of amylose isomerase and its properties (pH and temperature optima, enzyme optimal concentration and Michaelis constant).'

In [44]:
abstractDf['mesh_terms'].to_list()[0]

'D015061:1,4-alpha-Glucan Branching Enzyme; D000596:Amino Acids; D000818:Animals; D002384:Catalysis; D055598:Chemical Phenomena; D002621:Chemistry; D002850:Chromatography, Gel; D004591:Electrophoresis, Polyacrylamide Gel; D005964:Glucosyltransferases; D006863:Hydrogen-Ion Concentration; D007700:Kinetics; D008722:Methods; D009132:Muscles; D012313:RNA; D011817:Rabbits; D012685:Sepharose; D013696:Temperature'

## Stopwords Section

In [7]:
string.punctuation
stopwords = list(STOP_WORDS)

In [8]:
stopwords

['even',
 'yourselves',
 'one',
 'until',
 'eleven',
 'fifty',
 'just',
 'myself',
 'i',
 'part',
 'beforehand',
 'therefore',
 'seeming',
 'would',
 'except',
 'then',
 'towards',
 'yourself',
 'their',
 'than',
 'several',
 'them',
 'however',
 'see',
 'among',
 'me',
 'because',
 'between',
 'full',
 'fifteen',
 "'ve",
 'around',
 'moreover',
 'call',
 'for',
 'of',
 'after',
 'you',
 'alone',
 'next',
 'across',
 'whole',
 'yours',
 'amount',
 'other',
 'two',
 'whatever',
 'became',
 'in',
 'keep',
 'least',
 'each',
 'something',
 'whereby',
 "'m",
 'your',
 "'ll",
 'rather',
 'at',
 'besides',
 'further',
 'still',
 'on',
 'third',
 'who',
 'where',
 'she',
 'an',
 'a',
 'whither',
 'ever',
 'how',
 'these',
 'mostly',
 'must',
 'anyway',
 'say',
 'himself',
 'whereas',
 'every',
 'show',
 'my',
 'side',
 'beside',
 'whether',
 'almost',
 'seem',
 'has',
 'hereby',
 'whenever',
 'us',
 'they',
 'along',
 'anyone',
 'to',
 'though',
 '’d',
 're',
 'forty',
 'his',
 '‘ll',
 '‘d',


In [41]:
def tokenizer(text):
    parser = en_core_sci_lg.load()
    tokens = parser(text)
    # Remove stop words and punctuation symbols
    # tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ]
    return tokens

In [42]:
abstract10Df = abstractDf.head(2)

In [43]:
tqdm.pandas()
abstract10Df['processed_abstract'] = abstract10Df['abstract'].apply(tokenizer)

# https://www.kaggle.com/code/maksimeren/covid-19-literature-clustering#Vectorization

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstract10Df['processed_abstract'] = abstract10Df['abstract'].apply(tokenizer)


In [58]:
text = ''
for token in abstract10Df['processed_abstract'].values[0]:
    text += token.text + ' '
text

'The paper describes a modified method of isolating the branching enzyme of amylose isomerase from muscles and a study of the enzyme activity at different stages of purification . By enzyme fractionation on biogel R-150 and Sepharose 6B the fractions containing different RNA amounts have been isolated . The activity of fractions has been shown to depend on their content of RNA . The paper presents a procedure used to isolate a highly purified fraction of amylose isomerase and its properties ( pH and temperature optima , enzyme optimal concentration and Michaelis constant ) . '

In [29]:
abstract10Df['processed_abstract'].values.tolist()

[The paper describes a modified method of isolating the branching enzyme of amylose isomerase from muscles and a study of the enzyme activity at different stages of purification. By enzyme fractionation on biogel R-150 and Sepharose 6B the fractions containing different RNA amounts have been isolated. The activity of fractions has been shown to depend on their content of RNA. The paper presents a procedure used to isolate a highly purified fraction of amylose isomerase and its properties (pH and temperature optima, enzyme optimal concentration and Michaelis constant).,
 Although the antiparkinsonian activity of 1-prolyl-l-leucyl-glycine amide (PLG=MIF-I) has been previously observed in several clinical trials, little is known of the mechanism of action of this tripeptide on the brain. Our study demonstrated potentiation of the action of apomorphine by PLG on the rotational behavior of mature rats which received unilateral 6-OHDA (16 microgram) lesions of the striatum as neonates. No ch

## Vectorizer

In [12]:

tfidfvectorizer = TfidfVectorizer()

In [13]:
abstract10Df = abstractDf.head(10)

In [14]:
vectorized_abstracts = tfidfvectorizer.fit_transform(abstract10Df['abstract'].values)

In [18]:
vectorized_abstracts.shape

(10, 504)

In [23]:
features = tfidfvectorizer.get_feature_names_out()

In [31]:
features

array(['001', '01', '05', '10', '100', '105', '138', '14', '150', '16',
       '160', '17', '224', '23', '243', '30', '32', '324', '36', '37',
       '38', '41', '500', '57', '64', '69', '6b', '72', '76', '91', '95',
       'about', 'acid', 'action', 'activated', 'activation', 'activities',
       'activity', 'addition', 'affinity', 'after', 'agent', 'agonistic',
       'albumin', 'all', 'also', 'although', 'amacrine', 'amide', 'among',
       'amounts', 'amylose', 'an', 'analyze', 'anatomical', 'and',
       'animals', 'antibiotics', 'antibodies', 'antibody', 'antigen',
       'antiparkinsonian', 'any', 'apomorphine', 'apparent', 'apparently',
       'appears', 'are', 'as', 'assay', 'assessed', 'associated', 'at',
       'atenolol', 'auditory', 'authors', 'basic', 'be', 'beats',
       'because', 'been', 'before', 'behavior', 'below', 'betablocking',
       'binding', 'biochemical', 'biogel', 'blood', 'bound', 'brain',
       'brainstem', 'branching', 'but', 'by', 'ca2', 'calcium',
  