In [1]:
import os
from lxml import etree
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm

### pmc_id, pub_date, pub_venue, species, all keywords psv

In [2]:
from lxml import etree
import requests

def get_root(ID):
    #print(f"Getting root for ID = {ID}")
    file_path = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id="+ID
    response = requests.get(file_path)
    root = etree.fromstring(response.content)  # Parse the XML content from the URL
    return root

def get_pub_date(root, ID):  # "root" is the parsed XML content
    pub_date = root.find(".//pub-date") # find only the first instance of <pub-date>
    if pub_date is not None:
        day = pub_date.find('day')
        month = pub_date.find('month')
        year = pub_date.find('year')
        date = ""
        for k in [day,month,year]:
            k = k.text if k is not None else "##"
            date+=k+"-"
        #print(f"Publication Date: {day}-{month}-{year}\n")
        return date[:-1]
        # return f"{day}-{month}-{year}"
    else:
        #print("Publication date not found\n")
        return "00-00-0000"

def get_journal_titles(root, ID): # "root" is the parsed XML content
    journal_title_group = root.find('.//journal-title-group')
    if journal_title_group is not None:
        values = [k.text for k in journal_title_group.findall('journal-title')] # Incase the paper id published in multiple journals
        #print(f"journal_title_group = {values}\n")
        if len(values)>1: 
            print(f"Multiple journals for ID={ID}")
        return values[0]
    else:
        #print("Publication journal not found\n")
        return "xyz"

def get_species(root, ID): # "root" is the parsed XML content
    # <kwd-group kwd-group-type="research-organism">
    species = root.find(".//kwd-group[@kwd-group-type='research-organism']")
    if species is not None:
        result = species.find('kwd').text
        return result
        # print(f"Species = {result}")
    else:
        # print("NO SPECIES FOUND")
        return np.nan

def get_keywords(root, ID): # "root" is the parsed XML content
    kwd_values = []
    for kwd_group in root.findall('.//kwd-group'):
        kwd_group_type = kwd_group.get('kwd-group-type')
        kwd_values += [kwd.text for kwd in kwd_group.findall('kwd')]
    print(kwd_values,"\n")

def find_specific_kwd_groups_and_extract_kwd_text(root, ID): 
    # "root" is the parsed XML content
    kwd_texts=[]
    
    #Iterate over all kwd-group tags
    for kwd_group in root.findall('.//kwd-group'):
        # Get the kwd-group-type attribute
        kwd_group_type = kwd_group.get('kwd-group-type', '')
        
        #Checking if 'claim' or 'evidence' is in the kwd-group-type attribute
        if "claim" not in kwd_group_type.lower() and "evidence" not in kwd_group_type.lower():
            # Extract the keywords from all <kwd> tags within this <kwd-group>
            kwd_texts+= [kwd.text for kwd in kwd_group.findall('kwd')]
            return kwd_texts
    return []

In [3]:
def get_metadata(ID):
    root = get_root(ID)
    species = get_species(root,ID)
    kwds = find_specific_kwd_groups_and_extract_kwd_text(root,ID)
    kwds = [k for k in kwds if (k and k.strip())]
    keywords = "|".join(kwds)
    journal = get_journal_titles(root, ID)
    pub_date = get_pub_date(root, ID)
    D = {
        "pmc_id":ID,
        "pub_date":pub_date,
        "keywords":keywords,
        "species":species,
        "publication_journal":journal,
    }
    return D

### Creating dataframe

In [4]:
list_of_metadata = []
with open('neuro_IDs_for_neuroscience.txt', 'r') as f:
    ids = [k.strip() for k in f.readlines()]
    
with open('pralay_sir_pmids.txt', 'r') as f:
    ids += [k.strip() for k in f.readlines()]

for ID in tqdm(ids):
    try:
        list_of_metadata.append(get_metadata(ID))
    except Exception as e:
        with open("metadata_error_logger.txt",'a') as f:
            f.write(f"{ID}:{e}")
            continue
        

df = pd.DataFrame(list_of_metadata) #Creating a pandas df so that it can be exported to a csv/xl file

FileNotFoundError: [Errno 2] No such file or directory: 'neuro_IDs_for_neuroscience.txt'

In [18]:
df.head(10)

Unnamed: 0,pmc_id,pub_date,keywords,species,publication_journal
0,8742975,05-9-2021,Alzheimer’s disease|aphasia|bilingualism|cogni...,,Journal of Anatomy
1,8485740,##-10-2021,,,Cold Spring Harbor Perspectives in Medicine
2,10790504,28-7-2023,,,Perspectives on Psychological Science
3,10777891,16-4-2023,,,Archives of pathology & laboratory medicine
4,10760685,##-##-2024,,,PLOS ONE
5,10739454,10-11-2023,,,Acta Neurochirurgica
6,10733721,##-12-2023,,,Bioinformatics
7,10725770,29-1-2024,literature|consciousness|predictive processing...,,Philosophical Transactions of the Royal Societ...
8,10725766,29-1-2024,aesthetics|predictive processing|cognitive sci...,,Philosophical Transactions of the Royal Societ...
9,10718038,30-11-2015,Anatomy|Physiology|Central nervous system|Join...,,The Journal of Physiological Sciences : JPS


In [27]:
df.to_excel("../meta_data_pmc.xlsx", index=False)

### scratch cells_1 (for experimentation)

In [31]:
ID = "4919512"
root = get_root(ID)
find_specific_kwd_groups_and_extract_kwd_text(root, ID)

Getting root for ID = 4919512


In [5]:
for ID in ids[:20]:
    root = get_root(ID)
    res = get_species(root,ID)
    if res:
        print(res)

Getting root for ID = 7131896
Getting root for ID = 4919512
Getting root for ID = 6372724
Getting root for ID = 4330548
Getting root for ID = 5035135
Getting root for ID = 2998755
Getting root for ID = 6863630
Rat
Getting root for ID = 3332379
Getting root for ID = 4980114
Human
Getting root for ID = 5727393
Getting root for ID = 7058027
Getting root for ID = 4519436
Getting root for ID = 4818601
Getting root for ID = 5142814
Mouse
Getting root for ID = 5484614
Mouse
Getting root for ID = 4113023
Getting root for ID = 6069680
Getting root for ID = 4362325
Getting root for ID = 5548904
Getting root for ID = 5425252
Human


In [6]:
for ID in ids[:20]:
    root = get_root(ID)
    get_keywords(root,ID)

Getting root for ID = 7131896
['anterior cingulate cortex', 'amygdala', 'medial prefrontal cortex', 'coherence', 'spikes', 'local field potential', 'social decision-making'] 

Getting root for ID = 4919512
[] 

Getting root for ID = 6372724
['oligodendrocyte precursor cell', 'oligodendrocyte', 'myelin', 'differentiation', 'glutamate', 'neurotransmitter receptors', 'ion channels', 'glia', 'bioelectricity', 'electrophysiology'] 

Getting root for ID = 4330548
[] 

Getting root for ID = 5035135
['Adolescence', 'Cerebral cortex', 'MRI', 'Replication', 'Sex differences', 'White matter'] 

Getting root for ID = 2998755
[] 

Getting root for ID = 6863630
['cerebral cortex', 'basal ganglia', 'globus pallidus', 'subthalamic nucleus', 'striatum', 'optogenetics', 'Rat'] 

Getting root for ID = 3332379
['Visual motion detection', 'Drosophila melanogaster', 'Lobula plate', 'Columnar cells', 'Patch-clamp recordings'] 

Getting root for ID = 4980114
['hippocampus', 'neural oscillations', 'episodic me

In [14]:
for ID in ids[:20]:
    root = get_root(ID)
    find_specific_kwd_groups_and_extract_kwd_text(root,ID)

Getting root for ID = 7131896
['anterior cingulate cortex', 'amygdala', 'medial prefrontal cortex', 'coherence', 'spikes', 'local field potential', 'social decision-making']
Getting root for ID = 4919512
[]
Getting root for ID = 6372724
['oligodendrocyte precursor cell', 'oligodendrocyte', 'myelin', 'differentiation', 'glutamate', 'neurotransmitter receptors', 'ion channels', 'glia', 'bioelectricity', 'electrophysiology']
Getting root for ID = 4330548
[]
Getting root for ID = 5035135
[]
Getting root for ID = 2998755
[]
Getting root for ID = 6863630
['cerebral cortex', 'basal ganglia', 'globus pallidus', 'subthalamic nucleus', 'striatum', 'optogenetics', 'Rat']
Getting root for ID = 3332379
['Visual motion detection', 'Drosophila melanogaster', 'Lobula plate', 'Columnar cells', 'Patch-clamp recordings']
Getting root for ID = 4980114
['hippocampus', 'neural oscillations', 'episodic memory', 'pattern completion', 'intracranial EEG', 'Human']
Getting root for ID = 5727393
[]
Getting root f

In [13]:
root = get_root("11093584")
find_specific_kwd_groups_and_extract_kwd_text(root)

Getting root for ID = 11093584
['nerve injury', 'axon regeneration', 'sensory neuron', 'motorneuron', 'RNA-sequencing', 'specific regeneration', 'Mouse']


### processing error IDs

In [24]:
error_ids = [10496176,8174055,7583314,5848544,5994217,5614110,560849,
             5100624,4526170,4407839,4353425,3695042,3170050,2850514,
             2826127,2832828,2777251,2784994,2408653,2429996,6016744]
error_ids = [str(ID) for ID in error_ids]

In [32]:
new_error_ids = []
list_of_metadata_2 = []
for i,ID in enumerate(error_ids[:]):
    # try:
    #     data = get_metadata(ID)
    #     #print(f"{i+1}:{data}\n\n\n")
    # except Exception as e:
    #     print(f"\n****{i+1}:{ID}|{e}****\n")
    #     new_error_ids.append(ID)
    data = get_metadata(ID)
    list_of_metadata_2.append(data)
    # print(f"{i+1}:{data}\n\n\n")

In [33]:
df = pd.DataFrame(list_of_metadata_2)
df

Unnamed: 0,pmc_id,pub_date,keywords,species,publication_journal
0,10496176,12-9-2023,Lewy body pathology|Alzheimer’s disease pathol...,,Acta Neuropathologica Communications
1,8174055,26-3-2021,behavior|behavioral neuroscience|chemogenetics...,,eNeuro
2,7583314,22-10-2020,Gal4/UAS|CNS|Neurons|Glia|Driver|Gal4|69B-Gal4,,BMC Genetics
3,5848544,13-3-2018,Amyotrophic lateral sclerosis|Magnetic resonan...,,BMC Neuroscience
4,5994217,19-10-2017,Apoptosis|Necroptosis|Ferroptosis|Minocycline|...,,Molecular Neurobiology
5,5614110,##-##-2017,Neuroscience|Issue 126|Neuromuscular junction|...,,Journal of Visualized Experiments : JoVE
6,560849,00-00-0000,,,xyz
7,5100624,##-10-2016,methodology|spectral quality|neurochemical pro...,,International Journal of Epidemiology
8,4526170,23-4-2015,chronic traumatic encephalopathy|tauopathy|tra...,,Brain Pathology
9,4407839,01-5-2015,glomerulus|ellipsoid body|fan-shaped body|nodu...,,The Journal of Comparative Neurology


In [35]:
df2 = pd.read_csv("meta_data_pmc.csv")

In [36]:
df2.head()

Unnamed: 0,pmc_id,pub_date,keywords,species,publication_journal
0,8742975,05-9-2021,Alzheimer’s disease|aphasia|bilingualism|cogni...,,Journal of Anatomy
1,8485740,##-10-2021,,,Cold Spring Harbor Perspectives in Medicine
2,10790504,28-7-2023,,,Perspectives on Psychological Science
3,10777891,16-4-2023,,,Archives of pathology & laboratory medicine
4,10760685,##-##-2024,,,PLOS ONE


In [37]:
set1 = set(df['pmc_id'])
set2 = set(df2['pmc_id'])
set1.intersection(set2) # Checking of any of the DF's have a common pmc_id or not

set()

In [38]:
len(set1)+len(set2) # Verifying that all 4122 pmc IDs are present in the combined df

4122

In [41]:
df3 = pd.concat((df,df2)) # Finally concatenating the two DFs
df3.head()

Unnamed: 0,pmc_id,pub_date,keywords,species,publication_journal
0,10496176,12-9-2023,Lewy body pathology|Alzheimer’s disease pathol...,,Acta Neuropathologica Communications
1,8174055,26-3-2021,behavior|behavioral neuroscience|chemogenetics...,,eNeuro
2,7583314,22-10-2020,Gal4/UAS|CNS|Neurons|Glia|Driver|Gal4|69B-Gal4,,BMC Genetics
3,5848544,13-3-2018,Amyotrophic lateral sclerosis|Magnetic resonan...,,BMC Neuroscience
4,5994217,19-10-2017,Apoptosis|Necroptosis|Ferroptosis|Minocycline|...,,Molecular Neurobiology


In [62]:
df3['pub_year'] = df3["pub_date"].apply(lambda x: x.split("-")[-1] if isinstance(x, str) and x else np.nan) # Creating an publication year column from publication date, only if the date value is a string and not-none

0       2023
1       2021
2       2020
3       2018
4       2017
        ... 
4096    2011
4097    2013
4098    2017
4099    2012
4100    2013
Name: pub_year, Length: 4122, dtype: object

In [64]:
df3.to_excel("meta_data_pmc.xlsx", index=False) # Saving the final df to an excel