# Citation Graph

Retrieve the citation graph of a set of papers related to a keyword.

In [1]:
%load_ext autoreload
%autoreload 2

In [74]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from Bio import Entrez
Entrez.email = "samuel.ortion@etud.univ-evry.fr"

import pubmed.explorer


## Retrieve a list of PMIDs

In [84]:
# Query parameters: the date range and the search keyword passed to explorer.query.
start_date = 2010
end_date = 2022
query = "cancer"
explorer = pubmed.explorer.Explorer()

# The CSV (extracted table) and the pickle (raw records) share one stem so
# both cache files always refer to the same query.
cache_stem = f"../tmp/pubmed_{query}_{start_date}-{end_date}"
csv_path = f"{cache_stem}.csv"
pickle_path = f"{cache_stem}.pkl"

downloaded = True  # Set to False to (re)download when the cached CSV/pickle do not exist yet.
if not downloaded:
    fetched_records = list(explorer.query(query, start_date, end_date, maxret=100))

    # Pair every raw record with its extracted row, then drop ALL pairs whose
    # extraction returned None. Filtering both sides together keeps raw_data[i]
    # aligned with row i of df, which later cells rely on.
    record_info_pairs = [
        (record, explorer.extract_info(record)) for record in fetched_records
    ]
    record_info_pairs = [
        (record, info) for record, info in record_info_pairs if info is not None
    ]
    raw_data = [record for record, _ in record_info_pairs]
    extracted_data = [info for _, info in record_info_pairs]

    # Build the table in one step; `columns` fixes the column set and order.
    columns = ["pmid", "title", "doi", "main_author", "date", "keywords"]
    df = pd.DataFrame(extracted_data, columns=columns)

    df.to_csv(csv_path, index=False)
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(raw_data, pickle_file)
else:
    df = pd.read_csv(csv_path)
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(pickle_path, "rb") as pickle_file:
        raw_data = pickle.load(pickle_file)

In [85]:
# Preview the first rows of the article table.
df.head()

Unnamed: 0,pmid,title,doi,main_author,date,keywords
0,37892700,Suicide among Cancer Patients: Current Knowled...,6563,"Grobman, Ben",2023-10-16,"['cancer', 'health policy', 'methodological re..."
1,37887562,Systematic Review of Nomograms Used for Predic...,10.3390/curroncol30100662,"Antonini, Marcelo",2023-10-16,"['breast neoplasms', 'cancer', 'neoadjuvant th..."
2,37658271,Insufficient Reporting of Race and Ethnicity i...,10.1245/s10434-023-14201-z,"Keegan, Grace",2023-09-01,
3,37637069,Global research status and hotspots of radioth...,1135052,"Xie, Xiaodu",2023-08-11,"['CiteSpace', 'VOSviewer', 'bibliometric', 'ho..."
4,37574575,"Advance care planning, serious illness communi...",10.21037/apm-22-1261,"Rao, Vinay",2023-07-27,"['Gastrointestinal cancer', 'advance care plan..."


## Retrieve the citation graph

For each paper, get the references it cites, and add an arc whenever one of these references points to another PMID in the dataset.


In [88]:
# Take the first raw record so the cells below can explore its structure.
first = list(raw_data)[0]
first

{'PubmedBookArticle': [], 'PubmedArticle': [{'MedlineCitation': DictElement({'OtherAbstract': [], 'CitationSubset': [], 'SpaceFlightMission': [], 'GeneralNote': [], 'OtherID': [], 'KeywordList': [ListElement([StringElement('cancer', attributes={'MajorTopicYN': 'N'}), StringElement('health policy', attributes={'MajorTopicYN': 'N'}), StringElement('methodological review', attributes={'MajorTopicYN': 'N'}), StringElement('observational studies', attributes={'MajorTopicYN': 'N'}), StringElement('suicide', attributes={'MajorTopicYN': 'N'})], attributes={'Owner': 'NOTNLM'})], 'PMID': StringElement('37892700', attributes={'Version': '1'}), 'DateRevised': {'Year': '2023', 'Month': '10', 'Day': '30'}, 'Article': DictElement({'Language': ['eng'], 'ELocationID': [StringElement('6563', attributes={'EIdType': 'pii', 'ValidYN': 'Y'}), StringElement('10.3390/jcm12206563', attributes={'EIdType': 'doi', 'ValidYN': 'Y'})], 'ArticleDate': [DictElement({'Year': '2023', 'Month': '10', 'Day': '16'}, attribu

In [89]:
# Top-level keys of the first record.
first.keys()

dict_keys(['PubmedBookArticle', 'PubmedArticle'])

In [90]:
# Keys of the first entry under "PubmedArticle".
first["PubmedArticle"][0].keys()

dict_keys(['MedlineCitation', 'PubmedData'])

In [91]:
# Keys of the "PubmedData" section of that entry.
first["PubmedArticle"][0]["PubmedData"].keys()

dict_keys(['ReferenceList', 'History', 'PublicationStatus', 'ArticleIdList'])

In [28]:
# Second article ID of the first reference in the first reference list.
# NOTE(review): the position [1] is hard-coded; presumably the order of IDs
# can differ between references -- verify before relying on it.
first["PubmedArticle"][0]["PubmedData"]["ReferenceList"][0]["Reference"][0]["ArticleIdList"][1]

StringElement('36633525', attributes={'IdType': 'pubmed'})

In [93]:
# Show the first six items returned by explorer.get_cited_pmids for this record.
explorer.get_cited_pmids(first)[:6]

['36633525', '37141156', '34408877', '30643135', '31745237', '32538322']

In [94]:
# Build a square adjacency matrix over the PMIDs of df:
# entry [citing, cited] is set to 1 when the citing paper references the cited
# paper and both belong to df.
paper_count = df.shape[0]
adjacency_matrix = pd.DataFrame(
    np.zeros((paper_count, paper_count)),
    index=df["pmid"],
    columns=df["pmid"],
)

# get_cited_pmids returns PMIDs as strings, whereas df["pmid"] is typically
# parsed as integers when read back from CSV. Compare on the string form and
# map back to the original label so the matrix keeps df's own index values.
label_by_text = {str(label): label for label in df["pmid"]}

# Records whose references could not be read are collected and reported
# instead of being skipped silently.
failed_lookups = []
for index, pmid in enumerate(df["pmid"]):
    efetch_record = raw_data[index]
    try:
        # Call the method on the explorer instance; the bare name
        # `get_cited_pmids` is not defined in this notebook.
        cited_pmids = explorer.get_cited_pmids(efetch_record)
    except Exception as error:
        failed_lookups.append((pmid, error))
        continue
    for cited_pmid in cited_pmids:
        cited_label = label_by_text.get(str(cited_pmid))
        if cited_label is not None:
            adjacency_matrix.loc[pmid, cited_label] = 1

if failed_lookups:
    first_pmid, first_error = failed_lookups[0]
    print(
        f"Could not read references for {len(failed_lookups)} of {paper_count} records "
        f"(first failure: pmid {first_pmid}: {first_error!r})"
    )