In [None]:

import pyscisci.all as pyscisci

import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

%matplotlib inline

In [None]:
# Directory where the PubMed database will be stored.
# Set the PYSCISCI_PUBMED_PATH environment variable to point somewhere else without
# editing this notebook; when it is unset, the original default location is used.
path2pubmed = os.environ.get('PYSCISCI_PUBMED_PATH', '/home/ajgates/PubMed')


In [None]:
# keep_in_memory=False loads each DataFrame again every time it is needed, which is good
# when you can't keep more than one DataFrame in memory at a time.
# keep_in_memory=True instead keeps each DataFrame in memory after it is loaded.
mypubmed = pyscisci.PubMed(
    path2pubmed,
    database_extension='csv.gz',
    keep_in_memory=False,
)

In [None]:
# Download the latest baseline version of PubMed.
# Depending on your internet connection, the download could take several hours.
#
# If your connection breaks or the download stops for any reason, keep
# rewrite_existing=False and rerun this cell to continue where you left off.
mypubmed.download_from_source(
    rewrite_existing=False,
)

In [None]:
# The raw data has to be preprocessed into DataFrames that are more convenient
# to work with before any analysis can be run.
mypubmed.preprocess(
    show_progress=True,
)


In [None]:
# PubMed creates four dataframes:
# pub  - keeps all of the publication information
# columns : ['PublicationId', 'Title', 'Year', 'Month', 'Day', 'Doi', 'JournalId', 'ISSN', 'Volume', 'Issue', 'Pages', 'TeamSize']

# paa  - links the publications to authors and affiliations 
# NOTE: PubMed does not disambiguate authors!!!
# columns : ['PublicationId', 'FirstName', 'LastName', 'FullName', 'Affiliations', 'AuthorSequence']

# pub2field  - links the publications to fields (aka subjectAreas)
# columns : ['PublicationId', 'FieldId']

# pub2ref  - keeps the citation information
# columns : ['CitingPublicationId', 'CitedPublicationId']


In [None]:
# Let's plot the number of publications each year.
# Count the unique PublicationId values per Year, then order the rows by Year.
yearly_articles = pyscisci.groupby_count(
    df=mypubmed.pub,
    colgroupby='Year',
    colcountby='PublicationId',
    count_unique=True,
    show_progress=True,
).sort_values(by='Year')

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 5))

# Yearly counts are drawn on a logarithmic y-axis.
ax.plot(yearly_articles['Year'], yearly_articles['PublicationIdCount'])
ax.set(xlabel='Year', ylabel="# of publications", yscale='log')

plt.show()