In [5]:
%matplotlib inline

import pandas as pd
import numpy as np
import json
import re
import operator
from itertools import chain
from dateutil.parser import parse

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

from pprint import pprint




In [6]:
# helpers to deal with various datatypes
def JSONParser(data):
    """Decode one JSON-encoded CSV cell, mapping empty values to ``None``.

    Intended as a ``converters`` callback for ``pd.read_csv``.

    Parameters
    ----------
    data : str or None
        Raw cell text.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the cell is blank or decodes
        to a falsy value (``null``, ``[]``, ``{}``, ``""``, ``0``, ``false``).

    Raises
    ------
    json.JSONDecodeError
        If the cell contains non-blank text that is not valid JSON.
    """
    # A blank cell is not valid JSON; treat it as "missing" instead of letting
    # json.loads abort the whole CSV load.
    if data is None or not data.strip():
        return None
    # Falsy decoded values are normalised to None.
    return json.loads(data) or None

# Per-column parsers handed to read_csv: identifiers and free text stay
# strings, the publication year becomes an int ...
converters = {
    "pmid": str,
    "doi": str,
    "title": str,
    "journal": str,
    "pub_year": int,
}
# ... and every JSON-encoded column is decoded by JSONParser.
converters.update({
    json_column: JSONParser
    for json_column in ("pub_types", "mesh_terms", "grants", "authors", "author_affils")
})

# read in data
df = pd.read_csv("../data/full_cancer_data.csv", converters=converters)

In [7]:
# helpers to deal with various datatypes
def JSONParser(data):
    """Decode one JSON-encoded CSV cell, mapping empty values to ``None``.

    Intended as a ``converters`` callback for ``pd.read_csv``.

    Parameters
    ----------
    data : str or None
        Raw cell text.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the cell is blank or decodes
        to a falsy value (``null``, ``[]``, ``{}``, ``""``, ``0``, ``false``).

    Raises
    ------
    json.JSONDecodeError
        If the cell contains non-blank text that is not valid JSON.
    """
    # A blank cell is not valid JSON; treat it as "missing" instead of letting
    # json.loads abort the whole CSV load.
    if data is None or not data.strip():
        return None
    # Falsy decoded values are normalised to None.
    return json.loads(data) or None

# Parsers for the altmetric dump: ids and timestamps stay text, the response
# payload is decoded from JSON.
converters = dict(pmid=str, timestamp=str, am_response=JSONParser)

# read in data and index it by pmid
altmetric = pd.read_csv(
    "../data/altmetrics/pubmed.csv",
    converters=converters,
).set_index('pmid')

In [8]:
# Deduplicate !!still need to look into that!!
# Only the first row of every repeated pmid is kept.
# NOTE: this cell turns `pmid` into the index, so it can only be run once
# per load of `df`.
is_repeated_pmid = df["pmid"].duplicated(keep='first')
df = df.loc[~is_repeated_pmid].set_index("pmid")

# Restrict the analysis to articles published in 2016.
df = df.loc[df['pub_year'] == 2016]
df.columns

Index(['doi', 'title', 'journal', 'pub_year', 'pub_types', 'mesh_terms',
       'grants', 'authors', 'author_affils'],
      dtype='object')

# PubMed articles

## Create MeSH descriptor and qualifier dummies

Each of the top 13 MeSH descriptors is treated as a top-level term: every term found in the selected subtrees of a descriptor is aggregated under that descriptor.

In [9]:
# Top-level MeSH descriptors; one dummy column is created per entry.
mesh_top13 = [
    "Urinary Bladder Neoplasms",
    "Breast Neoplasms",
    "Colorectal Neoplasms",
    "Endometrial Neoplasms",
    "Kidney Neoplasms",
    "Leukemia",
    "Liver Neoplasms",
    "Lung Neoplasms",
    "Melanoma",
    "Lymphoma, Non-Hodgkin",
    "Pancreatic Neoplasms",
    "Prostatic Neoplasms",
    "Thyroid Neoplasms",
]

# MeSH qualifiers; one dummy column is created per entry.
mesh_qual = [
    "diagnosis",
    "diagnostic imaging",
    "mortality",
    "therapy",
    "diet therapy",
    "drug therapy",
    "nursing",
    "prevention & control",
    "radiotherapy",
    "rehabilitation",
    "surgery",
    "transplantation",
]

# Funding-related publication types, looked up in each article's `pub_types`.
funding_types = [
    "Research Support, American Recovery and Reinvestment Act",
    "Research Support, N.I.H., Extramural",
    "Research Support, N.I.H., Intramural",
    "Research Support, Non-U.S. Gov't",
    "Research Support, U.S. Gov't, Non-P.H.S.",
    "Research Support, U.S. Gov't, P.H.S.",
    "Research Support, U.S. Government",
]

## Prepare MeSH terms

In [10]:
# Parse the MeSH descriptor file into two lookup tables:
#   terms:   descriptor name -> its tree number(s), joined by single spaces
#   numbers: tree number     -> descriptor name
terms = {}
numbers = {}

meshFile = '../data/mesh_2018/d2018.bin'
with open(meshFile, mode='r') as file:
    mesh = file.readlines()

term = None  # name from the most recent "MH = ..." line
for line in mesh:
    meshTerm = re.search('MH = (.+)$', line)
    if meshTerm:
        term = meshTerm.group(1)
    meshNumber = re.search('MN = (.+)$', line)
    if meshNumber:
        if term is None:
            # A tree number with no preceding heading cannot be attributed.
            raise ValueError("MN line found before any MH line: %r" % line)
        number = meshNumber.group(1)
        numbers[number] = term
        if term in terms:
            terms[term] = terms[term] + ' ' + number
        else:
            terms[term] = number

# only use disease mesh terms (tree numbers starting with "C")
c_keys = sorted(key for key in numbers if key.startswith("C"))

# Create the set of relevant (lower-cased) mesh terms for each top 13 term:
# the descriptor itself plus everything below it in the tree. A tree number
# belongs to a subtree when it equals the root number or extends it with a
# further ".xxx" segment.
lookups = {}
for t in mesh_top13:
    roots = terms[t].split(" ")
    lookups[t] = {
        numbers[key].lower()
        for key in c_keys
        if any(key == root or key.startswith(root + ".") for root in roots)
    }

## Create dummy variables for MeSH, funding, and news

In [11]:
out = df[['doi', 'title', 'pub_year', 'mesh_terms', 'pub_types']]
add_cols = ['news_mention', 'news_count']

# One all-False block per group of dummy columns, aligned to the article index.
# Sizes are taken from the data and the column lists instead of being
# hard-coded, so the cell keeps working when the input changes.
dummy_blocks = [
    pd.DataFrame(False, index=out.index, columns=cols, dtype=bool)
    for cols in (mesh_top13, mesh_qual, funding_types, add_cols)
]

# Attach the altmetric response and the dummy blocks. `join_axes` is no longer
# accepted by pd.concat; reindexing the result to the article index keeps
# exactly the rows of `out`, in their original order (articles without an
# altmetric record get NaN in `am_response`).
out = pd.concat(
    [out, altmetric['am_response']] + dummy_blocks,
    axis=1,
).reindex(out.index)

In [12]:
def create_dummies(row):
    """Fill the dummy columns of one article row.

    Reads the module-level ``lookups``, ``mesh_qual`` and ``funding_types``.

    Parameters
    ----------
    row : pd.Series
        One row of ``out``. Reads ``mesh_terms`` (mapping of MeSH term ->
        iterable of qualifiers, or a falsy value), ``pub_types`` (container of
        publication types, or a falsy value) and ``am_response``.

    Returns
    -------
    pd.Series
        A copy of ``row`` in which
        * every top-level term whose lookup set contains one of the row's
          MeSH terms is set to True,
        * every qualifier of the row that is listed in ``mesh_qual`` is set
          to True,
        * every entry of ``funding_types`` present in ``pub_types`` is set
          to True,
        * ``news_count`` / ``news_mention`` are set when ``am_response``
          carries a news post count.
    """
    # Work on a copy: mutating the object handed in by DataFrame.apply is not
    # supported by pandas.
    row = row.copy()

    # create dummies for mesh terms
    mesh_terms = row.mesh_terms
    if mesh_terms:
        for mt, mqs in mesh_terms.items():
            # `lookups` holds lower-cased names, so compare case-insensitively.
            mt_lower = mt.lower()
            for topterm, subterms in lookups.items():
                if mt_lower in subterms:
                    row[topterm] = True
            # Qualifiers do not depend on the top-level term, so they are
            # handled once per MeSH term rather than once per lookup entry.
            for mq in (mqs or ()):
                if mq in mesh_qual:
                    row[mq] = True

    # create dummies for funding type
    pub_types = row.pub_types
    if pub_types:
        for funding in funding_types:
            if funding in pub_types:
                row[funding] = True

    # count news mentions
    try:
        news_count = row['am_response']['counts']['news']['posts_count']
    except (KeyError, TypeError):
        # No altmetric record (NaN/None) or a record without news counts:
        # leave the news columns untouched.
        pass
    else:
        row['news_count'] = news_count
        row['news_mention'] = True
    return row

# Fill the dummy columns row by row (with a progress bar).
out = out.progress_apply(create_dummies, axis=1)

# convert bool to float columns
dummy_cols = mesh_qual + mesh_top13 + funding_types + add_cols
out[dummy_cols] = out[dummy_cols].astype(float)




## Export files

In [24]:
# (target file, columns written to it) -- written in this order, each with
# the pmid index as its first column.
exports = [
    ("metadata_and_news.csv", ['doi', 'pub_year', 'title', 'news_mention', 'news_count']),
    ("mesh_term_dummies.csv", mesh_top13),
    ("mesh_subterm_dummies.csv", mesh_qual),
    ("funding_dummies.csv", funding_types),
]
for export_path, export_cols in exports:
    out[export_cols].to_csv(export_path)

# News coverage

In [51]:
# Collect, per article, the list of news posts found in its altmetric response.
pmids = []
news_mentions = []

for pmid, row in altmetric[~altmetric.am_response.isnull()].iterrows():
    try:
        mentions = row['am_response']['posts']['news']
    except (KeyError, TypeError):
        # Response without a posts/news section: the article has no news
        # coverage to record.
        continue
    news_mentions.append(mentions)
    pmids.append(pmid)

# Flatten to one entry per news post; the lists below are parallel and are
# turned into the columns of the exported table.
article_pmids = []
venue_name = []
venue_url = []
date = []
summary = []
title = []
url = []

for pmid, nms in zip(pmids, news_mentions):
    for nm in nms:
        # Read every required field before appending anything, so a malformed
        # post cannot leave the parallel lists with different lengths.
        author = nm['author']
        nm_venue_name = author['name']
        nm_venue_url = author['url']
        nm_date = str(parse(nm['posted_on']))
        nm_url = nm['url']

        article_pmids.append(pmid)
        venue_name.append(nm_venue_name)
        venue_url.append(nm_venue_url)
        date.append(nm_date)
        # summary and title are optional; missing values are stored as None
        summary.append(nm.get('summary'))
        title.append(nm.get('title'))
        url.append(nm_url)

## Export news coverage details

In [52]:
# Assemble the parallel per-post lists into one table (one row per news post)
# and write it out.
news_columns = dict(
    pmid=article_pmids,
    venue_name=venue_name,
    venue_url=venue_url,
    date=date,
    summary=summary,
    title=title,
    url=url,
)
news_out = pd.DataFrame(news_columns)

news_out.to_csv('news_coverage_details.csv')