In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import json
import re
import operator
from itertools import chain

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

# Select seaborn's "darkgrid" plot style for this session.
sns.set_style(style="darkgrid")

In [120]:
# helpers to deal with various datatypes
def JSONParser(data):
    """Decode one JSON-encoded CSV cell, mapping "nothing there" to ``None``.

    Intended for use as a ``pd.read_csv`` converter for JSON-encoded columns.

    Parameters
    ----------
    data : str or None
        Raw cell text.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the cell is missing, blank,
        or decodes to a falsy value (``{}``, ``[]``, ``""``, ``0``, ``false``,
        ``null``).

    Raises
    ------
    json.JSONDecodeError
        If the cell holds non-blank text that is not valid JSON.
    """
    # json.loads("") raises, so a blank cell has to be handled before decoding.
    if data is None or (isinstance(data, str) and not data.strip()):
        return None
    # `or None` collapses every falsy JSON value into one "no data" sentinel.
    return json.loads(data) or None

# Per-column converters for the publication table: identifiers and free text
# stay strings, the year becomes an int, the JSON-encoded columns are decoded.
converters = dict.fromkeys(("pmid", "doi", "title", "journal"), str)
converters["pub_year"] = int
converters.update(
    dict.fromkeys(
        ("pub_types", "mesh_terms", "grants", "authors", "author_affils"),
        JSONParser,
    )
)

# read in data
df = pd.read_csv("../data/full_cancer_data.csv", converters=converters)

In [261]:
# helpers to deal with various datatypes
def JSONParser(data):
    """Decode one JSON-encoded CSV cell, mapping "nothing there" to ``None``.

    Intended for use as a ``pd.read_csv`` converter for JSON-encoded columns.

    Parameters
    ----------
    data : str or None
        Raw cell text.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the cell is missing, blank,
        or decodes to a falsy value (``{}``, ``[]``, ``""``, ``0``, ``false``,
        ``null``).

    Raises
    ------
    json.JSONDecodeError
        If the cell holds non-blank text that is not valid JSON.
    """
    # json.loads("") raises, so a blank cell has to be handled before decoding.
    if data is None or (isinstance(data, str) and not data.strip()):
        return None
    # `or None` collapses every falsy JSON value into one "no data" sentinel.
    return json.loads(data) or None

# Per-column converters for the Altmetric dump: ids and timestamps stay
# strings, the API response is decoded from JSON.
converters = dict.fromkeys(("pmid", "timestamp"), str)
converters["am_response"] = JSONParser

# read in data and key it by PubMed id
altmetric = pd.read_csv(
    "../data/altmetrics/pubmed.csv",
    converters=converters,
).set_index('pmid')

In [238]:
# Keep only the first record for every pmid.
# TODO: still need to look into why duplicates occur.
is_repeat = df['pmid'].duplicated(keep='first')
df = df.loc[~is_repeat]

# Publications from 2016 only
df16 = df.loc[df['pub_year'] == 2016]
df16.columns

Index(['pmid', 'doi', 'title', 'journal', 'pub_year', 'pub_types',
       'mesh_terms', 'grants', 'authors', 'author_affils'],
      dtype='object')

## Create MeSH desc and qual dummies

Terms that fall within the subtree of one of the 13 selected top-level MeSH descriptors are aggregated under that top-level descriptor.

In [99]:
# The 13 top-level MeSH descriptors that get their own dummy column.
mesh_top13 = [
    "Urinary Bladder Neoplasms",
    "Breast Neoplasms",
    "Colorectal Neoplasms",
    "Endometrial Neoplasms",
    "Kidney Neoplasms",
    "Leukemia",
    "Liver Neoplasms",
    "Lung Neoplasms",
    "Melanoma",
    "Lymphoma, Non-Hodgkin",
    "Pancreatic Neoplasms",
    "Prostatic Neoplasms",
    "Thyroid Neoplasms",
]

# The 12 MeSH qualifiers that get their own dummy column.
mesh_qual = [
    "diagnosis",
    "diagnostic imaging",
    "mortality",
    "therapy",
    "diet therapy",
    "drug therapy",
    "nursing",
    "prevention & control",
    "radiotherapy",
    "rehabilitation",
    "surgery",
    "transplantation",
]

In [70]:
# Two lookup tables built from the MeSH descriptor file:
terms = {}    # descriptor name -> its tree numbers, joined by single spaces
numbers = {}  # tree number     -> descriptor name

meshFile = '../data/mesh_2018/d2018.bin'
with open(meshFile, mode='r') as file:
    mesh = file.readlines()

# An "MH = <name>" line sets the current descriptor; every following
# "MN = <tree number>" line is attached to it until the next "MH" line.
term = None
for line in mesh:
    meshTerm = re.search('MH = (.+)$', line)
    if meshTerm:
        term = meshTerm.group(1)
    meshNumber = re.search('MN = (.+)$', line)
    # Skip a tree number that appears before any descriptor name was seen
    # instead of failing on an unbound `term`.
    if meshNumber and term is not None:
        number = meshNumber.group(1)
        numbers[number] = term
        if term in terms:
            terms[term] = terms[term] + ' ' + number
        else:
            terms[term] = number

In [222]:
# only use disease mesh terms (tree numbers starting with "C")
c_keys = [key for key in sorted(numbers) if key.startswith("C")]

# For every top-level descriptor, collect the lower-cased names of all "C"
# descriptors whose tree number contains one of the descriptor's own tree
# numbers. The keys are the names in `mesh_top13`, because those are the
# dummy columns that `create_dummies` writes to.
lookups = {
    top_term: {
        numbers[key].lower()
        for subnr in terms[top_term].split(" ")
        for key in c_keys
        if subnr in key
    }
    for top_term in mesh_top13
}

In [307]:
# Base table for the export: the 2016 publications keyed by pmid.
out = df16[['pmid', 'doi', 'title', 'pub_year', 'mesh_terms']].set_index('pmid')
add_cols = ['news_mention']

# Append the Altmetric response plus one all-False flag column per top-level
# descriptor, qualifier and additional flag.
# - The flag block is sized from `out` and the column lists rather than from
#   hard-coded row/column counts, so it stays correct when the data changes.
# - `reindex(out.index)` aligns the Altmetric responses to the publications
#   (pmids without a response become NaN, extra responses are dropped); this
#   replaces the `join_axes` argument, which pandas.concat no longer accepts.
flag_cols = mesh_top13 + mesh_qual + add_cols
out = pd.concat(
    [out,
     altmetric['am_response'].reindex(out.index),
     pd.DataFrame(np.zeros((len(out), len(flag_cols)), dtype=bool),
                  index=out.index, columns=flag_cols)],
    axis=1)

In [311]:
def create_dummies(row):
    """Set the MeSH descriptor, MeSH qualifier and news-mention flags of a row.

    Reads the module-level ``lookups`` (top-level descriptor -> set of
    lower-cased descriptor names) and ``mesh_qual`` (qualifier names).

    Parameters
    ----------
    row : pandas.Series or mapping
        One record. ``row.mesh_terms`` is expected to map a descriptor name to
        an iterable of its qualifiers (or be empty/None); ``am_response`` is
        expected to be the decoded Altmetric response (or missing/None).

    Returns
    -------
    The same ``row`` object, with
    * ``row[top_term] = True`` for every key of ``lookups`` whose name set
      contains one of the row's descriptors,
    * ``row[qualifier] = True`` for every qualifier of the row that is listed
      in ``mesh_qual``,
    * ``row['news_mention'] = True`` when ``am_response['posts']`` has a
      ``'news'`` key.
    """
    mesh_terms = row.mesh_terms
    if isinstance(mesh_terms, dict):
        for mt, mqs in mesh_terms.items():
            # `lookups` stores lower-cased names, so compare in lower case.
            descriptor = mt.lower()
            for topterm, subterms in lookups.items():
                if descriptor in subterms:
                    row[topterm] = True
            # Qualifiers do not depend on the top-level term, so they are
            # checked once per descriptor rather than once per `lookups` entry.
            for mq in mqs or ():
                if mq in mesh_qual:
                    row[mq] = True

    # Explicit structure checks replace a bare `except:` so that only "no
    # news posts recorded" is tolerated and real errors are not hidden.
    response = row.get('am_response')
    posts = response.get('posts') if isinstance(response, dict) else None
    if isinstance(posts, dict) and 'news' in posts:
        row['news_mention'] = True
    return row

# Fill in the flag columns row by row, then store them as floats.
out = out.apply(create_dummies, axis=1)
out = out.astype(
    {column: float for column in mesh_qual + mesh_top13 + ['news_mention']}
)

In [321]:
# Leave out the raw nested columns and write the remaining table to disk.
df_out = out.drop(labels=['mesh_terms', 'am_response'], axis='columns')
df_out.to_csv("test.csv")

# Create Dummy Variables for MeSH Desc+Qual Pairs

In [None]:
def mesh_hash(desc, qual=None):
    """Build a short column label for a descriptor or descriptor/qualifier pair.

    Each name is abbreviated to the first four characters of its first
    space-separated word. The qualifier 'diagnostic imaging' gets an extra
    "-img" suffix, which keeps it distinct from 'diagnosis' (both abbreviate
    to "diag").

    Parameters
    ----------
    desc : str
        Descriptor name.
    qual : str, optional
        Qualifier name; a falsy value means "descriptor only".

    Returns
    -------
    str
        ``"<desc4>"`` without a qualifier, otherwise ``"<desc4>_<qual4>"``.
    """
    def _abbreviate(label):
        # First word only, cut to at most four characters.
        return label.split(' ')[0][:4]

    desc_code = _abbreviate(desc)
    if not qual:
        return desc_code

    qual_code = _abbreviate(qual)
    if qual == 'diagnostic imaging':
        qual_code += "-img"
    return '{}_{}'.format(desc_code, qual_code)
    
# Column layout: for each top-level descriptor its own label followed by one
# label per (descriptor, qualifier) pair, and 'pmid' as the last column.
categories = []
for desc in mesh_top13:
    categories.append(mesh_hash(desc))
    for qual in mesh_qual:
        categories.append(mesh_hash(desc, qual))
categories.append('pmid')

# The abbreviated labels must be unique, otherwise two flags would share a column.
assert len(set(categories)) == len(categories), "mesh_hash produced duplicate labels"

# Label -> column position, so setting a flag needs no linear list search.
positions = {label: pos for pos, label in enumerate(categories)}

# dummy variables for MeSH terms
rows = []
for _, row in df16[['pmid', 'mesh_terms']].iterrows():
    # One 0/1 flag per label, with the pmid in the final slot; the width is
    # derived from `categories` rather than hard-coded.
    flags = [0] * (len(categories) - 1) + [row['pmid']]

    if row['mesh_terms']:
        # Only descriptors that are themselves top-level terms are flagged.
        for key in mesh_top13:
            if key not in row['mesh_terms']:
                continue
            flags[positions[mesh_hash(key)]] = 1
            for qual in row['mesh_terms'][key] or ():
                if qual in mesh_qual:
                    flags[positions[mesh_hash(key, qual)]] = 1

    rows.append(flags)
mesh_dummies = pd.DataFrame(rows, columns=categories)

In [None]:
# dummy variables for the publication types: spread each record's list of
# publication types over columns, then one-hot encode them without a prefix
pubtype_dummies = pd.get_dummies(
    pd.DataFrame(df16['pub_types'].tolist()),
    prefix='',
    prefix_sep='',
)

## Merge and export CSV

In [None]:
result = df16[["pmid", "doi", "title", "journal", "pub_year",
               "pub_types", "grants", "authors", "author_affils"]]
# DataFrame.merge returns a new frame; assign it, otherwise the exported
# table would not contain the dummy columns.
result = result.merge(mesh_dummies, on='pmid', how='inner')

# Write results to CSV
result.to_csv("../data/output/2016_with_dummies.csv")