In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import json
import operator
from itertools import chain

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

In [None]:
# Apply seaborn's "darkgrid" style to the plots drawn in this notebook.
sns.set_style("darkgrid")

In [None]:
# helpers to deal with various datatypes
def JSONParser(data):
    """Decode one JSON-encoded CSV cell, mapping "no data" to ``None``.

    Intended as a ``pandas.read_csv`` converter. A converter receives the raw
    cell text, so a blank cell arrives as an empty string, which is not valid
    JSON; it is treated as missing instead of aborting the whole load.

    Parameters
    ----------
    data : str or None
        Raw cell content.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the cell is blank/``None``
        or decodes to a falsy value (``[]``, ``{}``, ``""``, ``0``,
        ``false``, ``null``).

    Raises
    ------
    json.JSONDecodeError
        If the cell contains non-blank text that is not valid JSON.
    """
    # Blank cells would make json.loads raise; report them as missing.
    if data is None or (isinstance(data, str) and not data.strip()):
        return None

    parsed = json.loads(data)
    # Normalise empty containers and other falsy results to None.
    return parsed if parsed else None

# Per-column converters for read_csv: identifier/text columns are kept as
# str, the publication year is parsed with int, and the remaining columns
# are decoded with JSONParser.
text_columns = ("pmid", "doi", "title", "journal")
json_columns = ("pub_types", "mesh_terms", "grants", "authors", "author_affils")

converters = dict.fromkeys(text_columns, str)
converters["pub_year"] = int
converters.update(dict.fromkeys(json_columns, JSONParser))

# read in data
df = pd.read_csv("../data/full_cancer_data.csv", converters=converters)

In [None]:
# Deduplicate: keep only the first row seen for each pmid.
# TODO: the deduplication still needs a closer look.
is_repeated_pmid = df["pmid"].duplicated(keep='first')
df = df.loc[~is_repeated_pmid]

In [None]:
# 2016 subset: rows whose pub_year equals 2016
published_2016 = df['pub_year'] == 2016
df16 = df.loc[published_2016]

In [None]:
# Selected MeSH descriptors and qualifiers
# all keywords have been transformed to lowercase

# Descriptors that each get their own dummy column; they are used as lookup
# keys into every record's mesh_terms.
# NOTE(review): the official MeSH heading is "Lymphoma, Non-Hodgkin" (no
# trailing "s") — confirm that "lymphoma, non-hodgkins" really matches the
# keys stored in the data, otherwise its columns will always be 0.
meshterms = ["urinary bladder neoplasms",
            "breast neoplasms",
            "colorectal neoplasms",
            "endometrial neoplasms",
            "kidney neoplasms",
            "leukemia",
            "liver neoplasms",
            "lung neoplasms",
            "melanoma",
            "lymphoma, non-hodgkins",
            "pancreatic neoplasms",
            "prostatic neoplasms",
            "thyroid neoplasms"]

# Qualifiers that get a combined descriptor/qualifier dummy column; any
# qualifier not listed here is ignored when the dummies are built.
meshqual = ["diagnosis",
            "diagnostic imaging",
            "mortality",
            "therapy",
            "diet therapy",
            "drug therapy",
            "nursing",
            "prevention & control",
            "radiotherapy",
            "rehabilitation",
            "surgery",
            "transplantation"]

In [None]:
def mesh_hash(desc, qual=None):
    """Build the short column label for a MeSH descriptor/qualifier pair.

    Each part is abbreviated to the first four characters of its first
    space-separated word. 'diagnostic imaging' additionally receives an
    '-img' suffix, which keeps it distinct from 'diagnosis' (both
    abbreviate to 'diag').

    Returns the descriptor abbreviation alone when ``qual`` is falsy,
    otherwise '<descriptor>_<qualifier>'.
    """
    def _abbrev(term):
        # split(' ') always yields at least one piece, so index 0 is either
        # the first word or the whole term when it contains no space.
        return term.split(' ')[0][:4]

    desc_code = _abbrev(desc)
    if not qual:
        return desc_code

    qual_code = _abbrev(qual)
    if qual == 'diagnostic imaging':
        qual_code += "-img"
    return '{}_{}'.format(desc_code, qual_code)
    
# Column labels for the dummy table: for every descriptor its own label
# followed by one label per qualifier, with 'pmid' as the final column.
categories = []
for desc in meshterms:
    categories += [mesh_hash(desc)]
    categories += [mesh_hash(desc, qual) for qual in meshqual]
categories += ['pmid']

## Dummy variables

In [None]:
# dummy variables for MeSH terms
# One row per 2016 record: a 0/1 flag for every label in `categories`
# (all columns except the trailing 'pmid'), followed by the record's pmid.

# Resolve each label's position once instead of scanning `categories`
# with list.index() for every hit.
column_pos = {label: pos for pos, label in enumerate(categories)}
# Derive the number of flag columns from `categories` so the row width
# stays correct if meshterms/meshqual are edited.
n_flags = len(categories) - 1

rows = []
for pmid, terms in zip(df16['pmid'], df16['mesh_terms']):
    flags = [0] * n_flags + [pmid]

    if terms:
        # presumably `terms` maps descriptor -> list of qualifiers — verify
        # against the data file
        for desc_name in meshterms:
            if desc_name not in terms:
                continue
            flags[column_pos[mesh_hash(desc_name)]] = 1
            # `or ()` tolerates a descriptor stored with a null qualifier list
            for qual_name in terms[desc_name] or ():
                if qual_name in meshqual:
                    flags[column_pos[mesh_hash(desc_name, qual_name)]] = 1

    rows.append(flags)
mesh_dummies = pd.DataFrame(rows, columns=categories)

In [None]:
# dummy variables for the publication types
# Spread every record's pub_types entries over positional columns, then
# one-hot encode those columns with unprefixed labels.
# NOTE(review): a publication type that occurs at different positions
# presumably yields several identically named columns — confirm whether
# they should be collapsed.
pub_types_wide = pd.DataFrame(df16['pub_types'].tolist())
pubtype_dummies = pd.get_dummies(pub_types_wide, prefix='', prefix_sep='')

## Merge and export CSV

In [None]:
result = df16[["pmid","doi","title","journal","pub_year","pub_types","grants","authors","author_affils"]]
# DataFrame.merge returns a new frame and leaves `result` untouched, so the
# return value must be kept; otherwise the exported CSV contains no dummy
# columns at all.
result = result.merge(mesh_dummies, on='pmid', how='inner')

# Write results to CSV
# The merged frame carries a fresh 0..n-1 index, which to_csv writes as the
# first (unnamed) column.
result.to_csv("../data/output/2016_with_dummies.csv")