# Article type analysis for BioCreative test set

Tong Shu Li<br>
Created on: 2015-11-23<br>
Last updated: 2015-11-24

Do the types of PubMed abstracts affect the performance of the crowd and machine learning systems? Is the gold standard more consistent for some types of articles?

In [1]:
from collections import defaultdict
import os
import pandas as pd
import pickle
import sys

In [2]:
# Put the parent directory on the import path so the `src` package imported below resolves.
sys.path.append("..")

In [3]:
from src.get_mesh_terms import Article

from src.data_model import parse_file

from src.lingpipe.file_util import save_file

---

## Grab all PMIDs and get their summaries from PubMed

In [4]:
def read_gold_standard(dataset, file_format = "list"):
    """Load one split of the gold standard through `parse_file`.

    Args:
        dataset: Which split to read; one of "training", "development", "test".
        file_format: Forwarded to `parse_file` as `return_format`; "list" or "dict".

    Returns:
        Whatever `parse_file` returns for the requested split and format.

    Raises:
        AssertionError: If `dataset` or `file_format` is not an accepted value.
    """
    assert dataset in ("training", "development", "test")
    assert file_format in ("list", "dict")

    gold_dir = os.path.abspath(os.path.join("..", "data", "gold_standard"))

    # Raw annotation file for the split, e.g. "CDR_TestSet.txt".
    raw_name = "CDR_{0}Set.txt".format(dataset.capitalize())

    # Pickle path handed to parse_file as its first argument
    # (presumably where the parsed result is cached -- confirm in src.data_model).
    pickle_name = "parsed_{0}_set_{1}.pickle".format(dataset, file_format)
    pickle_path = os.path.join(gold_dir, pickle_name)

    return parse_file(
        pickle_path,
        loc = gold_dir,
        fname = raw_name,
        is_gold = True,
        return_format = file_format,
        fix_acronyms = False
    )

In [5]:
# Gold standard for the test split in "dict" form; its keys are used as the PMIDs below.
eval_gold = read_gold_standard("test", file_format = "dict")

In [6]:
def grab_mesh(papers):
    """Return a dict mapping each PMID in `papers` to its `Article` object.

    The mapping is stored at data/gold_standard/testset_mesh_terms.pickle via
    `save_file`. When `save_file(path)` returns a non-None value, that value is
    returned as-is and no `Article` objects are constructed.
    """
    cache_path = os.path.abspath(
        os.path.join("..", "data", "gold_standard", "testset_mesh_terms.pickle")
    )

    # NOTE(review): save_file called with only a path appears to act as a load that
    # yields None when nothing is stored -- confirm in src.lingpipe.file_util.
    cached = save_file(cache_path)
    if cached is not None:
        return cached

    articles = {pmid: Article(pmid) for pmid in papers}
    save_file(cache_path, articles)
    return articles

In [7]:
# One Article per PMID in the test-set gold standard (built by grab_mesh or returned from its pickle).
mesh_terms = grab_mesh(set(eval_gold.keys()))

---

### Combine publication type information into one dataframe

In [8]:
# Stack every article's publication-type table into one long frame, tagging each
# row with the PMID it came from.  `assign` returns a new frame, so the Article
# objects held in `mesh_terms` are not modified (the previous loop wrote the
# "pmid" column into each `article.pub_type` in place).
# Assumes `article.pub_type` is a DataFrame, as the concatenated output suggests.
pub_type_frames = [
    article.pub_type.assign(pmid = pmid)
    for pmid, article in mesh_terms.items()
]

pub_types = pd.concat(pub_type_frames)

In [9]:
pub_types.head()

Unnamed: 0,pub_name,pub_type,pmid
0,Journal Article,D016428,23433219
1,"Research Support, Non-U.S. Gov't",D013485,23433219
0,Journal Article,D016428,1360900
1,"Research Support, Non-U.S. Gov't",D013485,1360900
0,Case Reports,D002363,733189


In [10]:
pub_types["pub_name"].value_counts()

Journal Article                             495
Research Support, Non-U.S. Gov't            156
Case Reports                                140
Comparative Study                            46
Clinical Trial                               43
Research Support, U.S. Gov't, P.H.S.         40
Randomized Controlled Trial                  29
Review                                       20
Research Support, N.I.H., Extramural         16
Multicenter Study                             9
Controlled Clinical Trial                     8
Clinical Trial, Phase II                      8
Research Support, U.S. Gov't, Non-P.H.S.      6
Observational Study                           3
JOURNAL ARTICLE                               3
Research Support, N.I.H., Intramural          3
Meta-Analysis                                 2
Clinical Trial, Phase I                       2
Evaluation Studies                            2
Letter                                        1
Newspaper Article                             1
dtype: int64

In [11]:
# how many different labels does each paper have?
# (rows per PMID, then a tally of how often each row count occurs)
pub_types.groupby("pmid").size().value_counts()

2    268
1    124
3     71
4     28
5      6
6      3
dtype: int64

In [12]:
# Show the distinct MeSH identifier(s) recorded under each publication type name.
for pub_name, rows in pub_types.groupby("pub_name"):
    print(pub_name)
    print(rows["pub_type"].unique())

Case Reports
['D002363']
Clinical Trial
['D016430']
Clinical Trial, Phase I
['D017426']
Clinical Trial, Phase II
['D017427']
Comparative Study
['D003160']
Controlled Clinical Trial
['D018848']
Evaluation Studies
['D023362']
JOURNAL ARTICLE
['']
Journal Article
['D016428']
Letter
['D016422']
Meta-Analysis
['D017418']
Multicenter Study
['D016448']
Newspaper Article
['D018431']
Observational Study
['D064888']
Randomized Controlled Trial
['D016449']
Research Support, N.I.H., Extramural
['D052061']
Research Support, N.I.H., Intramural
['D052060']
Research Support, Non-U.S. Gov't
['D013485']
Research Support, U.S. Gov't, Non-P.H.S.
['D013486']
Research Support, U.S. Gov't, P.H.S.
['D013487']
Review
['D016454']


In [13]:
# "JOURNAL ARTICLE" rows carry an empty MeSH id; fold them into Journal Article (D016428).
# The mask is taken once, before either column is rewritten.
is_upper_journal = pub_types["pub_name"] == "JOURNAL ARTICLE"
pub_types.loc[is_upper_journal, "pub_type"] = "D016428"
pub_types.loc[is_upper_journal, "pub_name"] = "Journal Article"

### We want to divide the 500 papers into distinct categories

The source of research funding says nothing about the structure or content of an abstract, so we disregard the "Research Support" publication types.

In [14]:
# Flag funding-source labels (names beginning with "Research Support") with the
# vectorized string accessor instead of a per-element Python lambda.
pub_types["research_support"] = pub_types["pub_name"].str.startswith("Research Support")

In [15]:
# Keep only the rows that are not funding-source labels.
pub_types = pub_types[~pub_types["research_support"]]

In [16]:
# Also drop the "Multicenter Study" label.
# NOTE(review): the notebook text gives no reason for this exclusion -- confirm intent.
pub_types = pub_types[pub_types["pub_name"] != "Multicenter Study"]

In [17]:
pub_types["pub_name"].value_counts()

Journal Article                498
Case Reports                   140
Comparative Study               46
Clinical Trial                  43
Randomized Controlled Trial     29
Review                          20
Controlled Clinical Trial        8
Clinical Trial, Phase II         8
Observational Study              3
Evaluation Studies               2
Meta-Analysis                    2
Clinical Trial, Phase I          2
Letter                           1
Newspaper Article                1
dtype: int64

In [18]:
# Any label whose name contains the literal text "Trial" counts as a clinical trial.
# `assign` builds a new frame rather than writing a column into the filtered result
# of the earlier `query` calls, which avoids pandas' SettingWithCopyWarning.
pub_types = pub_types.assign(
    clinical_trial = pub_types["pub_name"].str.contains("Trial", regex = False)
)

In [19]:
pub_types.head()

Unnamed: 0,pub_name,pub_type,pmid,research_support,clinical_trial
0,Journal Article,D016428,23433219,False,False
0,Journal Article,D016428,1360900,False,False
0,Case Reports,D002363,733189,False,False
1,Journal Article,D016428,733189,False,False
0,Clinical Trial,D016430,11745287,False,True


---

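## Create three categories (regular, case report, clinical trial) and save info to file

Note: the case report and clinical trial categories are not mutually exclusive; one PMID (11988250) carries both labels, as shown below.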

In [20]:
# PMIDs carrying a "Case Reports" label, and PMIDs carrying any label flagged as a clinical trial.
case_reports = set(pub_types.loc[pub_types["pub_name"] == "Case Reports", "pmid"])
clin_trials = set(pub_types.loc[pub_types["clinical_trial"], "pmid"])

In [21]:
len(case_reports)

140

In [22]:
len(clin_trials)

51

In [23]:
len(clin_trials & case_reports)

1

In [24]:
clin_trials & case_reports

{11988250}

In [25]:
# Every gold-standard PMID that is neither a case report nor a clinical trial.
regular_articles = set(eval_gold.keys()).difference(case_reports, clin_trials)

In [26]:
len(regular_articles)

310

In [27]:
# One row per gold-standard PMID, in a single "pmid" column.
res = pd.DataFrame(pd.Series(list(eval_gold.keys()), name = "pmid"))

In [28]:
# One boolean column per category: is the row's PMID a member of that set?
# `isin` performs the membership test in a vectorized way instead of calling a
# Python lambda per row.
res["regular"] = res["pmid"].isin(regular_articles)
res["case_report"] = res["pmid"].isin(case_reports)
res["clinical_trial"] = res["pmid"].isin(clin_trials)

In [29]:
res.head()

Unnamed: 0,pmid,regular,case_report,clinical_trial
0,23433219,True,False,False
1,1360900,True,False,False
2,733189,False,True,False
3,11745287,False,False,True
4,25907210,True,False,False


In [30]:
# Write the per-PMID category table to the final-evaluation analysis folder.
analysis_dir = os.path.join("..", "data", "final_eval", "analysis")
loc = os.path.abspath(os.path.join(analysis_dir, "testset_article_types.pickle"))
save_file(loc, res)