March 6, 2022

**Analysis**
## Identifying science in the news: An assessment of the precision and recall of Altmetric.com news mention data

_Juan Pablo Alperin, ScholCommLab/School of Publishing, Simon Fraser University_



**Related Publication:**

Fleerackers, A., Nehring, L., Maggio, L.A., Enkhbayar, A., Moorhead, L., Alperin, J.P. (2022). Identifying science in the news: An assessment of the precision and recall of Altmetric.com news mention data. _arXiv_


**Related Data:**

Fleerackers, Alice; Nehring, Lise; Alperin, Juan Pablo; Enkhbayar, Asura; Maggio, Lauren A.; Moorhead, Laura, 2022, "Replication data for Identifying science in the news", [https://doi.org/10.7910/DVN/WNDOFL](https://doi.org/10.7910/DVN/WNDOFL), _Harvard Dataverse_, V1, UNF:6:k9Hv0lysKrB+tQLkdOEZOw== [fileUNF] 

In [None]:
# Set this flag to True if you want to download the dataset
download_files = True

if download_files:
    import os
    from pyDataverse.api import NativeApi, DataAccessApi
    from pyDataverse.models import Dataverse

    # Local folder the later cells read the .tab files from.
    data_dir = 'data/'
    # exist_ok=True replaces the check-then-create pattern, so re-running the
    # cell (or a concurrent creation of the folder) does not raise.
    os.makedirs(data_dir, exist_ok=True)

    base_url = 'https://dataverse.harvard.edu/'

    api = NativeApi(base_url)
    data_api = DataAccessApi(base_url)

    # Persistent identifier of the replication dataset.
    DOI = "doi:10.7910/DVN/WNDOFL"
    dataset = api.get_dataset(DOI)

    # File listing of the latest published version of the dataset.
    files_list = dataset.json()['data']['latestVersion']['files']

    for file in files_list:
        filename = file["dataFile"]["filename"]
        file_id = file["dataFile"]["id"]
        print("File name {}, id {}".format(filename, file_id))

        # Fetch the file by id and store the raw bytes under its original name.
        response = data_api.get_datafile(file_id)
        with open(os.path.join(data_dir, filename), "wb") as f:
            f.write(response.content)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# some helper functions to later help match URLs
def clean_url(url):
    """Normalize a URL string so the same story compares equal across datasets.

    Everything from the first '?' onward is dropped, leading and trailing
    slashes are removed, and the result is lowercased.
    """
    without_query = url.partition('?')[0]
    trimmed = without_query.strip('/')
    return trimmed.lower()

def clean_doi(doi):
    """Normalize an identifier: trim '/', '.' and spaces at both ends, lowercase.

    Plain Python ints are converted to their string form first. Any value
    that is not a string after that step (NaN, None, ...) yields ``np.nan``
    so that missing identifiers stay missing.
    """
    # bool is a subclass of int; it is excluded so True/False still map to NaN,
    # as they did when the check was an exact type comparison.
    if isinstance(doi, int) and not isinstance(doi, bool):
        doi = str(doi)
    # Explicit guard instead of a bare `except:` that would hide any error.
    if not isinstance(doi, str):
        return np.nan
    return doi.strip('/. ').lower()
    
def make_bool(x):
    """Translate a coded cell into a boolean.

    0 or 'No' gives False, 1 or 'Yes' gives True, anything else gives None.
    """
    # Each row: (result, numeric code, text code); checked in this order.
    codings = ((False, 0, 'No'), (True, 1, 'Yes'))
    for result, numeric_code, text_code in codings:
        if x == numeric_code or x == text_code:
            return result
    return None

In [None]:
# Identifier columns in the Altmetric export; all of them are read with dtype str.
id_cols = [
    'DOI', 'ISBN', 'clinical_trial_id', 'URI', 'pubmed_id', 'pmc_id',
    'handle', 'ads_bibcode', 'arxiv_id', 'repec_id', 'SSRN', 'URN',
]
dtypes = dict.fromkeys(id_cols, str)

# Data downloaded from Altmetric Explorer on Sept 9, 2021
# Filter for all research mentions in the following outlets since March 1, 2021:
#    The Guardian, HealthDay, IFLScience, MedPage Today, News Medical, New York Times, Popular Science, and Wired
alt = pd.read_table('data/altmetric_dataset.tab', sep="\t", dtype=dtypes, encoding='utf8')

# Normalize the two fields later used as join keys.
alt['URL'] = alt['URL'].map(clean_url)
alt['DOI'] = alt['DOI'].map(clean_doi)

# Keep the original row position as a stable id for each Altmetric mention.
alt.loc[:, 'alt_id'] = alt.index

alt['outlet'] = alt['outlet'].map(lambda name: name.strip())

# Lowercase every identifier; `value == value` is False only for NaN, so
# missing identifiers are passed through untouched.
alt.loc[:, id_cols] = alt.loc[:, id_cols].applymap(
    lambda value: value.lower() if value == value else value
)

In [None]:
# We found some errors in the identifiers of the Altmetric data that we could obviously correct manually
# Each entry: (story URL, identifier column, value found in Altmetric, corrected value)
manual_fixes = [
    (
        'https://www.iflscience.com/space/european-satellite-finds-12-very-rare-einstein-crosses',
        'arxiv_id',
        '2012',
        '2012.10051',
    ),
    (
        'https://www.wired.com/story/mathematicians-settle-the-erdos-coloring-conjecture',
        'arxiv_id',
        '2101',
        '2101.04698',
    ),
    (
        'https://www.news-medical.net/news/20210421/research-offers-new-insights-on-the-significance-of-hyperinflammation-following-sars-cov-2-infection.aspx',
        'DOI',
        '10.1002/(issn)1529-0131',
        '10.1002/art.41763',
    ),
]

for story_url, id_col, wrong_value, fixed_value in manual_fixes:
    # Only rows with this exact URL *and* the faulty identifier are rewritten.
    needs_fix = (alt.URL == story_url) & (alt[id_col] == wrong_value)
    alt.loc[needs_fix, id_col] = fixed_value

**Some code we used to clean up and standardize the data we coded**

**Final output file in published dataset**

In [None]:
# gold = pd.read_excel('dataset/gold.xlsx', engine='openpyxl')

# gold['URL'] = gold.URL.map(clean_url)
# gold['DOI'] = gold.DOI.map(clean_doi)

# gold.loc[:,'outlet'] = gold.outlet.str.strip()
# gold.loc[:,code_cols] = gold.loc[:,code_cols].applymap(make_bool)
# gold.rename(columns={'DOI': 'identifier'}, inplace=True)

# gold.loc[:,'identifier'] = gold.identifier

# s = 'arxiv:'
# gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'] = gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'].map(lambda x: x[len(s):])
# s = 'pmid: '
# gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'] = gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'].map(lambda x: x[len(s):])

# gold['ResearchMentioned'] = gold.identifier.notnull()
# gold['gold_id'] = gold.index

# gold.loc[:,code_cols] = gold.loc[:,code_cols].applymap(lambda x: int(x) if type(x) == bool else None)

# # lowercase pmc and clinical trial
# gold.to_excel('content_analysis_dataset.csv', index=False)

In [None]:
# Read in coded data
# Names of the coded columns (presumably the content-analysis variables; they
# are not used when loading the file below).
code_cols = [
    'Aggregated',
    'PressRelease',
    'ResearchMentioned',
    'DescribesAsresearch',
    'HasLink',
    'JournalMentioned',
    'AuthorMentioned',
    'InstitutionMentioned',
    'StudyDateMentioned',
]
gold = pd.read_table('data/content_analysis_dataset.tab')

### Table 1. Number of stories and mentions across news outlets

In [None]:
# Per outlet, over rows coded as mentioning research:
#   nunique = distinct story URLs, size = number of rows (mentions).
with_mentions = gold[gold.ResearchMentioned == 1]
tmp = with_mentions.groupby(['outlet'])['URL'].agg(['nunique', 'size'])

# n = distinct story URLs per outlet, with or without a mention.
tmp['n'] = gold.groupby(['outlet'])['URL'].nunique().astype(int)
tmp['pct'] = (tmp['nunique'] / tmp['n'] * 100).round(0)
tmp['avg'] = (tmp['size'] / tmp['nunique']).round(1)
tmp = tmp[['n', 'nunique', 'pct', 'size', 'avg']]

# Summary row: counts are summed, while 'pct' and 'avg' are the unweighted
# mean of the per-outlet values.
tmp.loc['Total'] = tmp.agg({'n': 'sum',
                           'nunique': 'sum',
                           'pct': 'mean',
                           'size': 'sum',
                           'avg': 'mean'})

# Cast the count/percent columns back to integers after adding the Total row.
int_cols = ['n', 'nunique', 'pct', 'size']
tmp.loc[:, int_cols] = tmp.loc[:, int_cols].astype(int)
for col in ('n', 'nunique', 'size'):
    tmp[col] = tmp[col].astype(int)
tmp['avg'] = tmp['avg'].round(1)

tmp.columns = ['Num Stories', 'Num Stories w/ Mentions', 'Percent Stories w/ Mentions', 'Num Mentions', 'Average Mentions / Story']

tmp.to_clipboard()
tmp

### Table 2. How research was mentioned across news outlets

In [None]:
# Coded columns describing how a piece of research was mentioned.
mention_cols = [
    'DescribesAsresearch',
    'HasLink',
    'JournalMentioned',
    'AuthorMentioned',
    'InstitutionMentioned',
    'StudyDateMentioned',
]


def pct(x):
    """Return sum(x) as a percentage of len(x), formatted with no decimals."""
    share = 100 * x.sum() / len(x)
    return f"{share:.0f}"

# For every mention characteristic compute the count ('sum') and the
# percentage of mentions (pct); for URL just count the rows.
agg = {x: ['sum', pct] for x in mention_cols}
agg['URL'] = 'count'

mentions = gold[gold.ResearchMentioned == True]
df = mentions.groupby('outlet').agg(agg)

# Grouping the same rows on ResearchMentioned (a single value after the
# filter above) produces the overall row with an identical column layout.
total = mentions.groupby('ResearchMentioned').agg(agg)
total.index = ['Total']

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on old and new versions.
df = pd.concat([df, total])

# pct returns strings; cast the whole table to integers.
df = df.astype(int)
df = df.rename(columns={'sum': 'Number', 'pct': '%', 'count': 'Num Mentions'}, level=1)

df.to_clipboard()
df


In [None]:
# id_cols = ['DOI', 'ISBN', 'clinical_trial_id', 'URI', 'pubmed_id', 'pmc_id', 'handle', 'ads_bibcode', 'arxiv_id', 'repec_id', 'SSRN', 'URN']
# alt.loc[:,id_cols] = alt.loc[:,id_cols].applymap(lambda x: x.lower() if type(x) == str else x)

In [None]:
# Gold-standard rows that carry an identifier, i.e. actual research mentions.
gold_wm = gold[gold.identifier.notnull()]
N = len(gold_wm)
print("Gold dataset has {} mentions".format(N))

altmetric_urls = set(alt.URL)
could_match = gold_wm[gold_wm.URL.isin(altmetric_urls)]
n = could_match.shape[0]
print("Of those, {} ({:.0f}%) could have ID match (URLs in altmetric)".format(n, n*100/N))

# Try each Altmetric identifier column in turn: a gold mention matches when
# both the story URL and the identifier value agree.
matches_by_type = []
for identifier in id_cols:
    df2 = alt[['URL', 'alt_id', identifier]].copy()
    df2.columns = ['URL', 'alt_id', 'matched_alt_id']
    df2.loc[:,'matched_id_type'] = identifier
    merged = gold_wm.merge(df2, left_on=['URL', 'identifier'], right_on=['URL', 'matched_alt_id'])
    matches_by_type.append(merged)

# DataFrame.append was removed in pandas 2.0; one concat over the collected
# frames keeps the same row order (identifier types in id_cols order).
matched = pd.concat(matches_by_type)

print("Removing {} duplicate matches.".format(matched.duplicated(subset=['gold_id']).sum()))

# shouldn't happen, but altmetric has duplicates sometimes
# (the first match per gold_id is kept, so earlier id_cols take priority)
matched = matched.drop_duplicates(subset=['gold_id'])

print("Of the {}, {} ({:.0f}%) have a match".format(N, matched.shape[0], matched.shape[0]*100/N))

# Mentions whose story URL never appears in Altmetric cannot be matched at all.
our_urls_mentions = set(gold[gold.ResearchMentioned == True].URL)
no_chance_match = gold_wm[gold_wm.URL.isin(our_urls_mentions.difference(altmetric_urls))]
n = no_chance_match.shape[0]
print("Of the {}, {} ({:.0f}%) have no chance of matching (URLs not in altmetric)".format(N, n, n*100/N))

In [None]:
# Altmetric rows attached to stories we coded as mentioning research, but
# which were not matched to any gold-standard mention.
mention_story_urls = gold[gold.ResearchMentioned == True].URL
in_mention_story = alt.URL.isin(mention_story_urls)
was_matched = alt.alt_id.isin(matched.alt_id)
alt_errors = alt[in_mention_story & ~was_matched]

# errs = ['https://www.iflscience.com/health-and-medicine/cannabis-use-early-in-life-linked-to-some-changes-in-heart-and-artery-function', 
#         'https://www.nytimes.com/2021/03/14/health/covid-schools-social-distancing-3-feet.html',
#         'https://www.nytimes.com/2021/04/01/health/pandemics-plague-history-resilience.html',
#         'https://www.nytimes.com/2021/04/07/science/particle-physics-muon-fermilab-brookhaven.html']

# news = ['https://consumer.healthday.com/b-3-31-too-few-minorities-in-u-s-health-care-workforce-report-2651245191.html', 
#         'https://www.iflscience.com/environment/bitcoin-mining-will-soon-pump-out-more-carbon-than-czech-republic-new-study-says', 
#         'https://www.iflscience.com/health-and-medicine/male-fertility-how-everyday-chemicals-are-destroying-sperm-counts-in-humans-and-animals', 
#         'https://www.iflscience.com/plants-and-animals/-a-surprising-number-of-sea-monster-sightings-can-be-explained-by-whale-erections', 
#         'https://www.popsci.com/story/health/astrazeneca-vaccine-blood-clots', 
#         'https://www.popsci.com/story/health/how-vaccine-passport-works', 
#         'https://www.popsci.com/story/health/lyme-disease-treatment-for-humans', 
#         'https://www.theguardian.com/society/2021/apr/02/covid-further-rare-blood-clot-cases-found-in-oxford-astrazeneca-recipients', 
#         'https://www.theguardian.com/society/2021/apr/11/is-vaccinating-against-covid-enough-what-we-can-learn-from-other-countries', 
#         'https://www.wired.com/story/blue-carbon-credits-could-help-restore-ecosystems', 
#         'https://www.wired.com/story/how-cargo-ships-could-help-detect-tsunamis', 
#         'https://www.wired.com/story/how-to-kill-a-zombie-fire']

In [None]:
def _f_score_helper(outlet=False):
    """Return the (gold, alt, matched, alt_errors) frames used for scoring.

    With a truthy ``outlet`` every frame is restricted to the rows whose
    ``outlet`` column equals it; otherwise the frames are returned unfiltered.
    """
    frames = (gold, alt, matched, alt_errors)
    if not outlet:
        return frames
    return tuple(frame[frame.outlet == outlet] for frame in frames)

def _f_score_calculation(tp, fp, fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    
    return (precision, recall, f_score)

        
## BY MENTION

# TRUE POSITIVE: tp = altmetric says yes, we say yes 
# FALSE POSITIVE: fp = altmetric says yes, but we say no 
# FALSE NEGATIVE: fn = altmetric says no, we say yes
def f_score_urls(outlet=False, display=False):
    """Score Altmetric against the gold standard at the story (URL) level.

    A story URL counts as a true positive when it is coded as mentioning
    research and appears in Altmetric, as a false positive when it appears in
    Altmetric but is coded as not mentioning research, and as a false
    negative when it is coded as mentioning research but is absent from
    Altmetric.

    Parameters
    ----------
    outlet : str or False
        Restrict the calculation to one outlet; False uses all outlets.
    display : bool
        Print the counts and scores when True.

    Returns
    -------
    tuple
        ``(f_score, precision, recall)``. Note that this order differs from
        ``f_score_mentions``, which returns ``(precision, recall, f_score)``.
    """
    g, a, _, _ = _f_score_helper(outlet)

    our_urls_mentions = set(g[g.ResearchMentioned == True].URL)
    # The previous filter `~g.ResearchMentioned == False` was parsed as
    # `(~g.ResearchMentioned) == False`, which does not select the stories
    # coded as having no mention. Compare against False directly.
    our_urls_no_mentions = set(g[g.ResearchMentioned == False].URL)
    # there's a couple (2) in both sets, because of uncoded duplicates. Remove.
    our_urls_no_mentions = our_urls_no_mentions.difference(our_urls_mentions)

    altmetric_urls = set(a.URL)

    tp = len(altmetric_urls.intersection(our_urls_mentions))
    fp = len(altmetric_urls.intersection(our_urls_no_mentions))
    fn = len(our_urls_mentions.difference(altmetric_urls))

    precision, recall, f_score = _f_score_calculation(tp, fp, fn)

    if display:
        print("N = {}".format(len(our_urls_mentions)))
        print("True Positive: {}".format(tp))
        print("False Positive: {}".format(fp))
        print("False Negative: {}".format(fn))
        print("Precision: {:.2f}".format(precision))
        print("Recall: {:.2f}".format(recall))
        print("F-score: {:.2f}".format(f_score))

    return (f_score, precision, recall)
    
def f_score_mentions(outlet=False, display=False):
    """Score Altmetric against the gold standard at the mention level.

    True positives are the matched mentions, false positives are the rows in
    ``alt_errors``, and false negatives are the gold rows with an identifier
    that were not matched.

    Parameters
    ----------
    outlet : str or False
        Restrict the calculation to one outlet; False uses all outlets.
    display : bool
        Print the counts and scores when True.

    Returns
    -------
    tuple
        ``(precision, recall, f_score)``
    """
    gold_rows, _, matched_rows, error_rows = _f_score_helper(outlet)

    tp = len(matched_rows)
    fp = len(error_rows)
    fn = len(gold_rows[gold_rows.identifier.notnull()]) - len(matched_rows)

    precision, recall, f_score = _f_score_calculation(tp, fp, fn)

    if display:
        if outlet:
            print("Outlet: {}".format(outlet))
        report = (
            ("True Positive: {}", tp),
            ("False Positive: {}", fp),
            ("False Negative: {}", fn),
            ("Precision: {:.2f}", precision),
            ("Recall: {:.2f}", recall),
            ("F-score: {:.2f}", f_score),
        )
        for template, value in report:
            print(template.format(value))

    return (precision, recall, f_score)


Optional code to create an Excel sheet with the errors found

In [None]:
# writer = pd.ExcelWriter('errors.xlsx', engine='openpyxl')
# alt_errors.to_excel(writer, 'False Positives')
# gold[(gold.identifier.notnull()) & ~(gold.gold_id.isin(matched.gold_id))].to_excel(writer, 'False Negatives')
# writer.save()

### Table 3. Precision, Recall, and Accuracy (F-score) by news outlet

In [None]:
# One row per outlet, starting with the number of gold-standard rows it has.
df = gold.groupby('outlet').size().to_frame('N (mentions)')

# f_score_mentions returns (precision, recall, f_score) for each outlet.
scores = [f_score_mentions(name, False) for name in df.index]
for position, label in enumerate(['Precision', 'Recall', 'F-Score']):
    df[label] = ["{:.2f}".format(score[position]) for score in scores]

df.to_clipboard()
df

In [None]:
# Data for the regression: mentions Altmetric matched, plus mentions it could
# never match because their story URL is absent from Altmetric.
matched_rows = gold[gold.gold_id.isin(matched.gold_id)]
unmatchable_rows = gold[gold.gold_id.isin(no_chance_match.gold_id)]

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([matched_rows, unmatchable_rows])

# Outcome variable: 1 when the mention was matched, 0 otherwise.
df['match'] = df.gold_id.isin(matched.gold_id).astype(int)

## Logit Model 

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Logistic regression of "was the mention matched" on how it was mentioned.
formula = "match ~ DescribesAsresearch + HasLink + JournalMentioned + AuthorMentioned + InstitutionMentioned + StudyDateMentioned"
model = smf.logit(formula=formula, data=df)
res = model.fit()

# Convert the HasLink coefficient (log-odds) into odds, then into a
# probability formatted to one decimal.
has_link_coef = res.params['HasLink']
odds = np.exp(has_link_coef)
prob = '{:.1f}'.format(odds / (1 + odds))

print(res.summary())

In [None]:
# Odds ratios with their confidence interval, obtained by exponentiating the
# fitted coefficients and the interval bounds.
params = res.params
conf = res.conf_int()
conf['Odds Ratio'] = params
# conf_int() is called with its default alpha of 0.05, so the bounds are the
# 2.5% and 97.5% limits of a 95% interval (previously labelled '5%'/'95%').
conf.columns = ['2.5%', '97.5%', 'Odds Ratio']

odds_ratios = np.exp(conf)
print(odds_ratios)


odds_ratios.to_clipboard()