Last updated: March 31, 2023
Last run: April, 2021

**Data Collection**

## Second-order Effects in Altmetrics: A Case Study Analyzing the Audiences of COVID-19 Research in the News and on Social Media

Juan Pablo Alperin, Alice Fleerackers, Michelle Riedlinger & Stefanie Haustein

**Related Publication:**
Alperin, J.P., Fleerackers, A., Riedlinger, M. & Haustein, S. (2023). Second-order Effects in Altmetrics: A Case Study Analyzing the Audiences of COVID-19 Research in the News and on Social Media. *Zenodo*. 

*Caveat: This code was cleaned up from a messier original version, which required working around many small data collection glitches. In particular, the original collection had some issues with tweet IDs and Twitter user IDs being recorded in scientific notation. As such, it may not work perfectly.*

*The code does faithfully capture the main approach and code used for data collection.*

In [None]:
import datetime
import pandas as pd
import requests

from tqdm.auto import tqdm
tqdm.pandas()

from urllib.parse import unquote

from pymed import PubMed
import random

## Create a file with the possible URLs for each article
Begins by finding a DOI for each Pubmed ID, then resolves (unshortens) the DOI URL.

In [None]:
# Load the Pubmed IDs returned by our query. The CSV has no header row,
# so its single column is named explicitly.
pmid_csv = 'data/covid_pubmed_ids-20210223.csv'
articles = pd.read_csv(pmid_csv, header=None)
articles.columns = ['pmid']

def get_doi(pmid):
    """Look up the DOI of a Pubmed record.

    Queries Pubmed (through pymed) using *pmid* as the search term, asking
    for at most one result, and returns that result's ``doi`` attribute.

    Parameters
    ----------
    pmid
        Pubmed ID to look up.

    Returns
    -------
    The ``doi`` attribute of the first result, or None when the query
    returned nothing or the result carries no ``doi`` attribute.
    """
    # Each call identifies itself with a randomised "nospam+N" address.
    randint = random.randint(0, 10000)
    email = 'nospam+%s@alperin.ca' % randint
    pubmed = PubMed(tool="research", email=email)
    results = list(pubmed.query(pmid, max_results=1))
    try:
        return results[0].doi
    except (IndexError, AttributeError):
        # IndexError: empty result list; AttributeError: result without a doi.
        # Anything else is unexpected and is no longer silently hidden.
        return None
    

In [None]:
# Query the Pubmed API once per Pubmed ID (with a progress bar) and keep the DOI it reports
articles['doi'] = articles['pmid'].progress_apply(get_doi)

In [None]:
def unshort(url):
    """Follow the redirects starting at *url* and return the final URL.

    Parameters
    ----------
    url : str
        Address to request (redirects are followed, 15 second timeout).

    Returns
    -------
    str or None
        The URL of the final response, or None if the request failed.
    """
    try:
        r = requests.get(url, allow_redirects=True, timeout=15)
    except Exception:
        # Deliberately best-effort: one unreachable or malformed link must not
        # abort the whole run. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit through so the loop can be stopped.
        return None
    return r.url

In [None]:
# Take the DOI URL and resolve it to find out what it links to.
# Articles for which no DOI was found are left unresolved (None) instead of
# requesting the meaningless address "https://doi.org/None".
articles['resolved_url'] = articles.doi.progress_apply(
    lambda doi: unshort('https://doi.org/%s' % doi) if pd.notna(doi) else None
)


In [None]:
# Canonical DOI link for each article. Articles without a DOI get no link at
# all, rather than the bogus "https://doi.org/None" that string formatting
# would otherwise produce.
articles['doi_url1'] = articles.doi.map(
    lambda doi: ('https://doi.org/%s' % doi) if pd.notna(doi) else None
)

In [None]:
# Save the article table (pmid, doi, resolved_url, doi_url1) without the index.
# This file was used as input for the Crowdtangle queries.
# NOTE(review): the file name says "4_outlets" although five outlets are listed
# in outlets_of_interest; the name is kept as-is.
articles.to_csv(path_or_buf='data/covid_dois_in_4_outlets_with_urls.csv', index=False)

In [None]:
# News outlets under study, each paired with the web domain its stories live on.
# The two lists below are parallel: position i of one corresponds to position i
# of the other.
_outlet_domains = [
    ('MSN', 'www.msn.com'),
    ('New York Times', 'www.nytimes.com'),
    ('BBC News', 'www.bbc.com'),
    ('The Guardian', 'www.theguardian.com'),
    ('Washington Post', 'www.washingtonpost.com'),
]
outlets_of_interest = [name for name, _domain in _outlet_domains]
domains_of_interest = [domain for _name, domain in _outlet_domains]

## Collect Twitter activity using Twint
All tweets collected are placed in a "tweets" folder. 
(these cannot be made publicly available and would need to be collected again)

In [None]:
import twint
import nest_asyncio
# NOTE(review): nest_asyncio.apply() patches asyncio so that an already-running
# event loop can be re-entered -- presumably this is what lets twint's searches
# run inside the notebook, rather than a speed-up; confirm.
nest_asyncio.apply()


In [None]:
# Configure
def twint_search(url, outfile=False):
    """Search Twitter (through twint) for tweets linking to *url*.

    The query is the percent-decoded URL combined with
    ``since:2020-01-01 until:2021-02-01 filter:links``, i.e. everything in
    2020 plus January 2021.

    Parameters
    ----------
    url : str
        URL to search for.
    outfile : str or False, optional
        Path of a CSV file that accumulates results across calls. When given
        and the search found tweets, they are added to whatever the file
        already holds and the file is rewritten. When falsy (the default)
        nothing is saved.

    Returns
    -------
    int or None
        Number of tweets found, or None if searching or saving failed (the
        URL and the error are printed).

    Raises
    ------
    KeyboardInterrupt
        Re-raised so a long collection run can be interrupted.
    """
    import os

    try:
        query_url = unquote(url)

        # Configure
        c = twint.Config()
        # Search for Everything in 2020 PLUS INCLUDE January 2021
        c.Search = "%s since:2020-01-01 until:2021-02-01 filter:links" % query_url
        print(query_url)
        c.Pandas = True
        c.Hide_output = True

        # Run
        twint.run.Search(c)

        search_results = twint.storage.panda.Tweets_df

        print(search_results.shape)

        if outfile and search_results.shape[0] > 0:
            try:
                # IDs are read as text so that long tweet/user IDs survive the
                # read-and-rewrite round trip unchanged.
                previous = pd.read_csv(
                    outfile,
                    dtype={'tweet_id': str, 'id': str, 'user_id_str': str},
                    low_memory=False,
                )
            except (FileNotFoundError, pd.errors.EmptyDataError):
                # Nothing collected yet: start the file with these results.
                tweets = search_results
            else:
                # pd.concat replaces DataFrame.append, which newer pandas no
                # longer provides.
                tweets = pd.concat([previous, search_results], ignore_index=True)
            # Any other read failure propagates to the handler below, so an
            # unreadable file is never overwritten with just the new results.

            out_dir = os.path.dirname(outfile)
            if out_dir:
                # Without the folder every save would fail and be discarded.
                os.makedirs(out_dir, exist_ok=True)
            tweets.to_csv(outfile, index=False)

        return search_results.shape[0]
    except KeyboardInterrupt:
        raise
    except Exception as exc:
        # Best-effort: report the failing URL and carry on with the next one.
        print("Error: %s" % url)
        print("  %r" % (exc,))
        return None

## Collect Tweets about News Stories

In [None]:
def _strip_query(url):
    """Drop everything from the first '?' onwards, then trim surrounding slashes."""
    cut = url.find('?')
    base = url[:cut] if cut > 0 else url
    return base.strip('/')


def _domain_of(url):
    """Return the third '/'-separated piece of the URL (the host of 'scheme://host/...')."""
    return url.split('/')[2]


# News mentions exported from an Altmetric Explorer query
df = pd.read_csv('data/altmetric_news_mentions.csv')

df['url_clean'] = df.URL.map(_strip_query)
df['domain'] = df.URL.map(_domain_of)

# Keep only the stories published on the domains we study
on_studied_domain = df.domain.isin(domains_of_interest)
df = df[on_studied_domain]

# One row per distinct cleaned story URL
story_columns = ['outlet', 'URL', 'url_clean']
story_urls = df[story_columns].drop_duplicates(subset='url_clean')

# story_urls.to_excel('data/altmetric_unique_story_urls_top5outlets.xlsx', index=False)


In [None]:
# Gather the tweets outlet by outlet, each outlet into its own timestamped CSV,
# which keeps the output tidier than one big file.
for outlet in outlets_of_interest:
    tweets = None
    now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
    outlet_slug = outlet.lower().replace(' ', '_')
    outfile = 'tweets/%s_tweets_%s.csv' % (outlet_slug, now)

    print('Going after %s' % outlet)

    # Search every story URL of this outlet; the tweets go to the outfile and
    # the per-story counts are stored next to the story.
    from_outlet = story_urls.outlet == outlet
    tweet_counts = story_urls[from_outlet].url_clean.progress_apply(
        lambda url: twint_search(url, outfile)
    )
    story_urls.loc[from_outlet, 'num_tweets'] = tweet_counts
    
## The final file used in research is a merger of all of the output files from this step


## Collect Tweets about research 

In [None]:
# Every URL known for the articles: resolved landing pages plus DOI links
all_urls = set(articles.resolved_url) | set(articles.doi_url1)

outfile = 'tweets/research_tweets.csv'
tweets = None

# One search per distinct DOI link
df = articles.drop_duplicates(subset='doi_url1')


def _search_and_save(url):
    """Search for one URL, saving any tweets found to the shared outfile."""
    return twint_search(url, outfile)


# Records how many tweets were found for each article;
# the tweets themselves end up in the outfile.
df['num_tweets'] = df['doi_url1'].progress_apply(_search_and_save)

In [None]:
# Reload the collected research tweets. The ID columns are read as text: left
# to type inference they become numbers, and a single missing value turns the
# whole column into floats, which cannot represent long tweet IDs exactly and
# print them in scientific notation.
tweets = pd.read_csv(
    'tweets/research_tweets.csv',
    dtype={'id': str, 'user_id_str': str},
    low_memory=False,
)
tweets.dropna(subset=['search'], inplace=True)
# The URL that was searched for is whatever precedes the first space of the
# recorded search string.
tweets['url_clean'] = tweets.search.map(lambda x: x.split(' ')[0])

In [None]:
# Keep only rows that have both a search string and a tweet id. The id check
# must happen BEFORE the cast below: astype(str) turns a missing id into the
# text 'nan', which notna() can no longer detect.
# .copy() makes the filtered table independent so columns can be assigned safely.
tweets = tweets[tweets.search.notna() & tweets['id'].notna()].copy()

# The URL that was searched for is whatever precedes the first space of the
# recorded search string.
tweets['url_clean'] = tweets.search.map(lambda x: x.split(' ')[0])

# Index the table by tweet id (as text) and write the final file.
tweets['id'] = tweets['id'].astype(str)
tweets.set_index('id', inplace=True)
tweets.index.name = 'tweet_id'
tweets.to_csv('citations/twint_research_url_mentions.csv')