# STI 2018 - DOIs, URLs, and FB

Code to produce quantification of three problem cases with the WOS state_of_oa dataset

In [33]:
import pandas as pd
from pprint import pprint
import json
from tqdm import tqdm

import collections
import numpy as np
import itertools
from urllib.parse import urlparse

In [91]:
pd.options.display.float_format = '{:,.1f}'.format

input1 = "data/wos_100k_resolved.csv"
input2 = "data/wos_100k_full.csv"

# Challenge 1

In [None]:
# Load resolved DOIs
resolved_doi = pd.read_csv(input1, index_col="doi")
resolved_doi['domain'] = resolved_doi.resolved.map(lambda x: urlparse(x)[1] if pd.notnull(x) else None)
resolved_doi['prefix'] = resolved_doi.index.map(lambda x: x.split("/")[0])

### DOI Resolving results

In [314]:
l = len(resolved_doi)

a = resolved_doi.status_code.notnull().sum()
b = resolved_doi.status_code.value_counts()[200]
c = len(resolved_doi[resolved_doi.status_code.notnull()]) - a
d = resolved_doi.err.value_counts()['RequestException'] + resolved_doi.err.value_counts()['Timeout']

https_urls = resolved_doi.resolved.map(lambda x: x[4] == "s" if pd.notnull(x) else None).sum()
http_urls = resolved_doi.resolved.notnull().sum()-https_urls

dupl = resolved_doi[resolved_doi.resolved.notnull()].resolved.duplicated(keep=False).sum()

out = pd.DataFrame.from_dict({
    "1. Got response from crossref":[a,a*100/l],
    "2. Resolved with 200s":[b,b*100/l],
    "3. Resolved with error":[c,c*100/l],
    "4. RequestException + TimeOuts":[d,d*100/l],
    "5. Resolved to HTTPS": [https_urls, https_urls*100/l],
    "6. Resolved to HTTP": [http_urls, http_urls*100/l],
    "7. Duplicate URLs": [dupl, dupl*100/l]
}, orient='index')
out.columns = ['Articles', '[%]']
out.sort_index()

Unnamed: 0,Articles,[%]
1. Got response from crossref,91490,88.4
2. Resolved with 200s,85515,82.6
3. Resolved with error,0,0.0
4. RequestException + TimeOuts,12049,11.6
5. Resolved to HTTPS,69619,67.2
6. Resolved to HTTP,21871,21.1
7. Duplicate URLs,68,0.1


## Problem 1 - URL Variations

In [71]:
big_5 = ['linkinghub.elsevier.com',
         'link.springer.com',
         'onlinelibrary.wiley.com',
         'www.tandfonline.com',
         'journals.sagepub.com']

ind = []
for d in big_5:
    ind.append(df[df.domain == d].sample().index[0])
    
a = len(df[df.domain.isin(big_5)])
print("Big 5 DOIs: {} ({:.1f}%)".format(a, 100*a/l))

print("Samples")
df.loc[ind][['status_code', 'resolved', 'domain', 'prefix']].resolved.tolist()

Big 5 DOIs: 55777 (53.9%)
Samples


['https://linkinghub.elsevier.com/retrieve/pii/S1549963414003256',
 'https://link.springer.com/article/10.1007%2Fs10337-010-1883-4',
 'https://onlinelibrary.wiley.com/doi/abs/10.1002/spe.909',
 'https://www.tandfonline.com/doi/full/10.1179/1743282014Y.0000000119',
 'http://journals.sagepub.com/doi/10.1177/1010539513486919']

# Challenge 2

In [211]:
# Consts
ids = ['ogid'+str(i) for i in range (1,5)]
eng = ['eng'+str(i) for i in range (1,5)]
urls = ['url'+str(i) for i in range (1,5)]
# shares = ['shares1','shares2','shares3','shares4']

dtype={}
for i in range(1,5):
    dtype['url'+str(i)] = str
    dtype['og_eng'+str(i)] = str
    dtype['og_obj'+str(i)] = str
    dtype['og_err'+str(i)] = str

url_response = pd.read_csv(input2, index_col="doi", parse_dates=['ts'], dtype=dtype)

# Prepare results
results = url_response[urls].copy()
for i in range(1,5):
    results['ogid'+str(i)] = url_response['og_obj'+str(i)].map(lambda x: json.loads(x)['id'] if pd.notnull(x) else None)
    results['eng'+str(i)] = url_response['og_eng'+str(i)].map(lambda x: sum(json.loads(x).values()) if pd.notnull(x) else None)
    #results['shares'+str(i)] = df['og_eng'+str(i)].map(lambda x: json.loads(x)['share_count'] if pd.notnull(x) else None)

In [180]:
x = results[eng].apply(lambda x: sum(x) > 0, axis=1)
results_eng = results[x]

x = results[ids].apply(lambda x: x.notnull().sum() > 0, axis=1)
results_ids = results[x]

### HTTP/HTTPS URL breakdown for articles with OG object or Eng>0

In [202]:
def get_https_breakdown(df):
    http = 0
    https = 0
    df = df[['url1', 'url2','ogid1', 'ogid2']]
    for row in df.itertuples():
        if row[3]:
            if row[1][4] == "s":
                https = https + 1
            else:
                http = http + 1
        if row[4]:
            if row[2][4] == "s":
                https = https + 1
            else:
                http = http + 1
    return {'http':http, 'https':https}
pd.DataFrame({'IDS':get_https_breakdown(results_ids),
              'ENG':get_https_breakdown(results_eng)}, )

Unnamed: 0,ENG,IDS
http,3821,19901
https,684,1856


### Coverage of 4 URL variations

In [203]:
cov = results[ids].apply(lambda x: x.notnull().sum()).values
cov_eng = results[eng].apply(lambda x: sum(x>0)).values
pd.DataFrame({'IDS':cov,
              'IDS (%)':cov/(len(results)/100),
              'ENG':cov_eng,
              'ENG (%)': cov_eng/(len(results)/100)})

Unnamed: 0,ENG,ENG (%),IDS,IDS (%)
0,1426,1.6,8452,9.2
1,2458,2.7,13305,14.5
2,74,0.1,179,0.2
3,2612,2.9,10124,11.1


## Problem 2 - DOI shares spread across graph objects

In [364]:
def check_pairs(row):
    ids = ['1', '2', '3', '4']
    
    atleastonematching = False
    for c in itertools.combinations(ids, 2):
        x = c[0]
        y = c[1]
        
        # if one of the Ob_IDs is empty, move on
        if row['ogid' + x] is None or row['ogid' + y] is None:
            continue

        # keep track of matching Ob_IDs
        if row['ogid' + x] == row['ogid' + y]:
            atleastonematching = True

            # if we have matching IDs, but non matching values, it is a problem
            if row['eng' + x] != row['eng' + y]:
                return False
    
    if atleastonematching:
        return True

    return None

def check_nonmatching(row):
    ids = ['1', '2', '3', '4']
    
    for c in itertools.combinations(ids, 2):
        x = c[0]
        y = c[1]
        
        # if one of the Ob_IDs is empty, move on
        if row['ogid' + x] is None or row['ogid' + y] is None:
            continue
        
        if row['ogid' + x] != row['ogid' + y]:
            return True
    return False

def prepare_sub_df(df, n):
    x = df[ids].apply(lambda x: x.notnull().sum() == n, axis=1)
    return df[x].copy()

In [383]:
tdf = results_eng

subdfs = []
for i in tqdm(range(0, 5), total=5):
    subdfs.append(prepare_sub_df(tdf, i))

nonmatch_indices=[]
ts = []
fs = []
nms = []
counts = []
    
for df in tqdm(subdfs):
    df['check_pairs'] = df.apply(check_pairs, axis=1)
    x = df.groupby('check_pairs')
    try:
        ts.append(x.size().loc[True])
        fs.append(x.size().loc[False])
    except:
        ts.append(0)
        fs.append(0)
    nms.append(df.apply(check_nonmatching, axis=1).sum())
    counts.append(len(df))

    nonmatch_indices.extend(df[df['check_pairs']==False].index.tolist())

cols = ['Numbers', 'Not matching IDs', 'Matching IDs, Matching Shares', 'Matching IDs, Mismatching Shares']
pd.DataFrame({cols[0]:counts,
              cols[1]:nms,
              cols[2]:ts,
              cols[3]:fs,},
             index=["Zero","One ID","Two IDs","Three IDs","Four IDs"])[cols]

100%|██████████| 5/5 [00:05<00:00,  1.14s/it]
100%|██████████| 5/5 [00:00<00:00,  5.71it/s]


Unnamed: 0,Numbers,Not matching IDs,"Matching IDs, Matching Shares","Matching IDs, Mismatching Shares"
Zero,106,0,0,0
One ID,3687,0,0,0
Two IDs,1535,769,620,146
Three IDs,161,131,99,43
Four IDs,9,8,6,3


In [382]:
tdf = results_ids

subdfs = []
for i in tqdm(range(1, 5), total=4):
    subdfs.append(prepare_sub_df(tdf, i))

nonmatch_indices=[]
ts = []
fs = []
nms = []
counts = []
    
for df in tqdm(subdfs):
    df['check_pairs'] = df.apply(check_pairs, axis=1)
    x = df.groupby('check_pairs')
    try:
        ts.append(x.size().loc[True])
        fs.append(x.size().loc[False])
    except:
        ts.append(0)
        fs.append(0)
    nms.append(df.apply(check_nonmatching, axis=1).sum())
    counts.append(len(df))

    nonmatch_indices.extend(df[df['check_pairs']==False].index.tolist())

cols = ['Numbers', 'Not matching IDs', 'Matching IDs, Matching Shares', 'Matching IDs, Mismatching Shares']
pd.DataFrame({cols[0]:counts,
              cols[1]:nms,
              cols[2]:ts,
              cols[3]:fs,},
             index=["One ID","Two IDs","Three IDs","Four IDs"])[cols]

100%|██████████| 4/4 [00:19<00:00,  4.77s/it]
100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


Unnamed: 0,Numbers,Not matching IDs,"Matching IDs, Matching Shares","Matching IDs, Mismatching Shares"
One ID,21768,0,0,0
Two IDs,4739,1694,2899,146
Three IDs,258,207,191,43
Four IDs,10,9,7,3


## Problem 3 - Same OG IDs across different articles

In [385]:
df = results_ids[ids].copy()

df['all_ids'] = df[ids].apply(lambda x: [int(y) for y in set(x) if pd.notnull(y)], axis=1)
all_ids = df.all_ids.sum()
counter = collections.Counter(all_ids)

dup_ids = set([i for (i,v) in counter.items() if v > 1])
print("Number of duplicate ids: %s" % len(dup_ids))

results_ids['has_dup'] = df.all_ids.map(lambda x: len(dup_ids.intersection(x)) > 0)
print("Number of articles affected: %s" % results_ids.has_dup.sum())

Number of duplicate ids: 66
Number of articles affected: 507


# Summary of 3 problem cases

In [387]:
test = pd.DataFrame({'a':[1,2,3,4,5,6]})
test2 = pd.DataFrame({'b':[True, None, None, False]})

In [393]:
test.merge(test2, how="left", left_index=True, right_index=True)

Unnamed: 0,a,b
0,1,True
1,2,
2,3,
3,4,False
4,5,
5,6,


In [413]:
resolved_doi['problem_1'] = resolved_doi.resolved.isnull()
resolved_doi['problem_2'] = resolved_doi.merge(results_ids[['has_dup']], how="left", left_index=True, right_index=True)['has_dup']
resolved_doi['problem_3'] = None
resolved_doi.loc[nonmatch_indices, 'problem_3'] = True

In [416]:
resolved_doi[['problem_1', 'problem_2', 'problem_3']].sum()

problem_1   12,049.0
problem_2      507.0
problem_3      192.0
dtype: float64

In [419]:
a = resolved_doi[['problem_1', 'problem_2', 'problem_3']].apply(lambda x: x.any(), axis=1).sum()
b = resolved_doi.loc[results_eng.index][['problem_2', 'problem_3']].apply(lambda x: x.any(), axis=1).sum()

print("Problem case 1, 2, and 3 among all articles: {} ({:.1f}%)".format(a, 100*a/len(resolved_doi)))
print("Problem case 2, and 3 among articles w/ eng: {} ({:.1f}%)".format(b, 100*b/len(results_eng)))

Problem case 1, 2, and 3 among all articles: 12722 (12.3%)
Problem case 2, and 3 among articles w/ eng: 648 (11.8%)


# Interesting DOIs

**10.1007/s00586-013-2675-y**

Same OG ID for http/https but different one for DOI. Different share numbers

**10.1038/nature13893**

Different OG objects

**10.7717/peerj.794**

Same share numbers, different OG IDs

**10.1016/j.aap.2014.03.007**

Elsevier redirect page, various OG IDs

**10.7440/res53.2015.10**

Various IDs across DOI, URL

## Used code

```
doi = '10.7440/res53.2015.10'
rec = df.loc[doi]
urls = [rec.url, rec.url2, "https://doi.org/%s" % doi, "http://dx.doi.org/%s" % doi]
pprint(fb_queries(urls))
```

## Problem with missing scraped data

Results for the http and https for journals.ametsoc.org/doi/abs/10.1175/JAS-D-12-0315.1 

URL    | OG ID            | Shares | Date (scrape)
-------|------------------|--------|---------------
HTTP   | 685490234864647  | 2      | September 30, 2016
HTTPS  | None             | None   | None

After manually triggering a rescrape:

URL    | OG ID            | Shares | Date (scrape)
-------|------------------|--------|---------------
HTTP   | 1818366768210382 | 0      | March 28, 2018
HTTPS  | 1818366768210382 | 0      | March 28, 2018

Apparently shares associated with previous canonical URLs are lost...