# STI 2018 - DOIs, URLs, and FB

Code to produce quantification of three problem cases with the WOS state_of_oa dataset

In [1]:
import pandas as pd
from pprint import pprint
import json
from tqdm import tqdm

import collections
import numpy as np
import itertools
from urllib.parse import urlparse

In [12]:
pd.options.display.float_format = '{:,.1f}'.format

input1 = "data/crossref_100k_resolved.csv"
input2 = "data/crossref_100k_full.csv"

# Challenge 1

In [6]:
# Load resolved DOIs
resolved_doi = pd.read_csv(input1, index_col="doi")
resolved_doi['domain'] = resolved_doi.resolved.map(lambda x: urlparse(x)[1] if pd.notnull(x) else None)
resolved_doi['prefix'] = resolved_doi.index.map(lambda x: x.split("/")[0])

### DOI Resolving results

In [7]:
l = len(resolved_doi)

a = resolved_doi.status_code.notnull().sum()
b = resolved_doi.status_code.value_counts()[200]
c = len(resolved_doi[resolved_doi.status_code.notnull()]) - a
d = resolved_doi.err.value_counts()['RequestException'] + resolved_doi.err.value_counts()['Timeout']

https_urls = resolved_doi.resolved.map(lambda x: x[4] == "s" if pd.notnull(x) else None).sum()
http_urls = resolved_doi.resolved.notnull().sum()-https_urls

dupl = resolved_doi[resolved_doi.resolved.notnull()].resolved.duplicated(keep=False).sum()

out = pd.DataFrame.from_dict({
    "1. Got response from crossref":[a,a*100/l],
    "2. Resolved with 200s":[b,b*100/l],
    "3. Resolved with error":[c,c*100/l],
    "4. RequestException + TimeOuts":[d,d*100/l],
    "5. Resolved to HTTPS": [https_urls, https_urls*100/l],
    "6. Resolved to HTTP": [http_urls, http_urls*100/l],
    "7. Duplicate URLs": [dupl, dupl*100/l]
}, orient='index')
out.columns = ['Articles', '[%]']
out.sort_index()

Unnamed: 0,Articles,[%]
1. Got response from crossref,86706,86.7
2. Resolved with 200s,68743,68.7
3. Resolved with error,0,0.0
4. RequestException + TimeOuts,13290,13.3
5. Resolved to HTTPS,53437,53.4
6. Resolved to HTTP,33269,33.3
7. Duplicate URLs,313,0.3


## Problem 1 - URL Variations

In [10]:
big_5 = ['linkinghub.elsevier.com',
         'link.springer.com',
         'onlinelibrary.wiley.com',
         'www.tandfonline.com',
         'journals.sagepub.com']

ind = []
for d in big_5:
    ind.append(resolved_doi[resolved_doi.domain == d].sample().index[0])
    
a = len(resolved_doi[resolved_doi.domain.isin(big_5)])
print("Big 5 DOIs: {} ({:.1f}%)".format(a, 100*a/l))

print("Samples")
resolved_doi.loc[ind][['status_code', 'resolved', 'domain', 'prefix']].resolved.tolist()

Big 5 DOIs: 45575 (45.6%)
Samples


['https://linkinghub.elsevier.com/retrieve/pii/S0002939499004158',
 'https://link.springer.com/article/10.1007%2FBF00306977',
 'https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1440-1746.2005.04155.x',
 'http://www.tandfonline.com/doi/full/10.1517/14712598.2012.679654',
 'http://journals.sagepub.com/doi/10.1177/103841629300200107']

# Challenge 2

In [13]:
# Consts
ids = ['ogid'+str(i) for i in range (1,5)]
eng = ['eng'+str(i) for i in range (1,5)]
urls = ['url'+str(i) for i in range (1,5)]
# shares = ['shares1','shares2','shares3','shares4']

dtype={}
for i in range(1,5):
    dtype['url'+str(i)] = str
    dtype['og_eng'+str(i)] = str
    dtype['og_obj'+str(i)] = str
    dtype['og_err'+str(i)] = str

url_response = pd.read_csv(input2, index_col="doi", parse_dates=['ts'], dtype=dtype)

# Prepare results
results = url_response[urls].copy()
for i in range(1,5):
    results['ogid'+str(i)] = url_response['og_obj'+str(i)].map(lambda x: json.loads(x)['id'] if pd.notnull(x) else None)
    results['eng'+str(i)] = url_response['og_eng'+str(i)].map(lambda x: sum(json.loads(x).values()) if pd.notnull(x) else None)
    #results['shares'+str(i)] = df['og_eng'+str(i)].map(lambda x: json.loads(x)['share_count'] if pd.notnull(x) else None)

In [14]:
x = results[eng].apply(lambda x: sum(x) > 0, axis=1)
results_eng = results[x]

x = results[ids].apply(lambda x: x.notnull().sum() > 0, axis=1)
results_ids = results[x]

### HTTP/HTTPS URL breakdown for articles with OG object or Eng>0

In [16]:
def get_https_breakdown(df):
    http = 0
    https = 0
    df = df[['url1', 'url2','ogid1', 'ogid2']]
    for row in df.itertuples():
        if row[3]:
            if row[1][4] == "s":
                https = https + 1
            else:
                http = http + 1
        if row[4]:
            if row[2][4] == "s":
                https = https + 1
            else:
                http = http + 1
    return {'http':http, 'https':https}
pd.DataFrame({'IDS':get_https_breakdown(results_ids),
              'ENG':get_https_breakdown(results_eng)}, )

Unnamed: 0,ENG,IDS
http,1659,9125
https,377,1301


### Coverage of 4 URL variations

In [17]:
cov = results[ids].apply(lambda x: x.notnull().sum()).values
cov_eng = results[eng].apply(lambda x: sum(x>0)).values
pd.DataFrame({'IDS':cov,
              'IDS (%)':cov/(len(results)/100),
              'ENG':cov_eng,
              'ENG (%)': cov_eng/(len(results)/100)})

Unnamed: 0,ENG,ENG (%),IDS,IDS (%)
0,1099,1.3,5727,6.6
1,807,0.9,4699,5.4
2,10,0.0,49,0.1
3,588,0.7,3834,4.4


## Problem 2 - DOI shares spread across graph objects

In [18]:
def check_pairs(row):
    ids = ['1', '2', '3', '4']
    
    atleastonematching = False
    for c in itertools.combinations(ids, 2):
        x = c[0]
        y = c[1]
        
        # if one of the Ob_IDs is empty, move on
        if row['ogid' + x] is None or row['ogid' + y] is None:
            continue

        # keep track of matching Ob_IDs
        if row['ogid' + x] == row['ogid' + y]:
            atleastonematching = True

            # if we have matching IDs, but non matching values, it is a problem
            if row['eng' + x] != row['eng' + y]:
                return False
    
    if atleastonematching:
        return True

    return None

def check_nonmatching(row):
    ids = ['1', '2', '3', '4']
    
    for c in itertools.combinations(ids, 2):
        x = c[0]
        y = c[1]
        
        # if one of the Ob_IDs is empty, move on
        if row['ogid' + x] is None or row['ogid' + y] is None:
            continue
        
        if row['ogid' + x] != row['ogid' + y]:
            return True
    return False

def prepare_sub_df(df, n):
    x = df[ids].apply(lambda x: x.notnull().sum() == n, axis=1)
    return df[x].copy()

In [21]:
tdf = results_eng

subdfs = []
for i in tqdm(range(0, 5), total=5):
    subdfs.append(prepare_sub_df(tdf, i))

nonmatch_indices=[]
ts = []
fs = []
nms = []
counts = []
    
for df in tqdm(subdfs):
    df['check_pairs'] = df.apply(check_pairs, axis=1)
    x = df.groupby('check_pairs')
    try:
        ts.append(x.size().loc[True])
        fs.append(x.size().loc[False])
    except:
        ts.append(0)
        fs.append(0)
    nms.append(df.apply(check_nonmatching, axis=1).sum())
    counts.append(len(df))

    nonmatch_indices.extend(df[df['check_pairs']==False].index.tolist())

cols = ['Numbers', 'Not matching IDs', 'Matching IDs, Matching Shares', 'Matching IDs, Mismatching Shares']
pd.DataFrame({cols[0]:counts,
              cols[1]:nms,
              cols[2]:ts,
              cols[3]:fs,},
             index=["Zero","One ID","Two IDs","Three IDs","Four IDs"])[cols]

100%|██████████| 5/5 [00:02<00:00,  2.23it/s]
100%|██████████| 5/5 [00:00<00:00, 14.22it/s]


Unnamed: 0,Numbers,Not matching IDs,"Matching IDs, Matching Shares","Matching IDs, Mismatching Shares"
Zero,65,0,0,0
One ID,1447,0,0,0
Two IDs,542,229,176,137
Three IDs,41,33,22,10
Four IDs,1,0,0,0


In [20]:
tdf = results_ids

subdfs = []
for i in tqdm(range(1, 5), total=4):
    subdfs.append(prepare_sub_df(tdf, i))

nonmatch_indices=[]
ts = []
fs = []
nms = []
counts = []
    
for df in tqdm(subdfs):
    df['check_pairs'] = df.apply(check_pairs, axis=1)
    x = df.groupby('check_pairs')
    try:
        ts.append(x.size().loc[True])
        fs.append(x.size().loc[False])
    except:
        ts.append(0)
        fs.append(0)
    nms.append(df.apply(check_nonmatching, axis=1).sum())
    counts.append(len(df))

    nonmatch_indices.extend(df[df['check_pairs']==False].index.tolist())

cols = ['Numbers', 'Not matching IDs', 'Matching IDs, Matching Shares', 'Matching IDs, Mismatching Shares']
pd.DataFrame({cols[0]:counts,
              cols[1]:nms,
              cols[2]:ts,
              cols[3]:fs,},
             index=["One ID","Two IDs","Three IDs","Four IDs"])[cols]

100%|██████████| 4/4 [00:10<00:00,  2.62s/it]
100%|██████████| 4/4 [00:02<00:00,  1.95it/s]


Unnamed: 0,Numbers,Not matching IDs,"Matching IDs, Matching Shares","Matching IDs, Mismatching Shares"
One ID,10274,0,0,0
Two IDs,1630,582,911,137
Three IDs,257,57,236,10
Four IDs,1,0,0,0


## Problem 3 - Same OG IDs across different articles

In [22]:
df = results_ids[ids].copy()

df['all_ids'] = df[ids].apply(lambda x: [int(y) for y in set(x) if pd.notnull(y)], axis=1)
all_ids = df.all_ids.sum()
counter = collections.Counter(all_ids)

dup_ids = set([i for (i,v) in counter.items() if v > 1])
print("Number of duplicate ids: %s" % len(dup_ids))

results_ids['has_dup'] = df.all_ids.map(lambda x: len(dup_ids.intersection(x)) > 0)
print("Number of articles affected: %s" % results_ids.has_dup.sum())

Number of duplicate ids: 26
Number of articles affected: 343


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


# Summary of 3 problem cases

In [24]:
resolved_doi['problem_1'] = resolved_doi.resolved.isnull()
resolved_doi['problem_2'] = resolved_doi.merge(results_ids[['has_dup']], how="left", left_index=True, right_index=True)['has_dup']
resolved_doi['problem_3'] = None
resolved_doi.loc[nonmatch_indices, 'problem_3'] = True

In [25]:
resolved_doi[['problem_1', 'problem_2', 'problem_3']].sum()

problem_1   13,294.0
problem_2      343.0
problem_3      148.0
dtype: float64

In [26]:
a = resolved_doi[['problem_1', 'problem_2', 'problem_3']].apply(lambda x: x.any(), axis=1).sum()
b = resolved_doi.loc[results_eng.index][['problem_2', 'problem_3']].apply(lambda x: x.any(), axis=1).sum()

print("Problem case 1, 2, and 3 among all articles: {} ({:.1f}%)".format(a, 100*a/len(resolved_doi)))
print("Problem case 2, and 3 among articles w/ eng: {} ({:.1f}%)".format(b, 100*b/len(results_eng)))

Problem case 1, 2, and 3 among all articles: 13705 (13.7%)
Problem case 2, and 3 among articles w/ eng: 345 (16.5%)
