# Cleaning Data

This notebook cleans the article metadata that was retrieved with the Arcas software.

In [58]:
import glob
import pandas as pd

In [59]:
# Read every Arcas result file into its own DataFrame.
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# paths are sorted: the row order of the concatenated table (and therefore
# which record a later "keep the first occurrence" de-duplication retains)
# is then the same on every machine and every run.
dfs = [
    pd.read_json(filename)
    for filename in sorted(glob.glob('../data/Anarchy_*.json'))
]

In [60]:
# Stack the per-file frames into a single table with a fresh 0..n-1 index;
# the columns are not re-sorted.
df = pd.concat(objs=dfs, sort=False, ignore_index=True)

In [61]:
# Distinct values found in the `provenance` column.
df['provenance'].unique()

array(['PLOS', 'IEEE', 'Springer', 'arXiv', 'Nature'], dtype=object)

In [62]:
# Number of distinct titles (a missing title, if any, counts as one value).
df['title'].nunique(dropna=False)

791

In [63]:
# Number of distinct article keys (a missing key, if any, counts as one value).
df['unique_key'].nunique(dropna=False)

825

In [64]:
# Count articles per provenance: first reduce the table to one row per
# (unique_key, provenance) pair, then count those rows for each provenance.
provenance_size = (
    df.groupby(['unique_key', 'provenance'])
    .size()
    .reset_index()
    .groupby('provenance')
    .size()
)
provenance_size

provenance
IEEE        229
Nature      228
PLOS         13
Springer    285
arXiv        70
dtype: int64

In [65]:
# Persist the combined, still uncleaned table.
df.to_json(path_or_buf='../data/anarchy_November_2018.json')

Cleaning
--------

In [66]:
# Start the cleaning stage from the combined file written to disk.
df = pd.read_json(path_or_buf='../data/anarchy_November_2018.json')

In [67]:
# Convert every author name to lower case.
df['author'] = df['author'].str.lower()

In [68]:
#from fuzzywuzzy import fuzz
import itertools

In [69]:
import tqdm

We can list the author names that are very similar to each other, but the final check has to be done manually.

In [17]:
# `temp` is a second name bound to the same DataFrame object as `df`;
# no copy is made.
temp = df

In [18]:
# Lazily enumerate every unordered pair of distinct author names.
# `pairs` is an iterator, so it can only be consumed once.
pairs = itertools.combinations(temp['author'].unique(), 2)

In [63]:
# `fuzz` is used below, but its import is commented out in the import cell,
# so this cell raised NameError on a fresh kernel. Import it here.
from fuzzywuzzy import fuzz

# Collect the author-name pairs whose token-set similarity score is at least
# 90 but below 100, i.e. names that are very similar without being scored as
# identical. These are candidates for a manual check.
# NOTE: `pairs` is a one-shot iterator; re-create it before re-running this
# cell, otherwise the loop sees no pairs and `to_check` ends up empty.
to_check = []
for first_name, second_name in tqdm.tqdm(pairs):
    ratio = fuzz.token_set_ratio(first_name, second_name)
    if 90 <= ratio < 100:
        to_check.append((first_name, second_name))

17081169it [17:47, 16004.50it/s]


In [64]:
# Show the candidate name pairs for manual inspection.
to_check

[('s.cho', 's.chow'),
 ('d.cole', 'd.coyle'),
 ('p.grossman', 'g.grossman'),
 ('p.grossman', 'r.grossman'),
 ('j.campbell', 'o.campbell'),
 ('j.campbell', 't.campbell'),
 ('y.yamaguchi', 'm.yamaguchi'),
 ('k.kanazawa', 's.kanazawa'),
 ('x.han', 'x.shan'),
 ('d.zhao', 'd.hao'),
 ('t.mori', 't.omori'),
 ('e.anderson', 'j.anderson'),
 ('e.anderson', 'p.anderson'),
 ('e.anderson', 'a.anderson'),
 ('e.anderson', 'd.anderson'),
 ('m.seredynski', 'f.seredynski'),
 ('y.nakashima', 't.nakashima'),
 ('y.nakashima', 'y.nagashima'),
 ('y.nakashima', 'y.kashima'),
 ('y.nakashima', 'h.nakashima'),
 ('j.williams', 'm.williams'),
 ('j.williams', 't.williams'),
 ('j.williams', 'v.williams'),
 ('r.sorensen', 'h.sorensen'),
 ('r.sorensen', 't.sorensen'),
 ('s.schuster', 'm.schuster'),
 ('s.salmi', 's.almi'),
 ('c.backer', 'c.baker'),
 ('k.rudnicki', 'r.rudnicki'),
 ('t.zhou', 't.zhu'),
 ('h.sorensen', 't.sorensen'),
 ('c.huia', 'c.hui'),
 ('j.mendez-naya', 'l.mendez-naya'),
 ('a.kuhn', 'a.kun'),
 ('c.cha

In [11]:
# Titles attributed to the author 'r.grossman'.
df.loc[df['author'] == 'r.grossman', 'title'].unique()

array(['Rationale, design and critical end points for the Riluzole in Acute Spinal Cord Injury Study (RISCIS): a randomized, double-blinded, placebo-controlled parallel multi-center trial'],
      dtype=object)

In [8]:
# Titles attributed to the author 'd.coyle'.
df.loc[df['author'] == 'd.coyle', 'title'].unique()

array(['Summer books'], dtype=object)

Duplicate articles
------------------

In [70]:
# For every title, count how many different unique_key values carry it:
# reduce to one row per (title, unique_key) pair, then count rows per title.
table = (
    df.groupby(['title', 'unique_key'])
    .size()
    .reset_index()
    .groupby('title')
    .count()
)
# A title that appears under more than one key is a duplicated article.
duplicates = table.loc[table['unique_key'].gt(1)]
duplicates

Unnamed: 0_level_0,unique_key,0
title,Unnamed: 1_level_1,Unnamed: 2_level_1
CPU Time Pricing,2,2
Communication Networks: Pricing Congestion Control Routing and Scheduling,2,2
Computer Science and Game Theory,2,2
Congestion Games with Linearly Independent Paths: Convergence Time and Price of Anarchy,2,2
Designing Cost-Sharing Methods for Bayesian Games,2,2
Graphical Congestion Games,2,2
Improved Lower Bounds on the Price of Stability of Undirected Network Design Games,2,2
Improving the Price of Anarchy for Selfish Routing via Coordination Mechanisms,2,2
Justice: Theories of,2,2
LP-Based Covering Games with Low Price of Anarchy,2,2


In [71]:
# Distinct titles that occur under more than one unique_key.
duplicates_title = df.loc[df['title'].isin(duplicates.index), 'title'].unique()

In [72]:
# Duplicated titles that have at least one record coming from arXiv.
duplicates_in_arxiv = df.loc[
    df['title'].isin(duplicates.index) & df['provenance'].eq('arXiv'),
    'title',
].unique()

In [73]:
# Duplicated titles for which none of the copies comes from arXiv.
diff = list(set(duplicates_title).difference(set(duplicates_in_arxiv)))

In [74]:
# All records except those whose provenance is arXiv.
df_without_arxiv = df.loc[df['provenance'] != 'arXiv']

In [75]:
# Keep a single article per title without discarding that article's rows.
# The table holds several rows per article, so calling
# drop_duplicates(subset='title') directly on it keeps only ONE row per
# title and silently drops the article's remaining rows (e.g. its other
# authors). Instead, pick the unique_key of the first row seen for each
# title and keep every row that belongs to one of those keys.
# NOTE(review): assumes all rows of a unique_key share one title - confirm.
df_without_arxiv = df_without_arxiv[
    df_without_arxiv['unique_key'].isin(
        df_without_arxiv.drop_duplicates(subset='title')['unique_key']
    )
]

In [76]:
# df_without_arxiv.to_json('../data/pd_November_2018_without_arxiv.json')

**Drop duplicates.**

In [77]:
# Keys of the arXiv records whose title is duplicated under another key.
articles_to_drop = df.loc[
    df['title'].isin(duplicates.index) & df['provenance'].eq('arXiv'),
    'unique_key',
].unique()

In [78]:
# Remove every row belonging to one of the keys selected for removal.
df = df.loc[~df['unique_key'].isin(articles_to_drop)]

In [79]:
# Keep a single article per title without discarding that article's rows.
# The table holds several rows per article, so calling
# drop_duplicates(subset='title') directly on it keeps only ONE row per
# title and silently drops the article's remaining rows (e.g. its other
# authors) from the data that is exported below. Instead, pick the
# unique_key of the first row seen for each title and keep every row that
# belongs to one of those keys.
# NOTE(review): assumes all rows of a unique_key share one title - confirm.
df = df[df['unique_key'].isin(df.drop_duplicates(subset='title')['unique_key'])]

In [80]:
# Sanity check: the number of distinct titles should now equal the number
# of distinct article keys.
df['title'].nunique(dropna=False), df['unique_key'].nunique(dropna=False)

(791, 791)

In [89]:
# Remove the rows whose date is below 1963. The condition is kept as a
# negation so that rows for which the comparison is False, including rows
# with a missing date, stay in the table.
df = df.loc[~df['date'].lt(1963)]

**Export clean json.**

In [90]:
# Write the cleaned table to disk.
df.to_json(path_or_buf='../data/anarchy_November_2018_clean.json')