# Cleaning Data

This notebook cleans the metadata retrieved with the Arcas software.

In [8]:
import glob
import itertools

import pandas as pd
import tqdm
from fuzzywuzzy import fuzz

In [9]:
# Load every file matching the auction_*.json pattern into its own DataFrame.
# glob returns paths in arbitrary, filesystem-dependent order, so the paths
# are sorted: the row order of the combined frame decides which record a
# later `drop_duplicates` keeps, and must not vary between machines.
dfs = [
    pd.read_json(filename)
    for filename in sorted(glob.glob('../data/auction_*.json'))
]

In [10]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index;
# columns keep their order of appearance (no alphabetical sorting).
df = pd.concat(objs=dfs, sort=False, ignore_index=True)

In [11]:
# Distinct values of the `provenance` column.
df['provenance'].unique()

array(['Nature', 'Springer', 'IEEE', 'arXiv'], dtype=object)

In [12]:
# Number of distinct titles (a missing title counts as one value).
df['title'].nunique(dropna=False)

1387

In [13]:
# Number of distinct unique_key values (a missing key counts as one value).
df['unique_key'].nunique(dropna=False)

1440

In [7]:
# Collapse to one row per (unique_key, provenance) pair, then count how many
# of those pairs each provenance has.
key_source_pairs = df.groupby(['unique_key', 'provenance']).size().reset_index()
provenance_size = key_source_pairs.groupby('provenance').size()
provenance_size

provenance
IEEE         10
Nature      194
Springer    945
arXiv       291
dtype: int64

In [9]:
# Write the combined frame to disk as JSON.
df.to_json(path_or_buf='../data/auction_November_2018.json')

Cleaning
--------

In [10]:
# Read the combined JSON file back into `df`.
df = pd.read_json(path_or_buf='../data/auction_November_2018.json')

In [11]:
# First, lowercase every letter in the author names.
df['author'] = df['author'].str.lower()

In [12]:
#from fuzzywuzzy import fuzz
import itertools

In [13]:
import tqdm

We can output the names that are very similar, but the final check has to be done manually.

In [14]:
# `temp` is a second name for the same DataFrame object; no copy is made.
temp = df

In [15]:
# Lazily enumerate every unordered pair of distinct author values.
# This is a one-shot iterator: once it has been looped over it is exhausted.
author_names = temp['author'].unique()
pairs = itertools.combinations(author_names, 2)

In [63]:
# Collect the author-name pairs that are near matches: a token-set ratio of
# at least 90, excluding a ratio of exactly 100.
to_check = []
for first, second in tqdm.tqdm(pairs):
    score = fuzz.token_set_ratio(first, second)
    if score < 90 or score == 100:
        continue
    to_check.append((first, second))

17081169it [17:47, 16004.50it/s]


In [64]:
# Display the collected near-match pairs so they can be checked by hand.
to_check

[('s.cho', 's.chow'),
 ('d.cole', 'd.coyle'),
 ('p.grossman', 'g.grossman'),
 ('p.grossman', 'r.grossman'),
 ('j.campbell', 'o.campbell'),
 ('j.campbell', 't.campbell'),
 ('y.yamaguchi', 'm.yamaguchi'),
 ('k.kanazawa', 's.kanazawa'),
 ('x.han', 'x.shan'),
 ('d.zhao', 'd.hao'),
 ('t.mori', 't.omori'),
 ('e.anderson', 'j.anderson'),
 ('e.anderson', 'p.anderson'),
 ('e.anderson', 'a.anderson'),
 ('e.anderson', 'd.anderson'),
 ('m.seredynski', 'f.seredynski'),
 ('y.nakashima', 't.nakashima'),
 ('y.nakashima', 'y.nagashima'),
 ('y.nakashima', 'y.kashima'),
 ('y.nakashima', 'h.nakashima'),
 ('j.williams', 'm.williams'),
 ('j.williams', 't.williams'),
 ('j.williams', 'v.williams'),
 ('r.sorensen', 'h.sorensen'),
 ('r.sorensen', 't.sorensen'),
 ('s.schuster', 'm.schuster'),
 ('s.salmi', 's.almi'),
 ('c.backer', 'c.baker'),
 ('k.rudnicki', 'r.rudnicki'),
 ('t.zhou', 't.zhu'),
 ('h.sorensen', 't.sorensen'),
 ('c.huia', 'c.hui'),
 ('j.mendez-naya', 'l.mendez-naya'),
 ('a.kuhn', 'a.kun'),
 ('c.cha

In [11]:
# Distinct titles of the rows whose author is 'r.grossman'.
df.loc[df['author'] == 'r.grossman', 'title'].unique()

array(['Rationale, design and critical end points for the Riluzole in Acute Spinal Cord Injury Study (RISCIS): a randomized, double-blinded, placebo-controlled parallel multi-center trial'],
      dtype=object)

In [8]:
# Distinct titles of the rows whose author is 'd.coyle'.
df.loc[df['author'] == 'd.coyle', 'title'].unique()

array(['Summer books'], dtype=object)

Duplicate articles
------------------

In [16]:
# Count how many distinct unique_key values each title maps to, then keep
# the titles that map to more than one.
title_key_pairs = df.groupby(['title', 'unique_key']).size().reset_index()
table = title_key_pairs.groupby('title').count()
duplicates = table.loc[table['unique_key'] > 1]
duplicates

Unnamed: 0_level_0,unique_key,0
title,Unnamed: 1_level_1,Unnamed: 2_level_1
\n,2,2
Ad Exchange: Envy-Free Auctions with Mediators,2,2
An Online Multi-unit Auction with Improved Competitive Ratio,2,2
Analyses of Cardinal Auctions,2,2
Auctions,11,11
Auctions (Applications),3,3
Auctions (Empirics),3,3
Auctions (Experiments),3,3
Auctions (Theory),3,3
Bayesian Auctions with Friends and Foes,2,2


In [17]:
# Titles that are shared by more than one unique_key.
rows_with_shared_title = df['title'].isin(duplicates.index)
duplicates_title = df.loc[rows_with_shared_title, 'title'].unique()

In [18]:
# Shared titles that have at least one row whose provenance is arXiv.
arxiv_rows_with_shared_title = (
    df['title'].isin(duplicates.index) & (df['provenance'] == 'arXiv')
)
duplicates_in_arxiv = df.loc[arxiv_rows_with_shared_title, 'title'].unique()

In [19]:
# Shared titles that have no arXiv row at all.
arxiv_title_set = set(duplicates_in_arxiv)
diff = list(set(duplicates_title).difference(arxiv_title_set))

In [20]:
# Every row whose provenance is not arXiv.
df_without_arxiv = df.loc[df['provenance'] != 'arXiv']

In [21]:
# Keep one article per title, but keep ALL rows of the article that is kept.
# `drop_duplicates(subset='title')` on its own leaves a single row per title,
# which also throws away every other row of the surviving article.
# NOTE(review): the frame appears to hold several rows per unique_key
# (e.g. one per author) -- confirm against the Arcas export.
kept_keys = df_without_arxiv.drop_duplicates(subset='title')['unique_key']
df_without_arxiv = df_without_arxiv[df_without_arxiv['unique_key'].isin(kept_keys)]

In [22]:
# df_without_arxiv.to_json('../data/pd_November_2018_without_arxiv.json')

**Drop duplicates.**

In [23]:
# Mark an arXiv record for removal only when the same title is also held by
# a record from another source. Filtering on "shared title + arXiv" alone
# also marks titles whose copies are ALL on arXiv, so those articles would
# vanish from the data entirely instead of being de-duplicated.
is_arxiv = df['provenance'] == 'arXiv'
titles_from_other_sources = df.loc[~is_arxiv, 'title'].unique()
superseded_rows = (
    df['title'].isin(duplicates.index)
    & is_arxiv
    & df['title'].isin(titles_from_other_sources)
)
articles_to_drop = df.loc[superseded_rows, 'unique_key'].unique()

In [24]:
# Remove every row whose unique_key was marked for dropping.
marked_for_drop = df['unique_key'].isin(articles_to_drop)
df = df.loc[~marked_for_drop]

In [25]:
# Keep one article per title, but keep ALL rows of the article that is kept.
# `drop_duplicates(subset='title')` on its own leaves a single row per title,
# which also throws away every other row of the surviving article.
# NOTE(review): the frame appears to hold several rows per unique_key
# (e.g. one per author) -- confirm against the Arcas export.
keys_to_keep = df.drop_duplicates(subset='title')['unique_key']
df = df[df['unique_key'].isin(keys_to_keep)]

In [26]:
# Distinct titles and distinct unique_key values (missing values count once).
df['title'].nunique(dropna=False), df['unique_key'].nunique(dropna=False)

(1386, 1386)

**Export clean json.**

In [27]:
# Write the de-duplicated frame to disk as JSON.
df.to_json(path_or_buf='../data/auction_November_2018_clean.json')