# Cleaning Data


In [1]:
import glob
import pandas as pd


In [2]:
# Read every Auction_* JSON dump into its own DataFrame.
# glob.glob() yields matches in arbitrary, filesystem-dependent order, so the
# paths are sorted first: this keeps the row order of the combined frame (and
# with it the exported files and order-sensitive steps such as
# drop_duplicates) identical from one run/machine to the next.
dfs = []
for filename in sorted(glob.glob("../data/Auction_*.json")):
    dfs.append(pd.read_json(filename))


In [3]:
# Stack the per-file frames row-wise into one table with a fresh 0..n-1 index.
df = pd.concat(objs=dfs, axis=0, ignore_index=True)


In [4]:
# Distinct sources the records were collected from.
df["provenance"].unique()


array(['Springer', 'IEEE', 'arXiv', 'Nature'], dtype=object)

In [5]:
# (number of distinct titles, number of distinct article keys);
# dropna=False keeps the count identical to len(Series.unique()).
df["title"].nunique(dropna=False), df["unique_key"].nunique(dropna=False)


(3461, 3569)

In [6]:
# Collapse the frame to one row per (unique_key, provenance) pair ...
key_provenance_pairs = df.groupby(["unique_key", "provenance"]).size().reset_index()
# ... then count how many distinct keys each provenance contributes.
provenance_size = key_provenance_pairs.groupby("provenance").size()
provenance_size


provenance
IEEE         257
Nature       213
Springer    1301
arXiv       1798
dtype: int64

In [7]:
# Discard records dated before 1974 or after 2018.
# The "out of range" mask is negated (instead of testing for "in range") so
# rows whose date compares False on both tests, e.g. NaN, are kept.
date_out_of_range = (df["date"] < 1974) | (df["date"] > 2018)
df = df[~date_out_of_range]


In [8]:
# Snapshot of the date-filtered frame, written before any further cleaning.
df.to_json(path_or_buf="../data/auction_November_2018.json")


In [9]:
# Normalise author strings to lower case.
df["author"] = df["author"].str.lower()


Duplicate articles
------------------

In [10]:
# One row per (title, unique_key) pair that occurs in the data.
title_key_pairs = df.groupby(["title", "unique_key"]).size().reset_index()
# For each title, count how many such pairs (i.e. distinct keys) it has.
table = title_key_pairs.groupby("title").count()
# Titles that appear under more than one unique_key.
has_several_keys = table["unique_key"] > 1
duplicates = table[has_several_keys]


In [11]:
# Every title in df that is flagged as duplicated (index of `duplicates`).
has_duplicated_title = df["title"].isin(duplicates.index)
duplicates_title = df.loc[has_duplicated_title, "title"].unique()


In [12]:
# Duplicated titles that have at least one record coming from arXiv.
is_duplicated_title = df["title"].isin(duplicates.index)
is_from_arxiv = df["provenance"] == "arXiv"
duplicates_in_arxiv = df.loc[is_duplicated_title & is_from_arxiv, "title"].unique()


In [13]:
# Duplicated titles with no arXiv record at all (order of the list is arbitrary).
all_duplicated_titles = set(duplicates_title)
arxiv_duplicated_titles = set(duplicates_in_arxiv)
diff = list(all_duplicated_titles - arxiv_duplicated_titles)


In [14]:
# All records whose provenance is anything other than arXiv.
df_without_arxiv = df[df["provenance"] != "arXiv"]


In [15]:
# Keep only the first row encountered for each title.
df_without_arxiv = df_without_arxiv.drop_duplicates(subset=["title"], keep="first")


In [16]:
# df_without_arxiv.to_json('../data/pd_November_2018_without_arxiv.json')


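**Drop duplicates.** Remove the records of every arXiv `unique_key` whose title also appears under another `unique_key`.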

In [17]:
# Keys of the arXiv records whose title is flagged as duplicated.
is_duplicated_title = df["title"].isin(duplicates.index)
is_from_arxiv = df["provenance"] == "arXiv"
articles_to_drop = df.loc[is_duplicated_title & is_from_arxiv, "unique_key"].unique()


In [18]:
# Remove every row that belongs to one of the keys selected for dropping.
is_dropped_article = df["unique_key"].isin(articles_to_drop)
df = df[~is_dropped_article]


In [19]:
# (distinct titles, distinct article keys) after the arXiv duplicates were removed;
# dropna=False keeps the count identical to len(Series.unique()).
df["title"].nunique(dropna=False), df["unique_key"].nunique(dropna=False)


(3444, 3525)

In [20]:
# Drop records dated before 1974.
# NOTE(review): the same lower bound is already applied earlier in this
# notebook and only row removals happen in between, so this looks like a
# no-op -- confirm before deleting it.
dated_before_1974 = df["date"] < 1974
df = df[~dated_before_1974]


**Export the cleaned data (JSON and CSV).**

In [21]:
# Write the de-duplicated frame to JSON.
df.to_json(path_or_buf="../data/auction_November_2018_clean.json")


In [22]:
# Write the same frame as CSV.
# NOTE(review): this notebook loads Auction_* files and its JSON exports are
# named auction_*, yet the CSV is called price_of_anarchy_* -- confirm this is
# intended and not a copy-paste that overwrites another data set's file.
df.to_csv(path_or_buf="../data/price_of_anarchy_articles_meta_data.csv")