# Cleaning Data PD


This notebook cleans the article metadata retrieved with the Arcas software.

In [1]:
import glob
import pandas as pd


In [2]:
# Load every per-source metadata export matching PD_*.json.
# glob.glob returns matches in an arbitrary, filesystem-dependent order, so the
# paths are sorted: this keeps the concatenation order (and therefore which row
# a later keep-first de-duplication retains) identical on every run.
dfs = [pd.read_json(filename) for filename in sorted(glob.glob("../data/PD_*.json"))]


In [3]:
# Add the bibliography export to the frames collected from the PD_*.json files.
bibliography = pd.read_json("../data/bibliography.json")
dfs.append(bibliography)


In [4]:
# Stack all loaded frames into a single table, discarding the per-file row
# labels in favour of a fresh consecutive index.
df = pd.concat(objs=dfs, ignore_index=True)


In [5]:
# Distinct values of the provenance column in the combined table.
df["provenance"].unique()


array(['Springer', 'Nature', 'PLOS', 'IEEE', 'arXiv', 'Manual'], dtype=object)

In [6]:
# (number of distinct titles, number of distinct unique_key values)
tuple(len(df[column].unique()) for column in ("title", "unique_key"))


(3096, 3193)

In [7]:
# Count articles per provenance: first collapse the table to one row per
# (unique_key, provenance) pair, then count those rows for each provenance.
key_provenance_pairs = df.groupby(["unique_key", "provenance"]).size().reset_index()
provenance_size = key_provenance_pairs.groupby("provenance").size()
provenance_size


provenance
IEEE         295
Manual        79
Nature       687
PLOS         482
Springer     576
arXiv       1074
dtype: int64

In [8]:
# Drop rows dated before 1950 or after 2018. The out-of-range mask is negated
# (rather than testing for an in-range date) so that rows for which both
# comparisons are False -- presumably rows with a missing date -- are kept.
outside_date_range = (df["date"] < 1950) | (df["date"] > 2018)
df = df[~outside_date_range]


In [9]:
# Rewrite the year 2021 as 2015 (presumably a known mis-dated record -- confirm).
# The nested mapping restricts the replacement to the "date" column; a
# frame-wide replace would also rewrite any other cell that happens to hold
# the value 2021.
# NOTE(review): the preceding cell already removes rows with date > 2018, so no
# 2021 dates should reach this point -- confirm whether this correction was
# meant to run before that filter.
df = df.replace({"date": {2021: 2015}})


In [10]:
# Save the date-filtered table, before any duplicate handling.
df.to_json(path_or_buf="../data/pd_November_2018.json")


Duplicate articles
------------------

In [11]:
# For each title, count how many distinct unique_key values it appears with;
# titles carrying more than one key are kept as the duplicated ones.
title_key_pairs = df.groupby(["title", "unique_key"]).size().reset_index()
table = title_key_pairs.groupby("title").count()
has_several_keys = table["unique_key"] > 1
duplicates = table[has_several_keys]


In [12]:
# Titles in the table that appear under more than one unique_key.
in_duplicates = df["title"].isin(duplicates.index)
duplicates_title = df[in_duplicates]["title"].unique()


In [13]:
# Duplicated titles that have at least one row whose provenance is arXiv.
title_is_duplicated = df["title"].isin(duplicates.index)
from_arxiv = df["provenance"] == "arXiv"
duplicates_in_arxiv = df[title_is_duplicated & from_arxiv]["title"].unique()


In [14]:
# Duplicated titles that have no arXiv row at all.
all_duplicated_titles = set(duplicates_title)
arxiv_duplicated_titles = set(duplicates_in_arxiv)
diff = list(all_duplicated_titles - arxiv_duplicated_titles)


In [15]:
# Every row whose provenance is not arXiv.
df_without_arxiv = df[df["provenance"] != "arXiv"]


In [16]:
# Keep only the first row seen for each title among the non-arXiv rows.
df_without_arxiv = df_without_arxiv.drop_duplicates(subset=["title"], keep="first")


In [17]:
# Save the non-arXiv, title-deduplicated table.
df_without_arxiv.to_json(path_or_buf="../data/pd_November_2018_without_arxiv.json")


**Drop the arXiv copies of duplicated articles.**

In [18]:
# unique_key values of the arXiv rows whose title is duplicated; these are the
# records removed from the main table.
# NOTE(review): a title duplicated only within arXiv (no copy from another
# provenance) would have every one of its rows selected here -- confirm that
# losing such articles entirely is intended.
arxiv_rows = df[df["provenance"] == "arXiv"]
arxiv_duplicate_rows = arxiv_rows[arxiv_rows["title"].isin(duplicates.index)]
articles_to_drop = arxiv_duplicate_rows["unique_key"].unique()


In [19]:
# Remove every row whose unique_key was selected for dropping.
is_dropped = df["unique_key"].isin(articles_to_drop)
df = df[~is_dropped]


In [20]:
# (distinct titles, distinct unique_key values) after the drop; dropna=False so
# a missing value is counted as one distinct value, as it is by unique().
df["title"].nunique(dropna=False), df["unique_key"].nunique(dropna=False)


(3077, 3155)

**Export clean json.**

In [21]:
# Save the cleaned table as JSON.
df.to_json(path_or_buf="../data/pd_November_2018_clean.json")


In [22]:
# Save the same cleaned table as CSV (the row index is written as the first column).
df.to_csv(path_or_buf="../data/prisoners_dilemma_articles_meta_data.csv")