In [1]:
import pandas as pd
from pathlib import Path

In [2]:
def clean_data(df):
    """
    Return a cleaned copy of a publications frame.

    Three steps are applied, in this order:

    1. a fixed set of columns is dropped (every one of them must be
       present, otherwise pandas raises ``KeyError``);
    2. rows whose ``author`` value is missing are discarded;
    3. rows that are exact duplicates across all remaining columns are
       discarded, keeping the first occurrence.

    The frame passed in is not modified, and the surviving rows keep
    their original index labels.
    """
    unwanted = (
        'url', 'key', 'doi', 'provenance', 'category',
        'score', 'open_access', 'abstract', 'index',
    )
    return (
        df.drop(columns=list(unwanted))
          .dropna(subset=['author'])
          .drop_duplicates()
    )

**Maths Data**

In [3]:
# Load every Springer mathematics export and stack them into a single frame.
# sorted() pins the file order: glob yields paths in whatever order the
# filesystem returns them, which would otherwise let the row order (and which
# duplicate survives the later drop_duplicates calls) vary between runs.
springer = [
    pd.read_json(filename)
    for filename in sorted(Path('').glob('WiM_data/springer*.json'))
]

# reset_index() keeps the old labels as a column. pandas names that column
# 'level_0' only when an 'index' column already exists -- presumably the case
# for these files (TODO confirm); it is dropped again right below.
springer = pd.concat(springer).reset_index()
springer['source'] = 'Springer'
springer = springer.drop(columns=['level_0'])

In [4]:
# Read the IEEE mathematics export and tag every row with its provenance.
ieee = (
    pd.read_json('WiM_data/ieee_women_data.json')
      .assign(source='Ieee')
)

In [5]:
# Load every arXiv export and stack them into a single frame.
# sorted() pins the file order: glob yields paths in whatever order the
# filesystem returns them, which would otherwise let the row order (and which
# duplicate survives the later drop_duplicates calls) vary between runs.
arxiv = [
    pd.read_json(filename)
    for filename in sorted(Path('').glob('WiM_data/arxiv*.json'))
]

# reset_index() keeps the old labels as a column. pandas names that column
# 'level_0' only when an 'index' column already exists -- presumably the case
# for these files (TODO confirm); it is dropped again right below.
arxiv = pd.concat(arxiv).reset_index()
arxiv['source'] = 'arxiv'
arxiv = arxiv.drop(columns=['level_0'])

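**Clean arXiv author names**

Some arXiv author strings may begin with a stray space; the next cell removes one leading space from each such name.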

In [6]:
# Build the cleaned author list: a name that starts with a space loses exactly
# that one leading space; every other name is kept as it is.
names = [
    author[1:] if author.startswith(' ') else author
    for author in arxiv['author']
]

In [7]:
# Sanity check: number of cleaned names. It has to equal len(arxiv) before the
# list is written back into the 'author' column.
len(names)

21134

In [8]:
# Row count of the arXiv frame, to compare against len(names).
len(arxiv)

21134

In [9]:
# Overwrite the author column with the cleaned names (assigned positionally,
# one list entry per row).
arxiv['author'] = names

In [10]:
# Run the shared cleaning step over each mathematics source, in a fixed order.
dfs = list(map(clean_data, (springer, ieee, arxiv)))

In [11]:
# Stack the cleaned frames row-wise. sort=False leaves the columns in the order
# they are encountered instead of sorting the column names.
df = pd.concat(dfs, sort=False)

In [12]:
# Remove the 'journal' and 'primary_category' columns from the combined frame
# (presumably they are only filled in for some of the sources -- TODO confirm).
df = df.drop(columns=['journal', 'primary_category'])

In [13]:
# Dropping columns can make previously distinct rows identical, so remove exact
# duplicates (across all remaining columns) once more.
df = df.drop_duplicates()

In [14]:
# Give the combined frame a fresh 0..n-1 index. drop=True is not passed, so the
# labels inherited from the concatenated frames are kept as a regular column.
df = df.reset_index()

In [15]:
# Number of rows left in the combined mathematics frame.
len(df)

37948

In [16]:
# Write the combined mathematics frame to disk as JSON.
df.to_json('WiM_data/publications_in_mathematics.json')

**Psychology**

In [17]:
# Load every Springer psychology export into a list of frames.
# sorted() pins the file order: glob yields paths in whatever order the
# filesystem returns them, which would otherwise let the row order of the
# concatenated frame vary between runs.
springer = [
    pd.read_json(filename)
    for filename in sorted(Path('').glob('WiM_data/Psy/springer*.json'))
]

In [18]:
# Stack the per-file frames and renumber the rows. reset_index() keeps the old
# labels as a column; pandas names it 'level_0' when an 'index' column already
# exists (presumably the case here -- TODO confirm).
springer = pd.concat(springer).reset_index()

In [19]:
# Tag every row with its provenance and discard the 'level_0' column.
springer = (
    springer
    .assign(source='Springer')
    .drop(columns=['level_0'])
)

In [20]:
# Count the distinct 'unique_key' values; dropna=False makes a missing key count
# as one value, exactly as len(Series.unique()) does.
springer['unique_key'].nunique(dropna=False)

9969

In [21]:
# Read the IEEE psychology export.
ieee = pd.read_json('WiM_data/ieee_women_data_psychology.json')
# Tag the provenance, as is done for the Springer frame and for the IEEE
# mathematics data; without it the IEEE rows reach the combined psychology
# output without a 'source' tag.
ieee['source'] = 'Ieee'

In [22]:
# Run the shared cleaning step over each psychology source, in a fixed order.
dfs = list(map(clean_data, (springer, ieee)))

In [23]:
# Stack the cleaned frames row-wise. sort=False leaves the columns in the order
# they are encountered instead of sorting the column names.
df = pd.concat(dfs, sort=False)

In [24]:
# Remove rows that are exact duplicates across all columns of the combined frame.
df = df.drop_duplicates()

In [25]:
# Give the combined frame a fresh 0..n-1 index. drop=True is not passed, so the
# labels inherited from the concatenated frames are kept as a regular column.
df = df.reset_index()

In [26]:
# Write the combined psychology frame to disk as JSON.
df.to_json('WiM_data/publications_in_psychology.json')