# Setup

In [2]:
import numpy as np
import pandas as pd
import json, os

# Data Fetching

In [3]:
!git clone https://github.com/allenai/PeerRead.git

Cloning into 'PeerRead'...
remote: Enumerating objects: 27026, done.
remote: Counting objects: 100% (3/3), done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 27026 (delta 0), reused 0 (delta 0), pack-reused 27023
Receiving objects: 100% (27026/27026), 1.15 GiB | 25.90 MiB/s, done.
Resolving deltas: 100% (1466/1466), done.
Checking out files: 100% (25772/25772), done.


# Data Processing

In [4]:
def get_all_json(path):
  """Load every ``*.json`` file under ``path``, grouped by split and kind.

  Files ending in ``.pdf.json`` are treated as parsed papers; every other
  ``.json`` file is treated as a review file. Each file is keyed by the part
  of its file name before the first ``.``.

  The split (``dev``, ``test`` or ``train``, checked in that order) is taken
  from a directory named exactly like the split somewhere between ``path``
  and the file. If no such directory exists, the split falls back to a
  substring search over the whole file path. Files that match no split are
  skipped.

  Args:
    path: Root directory to walk recursively.

  Returns:
    A 6-tuple of dicts mapping id -> decoded JSON, in the order
    ``(dev_parsed, dev_reviews, test_parsed, test_reviews,
    train_parsed, train_reviews)``.
  """
  splits = ('dev', 'test', 'train')  # priority order when several match
  parsed = {name: {} for name in splits}
  reviews = {name: {} for name in splits}

  for subdir, _dirs, files in os.walk(path):
    # Look only at directory names *below* the root, so that a root path or
    # file name that merely contains 'dev' / 'test' / 'train' cannot misroute
    # a file that sits inside a proper split directory.
    rel_parts = os.path.relpath(subdir, path).split(os.sep)
    dir_split = next((name for name in splits if name in rel_parts), None)

    for f in files:
      if not f.endswith('.json'):
        continue
      path_name = os.path.join(subdir, f)

      split = dir_split
      if split is None:
        # No split directory below the root (e.g. the root itself is a split
        # directory): keep the substring rule so such calls still work.
        split = next((name for name in splits if name in path_name), None)
      if split is None:
        continue

      # Use the file name directly instead of splitting the joined path on
      # '/', which breaks where os.path.join uses a different separator.
      paper_id = f.split('.')[0]

      # Context manager closes the handle deterministically.
      with open(path_name, 'r', encoding='utf-8') as handle:
        content = json.load(handle)

      bucket = parsed if f.endswith('.pdf.json') else reviews
      bucket[split][paper_id] = content

  return (parsed['dev'], reviews['dev'],
          parsed['test'], reviews['test'],
          parsed['train'], reviews['train'])

# Load each PeerRead venue in turn. Every *_data name is bound to the 6-tuple
# returned by get_all_json:
# (dev_parsed, dev_reviews, test_parsed, test_reviews, train_parsed, train_reviews).
(acl_data,
 cs_ai_data,
 cs_cl_data,
 cs_lg_data,
 conll_data,
 iclr_data,
 nips_data) = map(get_all_json, (
  'PeerRead/data/acl_2017',
  'PeerRead/data/arxiv.cs.ai_2007-2017',
  'PeerRead/data/arxiv.cs.cl_2007-2017',
  'PeerRead/data/arxiv.cs.lg_2007-2017',
  'PeerRead/data/conll_2016',
  'PeerRead/data/iclr_2017',
  'PeerRead/data/nips_2013-2017',
))

# Creating the DataFrames

In [46]:
def create_dataframe(json_obj):
  """Build a DataFrame with one row per entry of ``json_obj``.

  Each row starts with the entry's key as ``id``. When the entry's
  ``metadata`` has a non-null ``references`` / ``referenceMentions`` value,
  its length is stored as ``numReferences`` / ``numReferenceMentions``.
  The metadata fields themselves (after ``clean_metadata``) are then merged
  in, overriding any earlier key of the same name.

  Args:
    json_obj: Mapping of id -> dict that has a ``'metadata'`` dict.

  Returns:
    A ``pd.DataFrame`` built from the list of row dicts.
  """
  # (metadata key, name of the derived count column)
  counted_fields = (
    ('references', 'numReferences'),
    ('referenceMentions', 'numReferenceMentions'),
  )
  rows = []
  for paper_id, paper in json_obj.items():
    metadata = paper['metadata']
    row = {'id': paper_id}
    for source_key, count_key in counted_fields:
      # Absent and explicitly-null fields both leave the count column unset.
      if metadata.get(source_key) is not None:
        row[count_key] = len(metadata[source_key])
    row.update(clean_metadata(metadata))
    rows.append(row)
  return pd.DataFrame(rows)

def create_review_dataframe(json_obj):
  """Build a DataFrame with one row per review found in ``json_obj``.

  Each row carries the entry's key as ``id`` and the review's 1-based
  position within that entry as ``reviewNum``, followed by the review's own
  fields (after ``clean_metadata``), which override earlier keys of the same
  name.

  An entry whose ``'reviews'`` value is missing, ``None`` or empty
  contributes no rows.

  Args:
    json_obj: Mapping of id -> dict that may hold a ``'reviews'`` list of
      review dicts.

  Returns:
    A ``pd.DataFrame`` built from the list of row dicts.
  """
  rows = []
  for paper_id, paper in json_obj.items():
    # Tolerate a missing or null review list instead of raising.
    reviews = paper.get('reviews') or []
    for review_num, review in enumerate(reviews, start=1):
      rows.append({'id': paper_id, 'reviewNum': review_num, **clean_metadata(review)})
  return pd.DataFrame(rows)

def clean_metadata(metadata):
  """Replace a null ``is_meta_review`` value with ``False``, in place.

  The dict is changed only when the ``'is_meta_review'`` key is present and
  its value is ``None``; a missing key stays missing.

  Args:
    metadata: Dict to normalise. It is modified in place.

  Returns:
    The same ``metadata`` object that was passed in.
  """
  # The sentinel keeps "key absent" distinct from "key present with None".
  absent = object()
  if metadata.get('is_meta_review', absent) is None:
    metadata['is_meta_review'] = False
  return metadata

# acl_data holds (dev_parsed, dev_reviews, test_parsed, test_reviews,
# train_parsed, train_reviews); build the paper frame and the review frame
# for one split at a time.
acl_data_dev_df, acl_review_dev_df = (
  create_dataframe(acl_data[0]),
  create_review_dataframe(acl_data[1]),
)
acl_data_test_df, acl_review_test_df = (
  create_dataframe(acl_data[2]),
  create_review_dataframe(acl_data[3]),
)
acl_data_train_df, acl_review_train_df = (
  create_dataframe(acl_data[4]),
  create_review_dataframe(acl_data[5]),
)
# Bare last expression so the notebook renders the training-split reviews.
acl_review_train_df

Unnamed: 0,id,reviewNum,IMPACT,SUBSTANCE,APPROPRIATENESS,MEANINGFUL_COMPARISON,PRESENTATION_FORMAT,comments,SOUNDNESS_CORRECTNESS,ORIGINALITY,is_meta_review,RECOMMENDATION,CLARITY,REVIEWER_CONFIDENCE
0,178,1,3,3,4,3,Poster,The paper describes an extension of word embed...,4,3,False,2,4,3
1,178,2,3,4,5,3,Poster,Summary: This paper presents a model for embed...,3,2,False,2,4,4
2,178,3,3,4,5,3,Poster,The authors presents a method to jointly embed...,3,3,False,2,3,4
3,384,1,3,2,2,3,Poster,- Strengths:\n - the model if theoretically so...,5,5,False,3,4,5
4,384,2,3,3,5,3,Poster,- Strengths:\n\nThis paper presents an approac...,5,5,False,3,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,67,1,3,1,5,4,Poster,- Strengths:\n- The paper tackles an important...,5,3,False,2,5,5
244,67,2,3,4,5,4,Poster,"- Strengths:\n\n * Knowledge lean, language-i...",5,3,False,2,2,4
245,333,1,3,4,5,3,Oral Presentation,- Strengths:\n\nThe authors propose a selectiv...,5,5,False,4,5,3
246,333,2,3,4,5,3,Oral Presentation,- Strengths:\n\nThe paper is very clear and we...,5,5,False,4,5,4
