In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Input for this notebook: the CSV that the next cell loads with pd.read_csv
# (pandas infers the bz2 compression from the file extension by default).
file = 'arxiv_data_cs_all.csv.bz2'

In [3]:
# Read at most the first 50,000 rows, then cast every column to object so that
# missing cells can be held as None instead of NaN.
df = (
    pd.read_csv(file, nrows=50000)
    .pipe(lambda loaded: loaded.astype(object).where(loaded.notnull(), None))
)
df

Unnamed: 0,id,updated,published,title,summary,authors,affiliations,doi,journal_ref,pdf_link,primary_category,categories
0,http://arxiv.org/abs/1906.02739v1,2019-06-06T17:56:09Z,2019-06-06T17:56:09Z,Mesh R-CNN,Rapid advances in 2D perception have led to sy...,Georgia Gkioxari|Jitendra Malik|Justin Johnson,,,,http://arxiv.org/pdf/1906.02739v1,cs.CV,cs.CV
1,http://arxiv.org/abs/1906.02738v1,2019-06-06T17:55:37Z,2019-06-06T17:55:37Z,Conversing by Reading: Contentful Neural Conve...,Although neural conversation models are effect...,Lianhui Qin|Michel Galley|Chris Brockett|Xiaod...,,,,http://arxiv.org/pdf/1906.02738v1,cs.CL,cs.CL|cs.AI|cs.LG
2,http://arxiv.org/abs/1906.02736v1,2019-06-06T17:55:17Z,2019-06-06T17:55:17Z,DeepMDP: Learning Continuous Latent Space Mode...,Many reinforcement learning RL) tasks provide ...,Carles Gelada|Saurabh Kumar|Jacob Buckman|Ofir...,,,,http://arxiv.org/pdf/1906.02736v1,cs.LG,cs.LG|stat.ML
3,http://arxiv.org/abs/1906.02735v1,2019-06-06T17:55:01Z,2019-06-06T17:55:01Z,Residual Flows for Invertible Generative Modeling,Flow-based generative models parameterize prob...,Ricky T. Q. Chen|Jens Behrmann|David Duvenaud|...,,,,http://arxiv.org/pdf/1906.02735v1,stat.ML,stat.ML|cs.LG
4,http://arxiv.org/abs/1906.02611v1,2019-06-06T17:54:24Z,2019-06-06T17:54:24Z,Improving Robustness Without Sacrificing Accur...,Deploying machine learning systems in the real...,Raphael Gontijo Lopes|Dong Yin|Ben Poole|Justi...,,,,http://arxiv.org/pdf/1906.02611v1,cs.LG,cs.LG|cs.CV|stat.ML
5,http://arxiv.org/abs/1906.02732v1,2019-06-06T17:51:51Z,2019-06-06T17:51:51Z,A Look at the Effect of Sample Design on Gener...,This paper provides a general framework to stu...,Bhavya Kailkhura|Jayaraman J. Thiagarajan|Qunw...,,,,http://arxiv.org/pdf/1906.02732v1,cs.LG,cs.LG|stat.ML
6,http://arxiv.org/abs/1906.02729v1,2019-06-06T17:50:48Z,2019-06-06T17:50:48Z,3D-RelNet: Joint Object and Relational Network...,We propose an approach to predict the 3D shape...,Nilesh Kulkarni|Ishan Misra|Shubham Tulsiani|A...,,,,http://arxiv.org/pdf/1906.02729v1,cs.CV,cs.CV
7,http://arxiv.org/abs/1906.02728v1,2019-06-06T17:49:41Z,2019-06-06T17:49:41Z,Feature-level and Model-level Audiovisual Fusi...,Emotion recognition plays an important role in...,Jie Cai|Zibo Meng|Ahmed Shehab Khan|Zhiyuan Li...,,,,http://arxiv.org/pdf/1906.02728v1,cs.CV,cs.CV
8,http://arxiv.org/abs/1906.02719v1,2019-06-06T17:39:48Z,2019-06-06T17:39:48Z,Learning Gaussian Graphical Models with Ordere...,We address the task of identifying densely con...,Cody Mazza-Anthony|Bogdan Mazoure|Mark Coates,,,,http://arxiv.org/pdf/1906.02719v1,stat.ML,stat.ML|cs.LG
9,http://arxiv.org/abs/1906.02717v1,2019-06-06T17:36:34Z,2019-06-06T17:36:34Z,Adaptive Gradient-Based Meta-Learning Methods,We build a theoretical framework for understan...,Mikhail Khodak|Maria Florina-Balcan|Ameet Talw...,,,,http://arxiv.org/pdf/1906.02717v1,cs.LG,cs.LG|cs.AI|stat.ML


In [4]:
# Row accumulators: plain lists of rows that are converted to DataFrames at the
# end of this cell. Each name gets its own independent list.
df_publications, df_pub_categories, df_pub_prim_categories, df_pub_authors = (
    [] for _ in range(4)
)
# Name -> numeric id lookups for categories and authors.
df_categories, df_authors = {}, {}

# Accepts only names built from ASCII letters, dots and hyphens
# (e.g. "cs.CV", "quant-ph"); anything containing digits or spaces is rejected.
pattern = re.compile(r"^([A-Za-z\.-])+$")

def check_category(cat):
    """Return a match object when `cat` is a well-formed category name, else None.

    Parameters
    ----------
    cat : str or None
        Candidate category name.

    Returns
    -------
    re.Match or None
        Truthy match object for an accepted name; None for a rejected name or
        for non-string input.

    Non-string input (such as the None placeholders that stand in for missing
    cells) is rejected here rather than raising TypeError in `pattern.match`.
    """
    if not isinstance(cat, str):
        return None
    return pattern.match(cat)

def _pipes_to_commas(value):
    """Swap '|' separators for ',' in a string; pass missing/non-string values through."""
    return value.replace('|', ',') if isinstance(value, str) else value

# Walk every publication once and fill the accumulators:
#   - one row per publication (with '|' lists flattened to ',' lists),
#   - category / author lookups that hand out ids 1, 2, 3, ... in first-seen order,
#   - link rows tying each publication id to its category and author ids.
for index, row in df.iterrows():
    pub_id = row['id']
    # `categories` / `authors` may be None for missing cells (the checks further
    # down already allow for that), so they must not be `.replace`d blindly.
    df_publications.append([pub_id, row['title'], row['summary'], row['pdf_link'],
                            _pipes_to_commas(row['categories']),
                            _pipes_to_commas(row['authors'])])

    primary = row['primary_category']
    # Guard against a missing primary category before handing it to the regex.
    if isinstance(primary, str) and check_category(primary):
        # setdefault evaluates len() before inserting, so a new name gets the
        # next free id and an existing name keeps the id it already has.
        p_cat_id = df_categories.setdefault(primary, len(df_categories) + 1)
        df_pub_prim_categories.append([pub_id, p_cat_id])

    if row['categories']:
        for cat in row['categories'].split('|'):
            if check_category(cat):
                cat_id = df_categories.setdefault(cat, len(df_categories) + 1)
                df_pub_categories.append([pub_id, cat_id])

    if row['authors']:
        for aut in row['authors'].split('|'):
            aut_id = df_authors.setdefault(aut, len(df_authors) + 1)
            df_pub_authors.append([pub_id, aut_id])

# Turn the accumulators into DataFrames; each name is rebound from its
# list/dict form to the finished table.
df_publications = pd.DataFrame(
    df_publications,
    columns=['id', 'title', 'summary', 'pdf_link', 'categories', 'authors'],
)
# The lookups map name -> id, so every pair is flipped to produce (id, name) rows.
df_categories = pd.DataFrame(
    [[number, label] for label, number in df_categories.items()],
    columns=['id', 'category'],
)
df_authors = pd.DataFrame(
    [[number, label] for label, number in df_authors.items()],
    columns=['id', 'author'],
)
# Link tables: one row per (publication, related id) pair.
df_pub_categories = pd.DataFrame(
    df_pub_categories, columns=['publication_id', 'category_id']
)
df_pub_prim_categories = pd.DataFrame(
    df_pub_prim_categories, columns=['publication_id', 'category_id']
)
df_pub_authors = pd.DataFrame(
    df_pub_authors, columns=['publication_id', 'author_id']
)


In [None]:
# Remove every backslash from the string cells.
df_publications = df_publications.replace(r'\\', '', regex=True)
# Swap double quotes for single quotes. regex=True is needed for this to act on
# substrings: without it, replace() only matches cells whose entire value is
# exactly '"', so quotes embedded in titles and summaries were left in place.
df_publications = df_publications.replace('"', "'", regex=True)
# Write the cleaned table without the index column, then display it.
df_publications.to_csv('./output/publications.csv', mode='w+', index=False)
df_publications

Unnamed: 0,id,title,summary,pdf_link,categories,authors
0,http://arxiv.org/abs/1906.02739v1,Mesh R-CNN,Rapid advances in 2D perception have led to sy...,http://arxiv.org/pdf/1906.02739v1,cs.CV,"Georgia Gkioxari,Jitendra Malik,Justin Johnson"
1,http://arxiv.org/abs/1906.02738v1,Conversing by Reading: Contentful Neural Conve...,Although neural conversation models are effect...,http://arxiv.org/pdf/1906.02738v1,"cs.CL,cs.AI,cs.LG","Lianhui Qin,Michel Galley,Chris Brockett,Xiaod..."
2,http://arxiv.org/abs/1906.02736v1,DeepMDP: Learning Continuous Latent Space Mode...,Many reinforcement learning RL) tasks provide ...,http://arxiv.org/pdf/1906.02736v1,"cs.LG,stat.ML","Carles Gelada,Saurabh Kumar,Jacob Buckman,Ofir..."
3,http://arxiv.org/abs/1906.02735v1,Residual Flows for Invertible Generative Modeling,Flow-based generative models parameterize prob...,http://arxiv.org/pdf/1906.02735v1,"stat.ML,cs.LG","Ricky T. Q. Chen,Jens Behrmann,David Duvenaud,..."
4,http://arxiv.org/abs/1906.02611v1,Improving Robustness Without Sacrificing Accur...,Deploying machine learning systems in the real...,http://arxiv.org/pdf/1906.02611v1,"cs.LG,cs.CV,stat.ML","Raphael Gontijo Lopes,Dong Yin,Ben Poole,Justi..."
5,http://arxiv.org/abs/1906.02732v1,A Look at the Effect of Sample Design on Gener...,This paper provides a general framework to stu...,http://arxiv.org/pdf/1906.02732v1,"cs.LG,stat.ML","Bhavya Kailkhura,Jayaraman J. Thiagarajan,Qunw..."
6,http://arxiv.org/abs/1906.02729v1,3D-RelNet: Joint Object and Relational Network...,We propose an approach to predict the 3D shape...,http://arxiv.org/pdf/1906.02729v1,cs.CV,"Nilesh Kulkarni,Ishan Misra,Shubham Tulsiani,A..."
7,http://arxiv.org/abs/1906.02728v1,Feature-level and Model-level Audiovisual Fusi...,Emotion recognition plays an important role in...,http://arxiv.org/pdf/1906.02728v1,cs.CV,"Jie Cai,Zibo Meng,Ahmed Shehab Khan,Zhiyuan Li..."
8,http://arxiv.org/abs/1906.02719v1,Learning Gaussian Graphical Models with Ordere...,We address the task of identifying densely con...,http://arxiv.org/pdf/1906.02719v1,"stat.ML,cs.LG","Cody Mazza-Anthony,Bogdan Mazoure,Mark Coates"
9,http://arxiv.org/abs/1906.02717v1,Adaptive Gradient-Based Meta-Learning Methods,We build a theoretical framework for understan...,http://arxiv.org/pdf/1906.02717v1,"cs.LG,cs.AI,stat.ML","Mikhail Khodak,Maria Florina-Balcan,Ameet Talw..."


In [None]:
# Write the category table to CSV without the index column, then display it.
df_categories.to_csv(path_or_buf='./output/categories.csv', index=False, mode='w+')
df_categories

Unnamed: 0,id,category
0,1,cs.CV
1,2,cs.CL
2,3,cs.AI
3,4,cs.LG
4,5,stat.ML
5,6,math.OC
6,7,cs.IT
7,8,math.IT
8,9,quant-ph
9,10,cs.SI


In [None]:
# Write the publication/category link table to CSV without the index column,
# then display it.
df_pub_categories.to_csv(
    path_or_buf='./output/publications_categories.csv', index=False, mode='w+'
)
df_pub_categories

Unnamed: 0,publication_id,category_id
0,http://arxiv.org/abs/1906.02739v1,1
1,http://arxiv.org/abs/1906.02738v1,2
2,http://arxiv.org/abs/1906.02738v1,3
3,http://arxiv.org/abs/1906.02738v1,4
4,http://arxiv.org/abs/1906.02736v1,4
5,http://arxiv.org/abs/1906.02736v1,5
6,http://arxiv.org/abs/1906.02735v1,5
7,http://arxiv.org/abs/1906.02735v1,4
8,http://arxiv.org/abs/1906.02611v1,4
9,http://arxiv.org/abs/1906.02611v1,1


In [None]:
# Write the publication/primary-category link table to CSV without the index
# column, then display it.
df_pub_prim_categories.to_csv(
    path_or_buf='./output/publications_primary_categories.csv', index=False, mode='w+'
)
df_pub_prim_categories

Unnamed: 0,publication_id,category_id
0,http://arxiv.org/abs/1906.02739v1,1
1,http://arxiv.org/abs/1906.02738v1,2
2,http://arxiv.org/abs/1906.02736v1,4
3,http://arxiv.org/abs/1906.02735v1,5
4,http://arxiv.org/abs/1906.02611v1,4
5,http://arxiv.org/abs/1906.02732v1,4
6,http://arxiv.org/abs/1906.02729v1,1
7,http://arxiv.org/abs/1906.02728v1,1
8,http://arxiv.org/abs/1906.02719v1,5
9,http://arxiv.org/abs/1906.02717v1,4


In [None]:
# Write the author table to CSV without the index column, then display it.
df_authors.to_csv(path_or_buf='./output/authors.csv', index=False, mode='w+')
df_authors

Unnamed: 0,id,author
0,1,Georgia Gkioxari
1,2,Jitendra Malik
2,3,Justin Johnson
3,4,Lianhui Qin
4,5,Michel Galley
5,6,Chris Brockett
6,7,Xiaodong Liu
7,8,Xiang Gao
8,9,Bill Dolan
9,10,Yejin Choi


In [None]:
# Write the publication/author link table to CSV without the index column,
# then display it.
df_pub_authors.to_csv(
    path_or_buf='./output/publications_authors.csv', index=False, mode='w+'
)
df_pub_authors

In [None]:
# Write the affiliations table to CSV without the index column, then display it.
# NOTE(review): `df_affiliations` is not assigned in any cell visible in this
# notebook; on a fresh kernel (Restart & Run All) this cell would raise
# NameError unless the frame is built somewhere else — confirm, or build it
# from the `affiliations` column before this cell.
df_affiliations.to_csv('./output/affiliations.csv', mode='w+', index=False)
df_affiliations