In [1]:
from itertools import combinations

import numpy as np
import pandas as pd

from genderize import Genderize


In [2]:
# read in parsed data in csv format
df = pd.read_csv('dblp_article.csv', sep=';')
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,author,author-aux,author-orcid,booktitle,cdate,cdrom,cite,cite-label,crossref,...,publisher,publtype,sub,sup,title,title-bibtex,tt,url,volume,year
0,91701,,,,,,,,,,...,,informal,,,(error),,,,,
1,91702,,,,,,,,,,...,,informal,,,(was never published),,,,,
2,91703,,,,,,,,,,...,,informal,,,…,,,,,
3,2480033,Frank Manola,,,,,,,,,...,,informal,,,Object Data Language Facilities for Multimedia...,,,db/journals/gtelab/index.html#TR-0169-12-91-165,TR-0169-12-91-165,1991.0
4,2480034,Alejandro P. Buchmann|Frank Manola|Mark F. Hor...,,,,,,,,,...,,informal,,,Object Data Model Facilities for Multimedia Da...,,,db/journals/gtelab/index.html#TM-0332-11-90-165,TM-0332-11-90-165,1990.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137507,7186397,C. J. Date|E. F. Codd,,,,,,,,,...,,,,,Interactive Support for Non-Programmers: The R...,,,,RJ1400,1974.0
2137508,7186398,Patrick A. V. Hall,,,,,,,,,...,,,,,Common Subexpression Identification in General...,,,,,1974.0
2137509,7186399,Hugo Hellebrand|Markus Casper|Ralf Merz|Rita Ley,,0000-0002-1163-8988,,,,,,,...,,,,,Catchment classification by runoff behaviour w...,,,,15,2011.0
2137510,7186400,E. F. Codd,,,,,ibmTR/rj987.pdf,,,,...,,informal,,,Relational Completeness of Data Base Sublangua...,,,,RJ987,1972.0


In [7]:
# only take four columns, drop nan, reset index, split authors to list
df_parsed = df[['id', 'author', 'title', 'year']]
df_parsed.dropna(inplace=True)
df_parsed.reset_index(drop=True, inplace=True)
df_parsed['year'] = df_parsed['year'].astype('int64')
df_parsed['author'] = df_parsed['author'].str.split('|')
df_parsed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,id,author,title,year
0,2480033,[Frank Manola],Object Data Language Facilities for Multimedia...,1991
1,2480034,"[Alejandro P. Buchmann, Frank Manola, Mark F. ...",Object Data Model Facilities for Multimedia Da...,1990
2,2480035,[Frank Manola],An Evaluation of Object-Oriented DBMS Developm...,1994
3,2480036,"[Farshad Nayeri, Joe D. Morrison, Mark F. Horn...","Integrating Heterogeneous, Autonomous, Distrib...",1991
4,2480037,[Frank Manola],Object Model Capabilities For Distributed Obje...,1989
...,...,...,...,...
2122143,7186397,"[C. J. Date, E. F. Codd]",Interactive Support for Non-Programmers: The R...,1974
2122144,7186398,[Patrick A. V. Hall],Common Subexpression Identification in General...,1974
2122145,7186399,"[Hugo Hellebrand, Markus Casper, Ralf Merz, Ri...",Catchment classification by runoff behaviour w...,2011
2122146,7186400,[E. F. Codd],Relational Completeness of Data Base Sublangua...,1972


In [8]:
# save to file
df_parsed.to_pickle('dblp_article_filtered.pkl')

In [None]:
# filter out single author articles
articles_multi = [df_parsed['author'].map(lambda x: len(x)) > 1]
articles_multi

In [None]:
# save to file
articles_multi.to_pickle('dblp_article_multi_author.pkl')

In [None]:
# expend the author list 
df_expand = pd.DataFrame([(author, tup.id, position, tup.year) for tup in articles_multi.itertuples() for position, author in enumerate(tup.author)],
                         columns=['author', 'publication', 'position', 'year'])

df_expand

In [None]:
# sort authors by number of publications, convert to DataFrame
unique_author, pub_count = np.unique(df_expand['author'].tolist(), return_counts=True)
author_count_map = dict(zip(unique_author, pub_count))
top_author = sorted(author_count_map.items(), key=lambda x:x[1], reverse=True)
df_authors = pd.DataFrame(top_author, columns=['author', 'number_publication'])
df_authors

In [None]:
# save to file
df_authors.to_pickle('authors.pkl')

In [None]:
# generate author pairs, convert to edge list
articles_multi['pairs'] = articles_multi['author'].map(lambda x: list(combinations(x,2)))
edge_list = pd.DataFrame([(pair[0],pair[1], tup.title, tup.year) for tup in articles_multi.itertuples() for pair in tup.pairs], columns=['source', 'target', 'title', 'year'])
edge_list

In [None]:
# save to file
edge_list.to_csv('dblp_article_duplicate_edge_list.csv', index=None)

In [None]:
# get a list of unique first name
unique_first_name_source = edge_list['source'].map(lambda x: x.split(' ')[0]).unique()
unique_first_name_target = edge_list['target'].map(lambda x: x.split(' ')[0]).unique()
unique_first_name = np.concatenate((unique_first_name_source, unique_first_name_target), axis=None)

In [None]:
# init a Genderize instance, get the list of genders
gender = Genderize()
gender_list = gender.get(unique_first_name)
df_gender = pd.DataFrame(gender_list)
df_gender.to_csv('gender_list.csv', index=None)

