# Libraries

In [1]:
# Google Scholar
# https://scholar.google.com/citations?user=sKSTKAoAAAAJ&hl=en
# https://scholar.google.com/citations?view_op=view_org&hl=en&org=1896398670060433590

import pickle
import numpy as np
import pandas as pd

from scholarly import scholarly
# https://github.com/scholarly-python-package/scholarly
# https://www.scraperapi.com/blog/best-google-scholar-apis-proxies/
from scholarly import ProxyGenerator

from random import randint
from time import sleep

from collections import Counter
from itertools import combinations

import json


# Import Datasets

In [2]:
# Faculty roster; later cells read its 'Title' (faculty name) and 'Scholar ID' columns.
df_listFaculty = pd.read_csv(filepath_or_buffer='df_listFaculty.csv')
df_listFaculty


Unnamed: 0,ID,Title,Area,Scholar ID
0,1762,Amir Yacoby,Applied Physics,slFLfLYAAAAJ
1,2626,Ann Pearson,Environmental Science & Engineering,Hwk4sP0AAAAJ
2,1795,Ariel Amir,Applied Mathematics,
3,13261,Ariel Procaccia,Computer Science,8ZpV-lkAAAAJ
4,1523,Barbara J. Grosz,Computer Science,
...,...,...,...,...
120,1713,Yaron Singer,Computer Science,j-MBXNMAAAAJ
121,1519,Yiling Chen,Computer Science,x_7xA0UAAAAJ
122,1651,Yue Lu,Electrical Engineering,84FUioAAAAAJ
123,1553,Zhigang Suo,Materials Science & Mechanical Engineering,GlBl-VUAAAAJ


In [3]:
# Collaboration pairs flagged as incorrect: 'Title' is one faculty member and
# 'Incorrect Collaboration' is the other member of the flagged pair.
df_listIncorrectCollabs = pd.read_csv(filepath_or_buffer='df_listIncorrectCollabs.csv')
df_listIncorrectCollabs


Unnamed: 0,Title,Incorrect Collaboration
0,Ann Pearson,Daniel P. Schrag
1,Ariel Procaccia,Finale Doshi-Velez
2,Cynthia Friend,Efthimios Kaxiras
3,Cynthia Friend,Eric Mazur
4,Cynthia Friend,Scot T. Martin
...,...,...
60,Michael J. Aziz,Joanna Aizenberg
61,Philip Kim,Radhika Nagpal
62,Steven C. Wofsy,David A. Weitz
63,Steven C. Wofsy,Donald Elliot Ingber


In [4]:
# Load the cached list of per-faculty records used by the cells below
# (presumably the saved output of an earlier scholarly scraping run -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'output_scholarly.pkl' if it comes from a trusted source.
# The handle is deliberately not called `input`, which would shadow the
# builtin input() for every later cell in the notebook.
with open('output_scholarly.pkl', 'rb') as pkl_file:
    list_scholarly = pickle.load(pkl_file)
    

# Generate Collaboration Data

### Create dictionary map between faculty name and Google Scholar ID

In [5]:
# Lookup table: Google Scholar ID -> faculty name.
# If the same ID value occurs on several rows, the last row wins.
map_facultyID = {
    scholar_id: faculty_name
    for scholar_id, faculty_name in zip(df_listFaculty['Scholar ID'], df_listFaculty['Title'])
}
map_facultyID


{'slFLfLYAAAAJ': 'Amir Yacoby',
 'Hwk4sP0AAAAJ': 'Ann Pearson',
 'None': 'Zhiming Kuang',
 '8ZpV-lkAAAAJ': 'Ariel Procaccia',
 'I0fbJ6cAAAAJ': 'Boaz Barak',
 '9I4ltOIAAAAJ': 'Boris Kozinsky',
 'fABXPQQAAAAJ': 'Brian F. Farrell',
 'veDLTPEAAAAJ': 'Cengiz Pehlevan',
 '-XVcP5QAAAAJ': 'Christin Y. Sander',
 'IS_xUuIAAAAJ': 'Christopher Rycroft',
 'sKSTKAoAAAAJ': 'Conor J Walsh',
 'pxeyQ_QAAAAJ': 'Cynthia Dwork',
 'qmEw4LoAAAAJ': 'Cynthia Friend',
 '5C1bJKkAAAAJ': 'Daniel J. Jacob',
 'fKQ4QLYAAAAJ': 'Daniel P. Schrag',
 'R1cfTCoAAAAJ': 'David A. Weitz',
 'vXHA_XYAAAAJ': 'David Brooks',
 'tCCmmgQAAAAJ': 'David C. Bell',
 'wC0M5uoAAAAJ': 'David Clarke',
 'PfciJkgAAAAJ': 'David Keith',
 'ZflJqeUAAAAJ': 'David Mooney',
 'JUn8PgwAAAAJ': 'David Parkes',
 'jGboazkAAAAJ': 'David R. Nelson',
 'qHiACEgAAAAJ': 'Demba Ba',
 'UEgdNEoAAAAJ': 'Doeke Romke Hekstra',
 '3hzhsK4AAAAJ': 'Donald Elliot Ingber',
 'VMihW8oAAAAJ': 'Eddie Kohler',
 's9cO1fUAAAAJ': 'Efthimios Kaxiras',
 'C_r8d0AAAAAJ': 'Elena Leah G

### Create faculty links based on shared publications

In [6]:
# Map every publication title to the list of faculty whose profile lists it.
dict_pubs = {}

for profile in list_scholarly:
    # Entries equal to the string 'None' carry no profile data and are skipped
    # (presumably faculty without a Google Scholar ID -- TODO confirm).
    if profile == 'None':
        continue

    author = map_facultyID[profile['scholar_id']]

    for pub in profile['publications']:
        pub_title = pub['bib']['title']

        # setdefault never replaces an existing list, so a title that shows up
        # twice on the same profile can no longer reset the entry to [author]
        # and silently drop the co-authors that were recorded earlier.
        title_authors = dict_pubs.setdefault(pub_title, [])
        if author not in title_authors:
            title_authors.append(author)
                

In [7]:
# Keep only the titles that are listed by more than one faculty member.
dict_pubs_filter1 = dict(
    filter(lambda title_and_authors: len(title_and_authors[1]) > 1, dict_pubs.items())
)
print(len(dict_pubs_filter1))
print(list(dict_pubs_filter1.items())[:20])


1141
[('A robust scanning diamond sensor for nanoscale imaging with single nitrogen-vacancy centres', ['Amir Yacoby', 'Marko Loncar']), ('Integrated diamond networks for quantum nanophotonics', ['Amir Yacoby', 'Marko Loncar']), ('Enhanced single-photon emission from a diamond–silver aperture', ['Amir Yacoby', 'Marko Loncar']), ('Coherent optical transitions in implanted nitrogen vacancy centers', ['Amir Yacoby', 'Marko Loncar']), ('Inducing superconducting correlation in quantum Hall edge states', ['Amir Yacoby', 'Philip Kim']), ('Single-color centers implanted in diamond nanostructures', ['Amir Yacoby', 'Marko Loncar']), ('Imaging viscous flow of the Dirac fluid in graphene', ['Amir Yacoby', 'Philip Kim']), ('Atomically precise, custom-design origami graphene nanostructures', ['Amir Yacoby', 'Philip Kim']), ('Synthetic diamond materials for quantum and optical applications and methods of making the same', ['Amir Yacoby', 'Marko Loncar']), ('A robust, scanning quantum system for nanosc

In [8]:
# Count, for every unordered pair of faculty, how many publication titles they share.
authors_2 = Counter()

for authors in dict_pubs_filter1.values():
    # sorted() returns a new list, so the author lists stored in
    # dict_pubs_filter1 (the same list objects held by dict_pubs) are no longer
    # reordered as a side effect of running this cell. Sorting gives each pair a
    # single canonical (alphabetical) tuple key.
    authors_2.update(combinations(sorted(authors), 2))

# Twenty most frequent collaborating pairs.
authors_2.most_common(20)


[(('David Brooks', 'Gu-Yeon Wei'), 153),
 (('Daniel J. Jacob', 'Steven C. Wofsy'), 60),
 (('Federico Capasso', 'Marko Loncar'), 55),
 (('Conor J Walsh', 'Robert J. Wood'), 50),
 (('Joost J. Vlassak', 'Zhigang Suo'), 46),
 (('Cynthia Friend', 'Efthimios Kaxiras'), 44),
 (('Eric Mazur', 'Marko Loncar'), 30),
 (('Daniel J. Jacob', 'Elsie M Sunderland'), 28),
 (('David Brooks', 'Vijay Janapa Reddi'), 24),
 (('Donald Elliot Ingber', 'Kit Parker'), 23),
 (('Jerry X. Mitrovica', 'Peter John Huybers'), 23),
 (('Gu-Yeon Wei', 'Vijay Janapa Reddi'), 21),
 (('Amir Yacoby', 'Philip Kim'), 19),
 (('Radhika Nagpal', 'Robert J. Wood'), 19),
 (('David Mooney', 'Donald Elliot Ingber'), 19),
 (('Conor J Walsh', 'David Mooney'), 18),
 (('Joanna Aizenberg', 'Marko Loncar'), 16),
 (('Gu-Yeon Wei', 'Robert J. Wood'), 14),
 (('David Clarke', 'Robert J. Wood'), 14),
 (('Joanna Aizenberg', 'Katia Bertoldi'), 14)]

In [35]:
# Sanity check: every name in the incorrect-collaboration table must match a
# faculty name exactly; anything that does not is reported as a likely typo.
list_faculty = list(df_listFaculty['Title'])

flagged_rows = zip(
    df_listIncorrectCollabs['Title'],
    df_listIncorrectCollabs['Incorrect Collaboration'],
)
for author1, author2 in flagged_rows:
    # Check the reporting faculty first, then the flagged collaborator.
    for candidate in (author1, author2):
        if candidate not in list_faculty:
            print('TYPO: ', candidate)
        

In [29]:
# One alphabetically ordered [name, name] pair per row of the incorrect-collaboration
# table, so that A->B and B->A reports produce the same pair.
list_pairs = [
    sorted([reporter, collaborator])
    for reporter, collaborator in zip(
        df_listIncorrectCollabs['Title'],
        df_listIncorrectCollabs['Incorrect Collaboration'],
    )
]
list_pairs


[['Ann Pearson', 'Daniel P. Schrag'],
 ['Ariel Procaccia', 'Finale Doshi-Velez'],
 ['Cynthia Friend', 'Efthimios Kaxiras'],
 ['Cynthia Friend', 'Eric Mazur'],
 ['Cynthia Friend', 'Scot T. Martin'],
 ['Cynthia Friend', 'Joost J. Vlassak'],
 ['Cynthia Friend', 'Michael J. Aziz'],
 ['Ann Pearson', 'Daniel P. Schrag'],
 ['Daniel J. Jacob', 'Daniel P. Schrag'],
 ['Daniel P. Schrag', 'Steven C. Wofsy'],
 ['David Parkes', 'Milind Tambe'],
 ['David Parkes', 'Stuart M. Shieber'],
 ['David Parkes', 'Finale Doshi-Velez'],
 ['Doeke Romke Hekstra', 'Efthimios Kaxiras'],
 ['Amir Yacoby', 'Doeke Romke Hekstra'],
 ['David A. Weitz', 'Doeke Romke Hekstra'],
 ['Doeke Romke Hekstra', 'Jennifer Lewis'],
 ['Doeke Romke Hekstra', 'Kit Parker'],
 ['Cynthia Friend', 'Efthimios Kaxiras'],
 ['Efthimios Kaxiras', 'Joost J. Vlassak'],
 ['Efthimios Kaxiras', 'Zhigang Suo'],
 ['Efthimios Kaxiras', 'Hanspeter Pfister'],
 ['Boris Kozinsky', 'Efthimios Kaxiras'],
 ['Efthimios Kaxiras', 'Joanna Aizenberg'],
 ['Efthimio

In [50]:
# For each flagged pair, derive three values (one entry per row, in row order):
#   list_bothMarked    - True when the same pair occurs more than once in list_pairs
#   list_numJointPubs  - number of shared publication titles counted in authors_2
#   list_googleScholar - a Google Scholar search URL for the two names
list_bothMarked = []
list_numJointPubs = []
list_googleScholar = []

# Count every pair once up front instead of calling list_pairs.count(pair)
# inside the loop, which rescans the whole list for each row (quadratic).
# Pairs are lists, so they are converted to tuples to be hashable.
pair_occurrences = Counter(tuple(pair) for pair in list_pairs)

for pair in list_pairs:
    list_bothMarked.append(pair_occurrences[tuple(pair)] > 1)

    # authors_2 is keyed by alphabetically ordered tuples; a missing key counts as 0.
    list_numJointPubs.append(authors_2[(pair[0], pair[1])])

    # The query uses only the first and last token of each name, plus 'Harvard'.
    # NOTE(review): the tokens are inserted without URL-encoding; fine for plain
    # ASCII names, but a name containing e.g. '&' or '#' would break the query.
    name_tokens = [name.split() for name in pair]
    faculty1 = name_tokens[0][0] + '+' + name_tokens[0][-1]
    faculty2 = name_tokens[1][0] + '+' + name_tokens[1][-1]
    list_googleScholar.append(
        'https://scholar.google.com/scholar?q=' + faculty1 + '+' + faculty2 + '+Harvard'
    )


In [55]:
# Attach the three derived lists as new columns (same order as before).
derived_columns = {
    'Both Marked Incorrect': list_bothMarked,
    'Number of Joint Publications': list_numJointPubs,
    'Google Scholar': list_googleScholar,
}
for column_name, column_values in derived_columns.items():
    df_listIncorrectCollabs[column_name] = column_values

df_listIncorrectCollabs


Unnamed: 0,Title,Incorrect Collaboration,Both Marked Incorrect,Number of Joint Publications,Google Scholar
0,Ann Pearson,Daniel P. Schrag,True,2,https://scholar.google.com/scholar?q=Ann+Pears...
1,Ariel Procaccia,Finale Doshi-Velez,False,1,https://scholar.google.com/scholar?q=Ariel+Pro...
2,Cynthia Friend,Efthimios Kaxiras,True,44,https://scholar.google.com/scholar?q=Cynthia+F...
3,Cynthia Friend,Eric Mazur,False,6,https://scholar.google.com/scholar?q=Cynthia+F...
4,Cynthia Friend,Scot T. Martin,False,3,https://scholar.google.com/scholar?q=Cynthia+F...
...,...,...,...,...,...
60,Michael J. Aziz,Joanna Aizenberg,True,1,https://scholar.google.com/scholar?q=Joanna+Ai...
61,Philip Kim,Radhika Nagpal,False,1,https://scholar.google.com/scholar?q=Philip+Ki...
62,Steven C. Wofsy,David A. Weitz,False,1,https://scholar.google.com/scholar?q=David+Wei...
63,Steven C. Wofsy,Donald Elliot Ingber,False,1,https://scholar.google.com/scholar?q=Donald+In...


In [56]:
# Save the annotated table; index=False leaves the row index out of the file.
df_listIncorrectCollabs.to_csv(path_or_buf='list_incorrectCollabs.csv', index=False)
