1° Progetto Social Computing

Let's import the needed libraries, read the .csv file and set the SERP API key 

In [22]:
import json
import os

import networkx as nx
import numpy as np
import pandas as pd
import pyvis as pv
from serpapi import GoogleScholarSearch

# Configuration: SerpApi credentials and the input table.
# The API key is a secret, so it is read from the SERP_API_KEY environment variable
# instead of being committed in the notebook (a key that was committed in the past
# should be revoked and regenerated).
serp_api_key = os.environ.get("SERP_API_KEY")
if not serp_api_key:
    raise RuntimeError("Set the SERP_API_KEY environment variable before running this notebook")
GoogleScholarSearch.SERP_API_KEY = serp_api_key

# '/' is accepted as a path separator on Windows as well as on Linux/macOS,
# whereas a literal backslash only works on Windows.
df = pd.read_csv("data/nodes.csv")  # read nodes.csv

For each row in the dataframe, we query Google Scholar for the profiles matching the person's name.
We save the query responses into 'result' and the affiliations into 'aff'.

In [23]:
# Run one Google Scholar profile search per row of nodes.csv.
search = []  # GoogleScholarSearch objects, one per row
result = []  # query responses as dictionaries, aligned with `search`
aff = []  # save the affiliations from nodes.csv
name_authors = []
name_coauthors_per_author = []  # will be used later, for now we store the authors names
for row_label, row in df.iterrows():
    params = {"engine": "google_scholar_profiles", "hl": 'en', "mauthors": row['name']}
    profile_search = GoogleScholarSearch(params)
    search.append(profile_search)  # keep the query object of every author
    # Call get_dict() on the object just created rather than on search[row_label]:
    # iterrows() yields index *labels*, which only coincide with list positions
    # when the dataframe has a default 0..n-1 index.
    result.append(profile_search.get_dict())  # transform the json into a dictionary for each query
    aff.append(row['affiliations'])
    name_authors.append(row['name'])
    name_coauthors_per_author.append([row['name']])

Now, for every query response stored in 'result' (outer for loop) and for each profile inside that response (inner for loop), we look for the profile we want (the one whose affiliation matches the affiliation in nodes.csv) and store its author_id, cited_by and interests.

In [None]:
# For every query response, pick the profile whose affiliation matches nodes.csv and
# record its author_id, cited_by and interests. Exactly one entry is appended per
# response, so the three lists stay aligned with the rows of the dataframe.
author = []
cited = []
interests = []
for index_result, value_result in enumerate(result):  # for each json I retrieve the data I need
    wanted_affiliation = aff[index_result]

    # Take the first profile with the expected affiliation; stopping at the first
    # match prevents a duplicated profile from adding extra entries to the lists.
    matching_profile = None
    for value_profiles in value_result.get('profiles', []):  # for each profile inside of a single json
        if value_profiles.get('affiliations') == wanted_affiliation:
            matching_profile = value_profiles
            break

    if matching_profile is None:
        # Fail here with a clear message instead of silently producing lists that
        # are shorter than the dataframe they are later assigned to.
        raise LookupError(
            f"Query #{index_result}: no profile with affiliation {wanted_affiliation!r} was found"
        )

    author.append(matching_profile['author_id'])
    # 'cited_by' and 'interests' are treated as optional fields of a profile.
    cited.append(matching_profile.get('cited_by'))
    # interests can be multiple for every person, so they are grouped by person
    interests.append([interest['title'] for interest in matching_profile.get('interests', [])])

Let's update the nodes.csv file with new columns with data retrieved from the queries

In [None]:
# Attach the data retrieved from the queries as new columns. The three lists are
# expected to hold one entry per row of `df`, in row order.
df['author_id'] = author
df['cited_by'] = cited
df['interests'] = interests
# Overwrites the input file. '/' is used as separator so the path also resolves
# on Linux/macOS (it is equally valid on Windows).
df.to_csv("data/nodes.csv", index=False)

Let's retrieve the list of the coauthors for each person in nodes.csv

In [None]:
# Query the Google Scholar author page of every author to obtain their coauthors.
search = []
result = []
coauthors = []  # one list of coauthor entries per author, aligned with `author`
for value in author:
    params = {"engine": "google_scholar_author", "hl": 'en', "author_id": value}  # searching with the author_id
    author_search = GoogleScholarSearch(params)
    search.append(author_search)
    author_data = author_search.get_dict()
    result.append(author_data)
    # A response without a 'co_authors' entry yields an empty list instead of a
    # KeyError, so `coauthors` keeps one item per author.
    coauthors.append(author_data.get('co_authors', []))  # list of coauthors for every author

For each of the 7 authors, we take the coauthor list downloaded from their profile (via id) and extract the coauthors' names, grouped by author.

In [None]:
# Replace each author's placeholder entry with the names of that author's coauthors.
for position, coauthor_entries in enumerate(coauthors):  # one list of entries per author
    name_coauthors_per_author[position] = [
        entry['name'] for entry in coauthor_entries  # keep only the coauthor's name
    ]
    

We use the names of the coauthors to look for researchers matching that name.

In [None]:
# Run one Google Scholar profile search per coauthor name, across all authors.
search = []
result = []
for coauthor_names in name_coauthors_per_author:  # for each coauthor list of an author
    for coauthor_name in coauthor_names:  # for each coauthor of a coauthor list
        params = {"engine": "google_scholar_profiles", "hl": 'en', "mauthors": coauthor_name}
        coauthor_search = GoogleScholarSearch(params)
        search.append(coauthor_search)
        # `search` is a flat list spanning all authors, so the position inside the
        # current coauthor list must not be used to index it (that would re-read the
        # first author's searches). Use the search object just created instead.
        result.append(coauthor_search.get_dict())

With the list of all the coauthors, we select for each coauthor the first result (with index 0) and take all the information requested.

In [None]:
# For every coauthor query, keep the data of the first profile returned.
# The five lists are filled together, so they always have the same length.
author = []
name = []
affiliations = []
cited = []
interests = []
for value_result in result:  # for each json I retrieve the data I need
    profiles = value_result.get('profiles')
    if not profiles:
        # The query returned no profile: there is nothing to record for this coauthor.
        continue

    first_profile = profiles[0]  # take the first profile appearing on the query result
    author.append(first_profile['author_id'])
    name.append(first_profile['name'])
    affiliations.append(first_profile['affiliations'])
    # 'cited_by' is treated as optional, like 'interests' below.
    cited.append(first_profile.get('cited_by'))

    # list where each item is a list of interests of the specific coauthor
    # (empty when the coauthor has no interests)
    interests.append([interest['title'] for interest in first_profile.get('interests', [])])

We create a dataframe of coauthors through a dictionary

In [None]:
# Assemble the coauthor table column by column, then keep a single row per author_id.
data = dict(
    name=name,
    affiliations=affiliations,
    author_id=author,
    cited_by=cited,
    interests=interests,
)

df2 = pd.DataFrame(data).drop_duplicates(subset='author_id')  # delete the duplicates

We concatenate authors and coauthors in the same dataframe

In [None]:
# Reload the authors saved earlier and append the coauthors, one row per author_id.
# '/' is used as path separator so the notebook also runs on Linux/macOS
# (it is equally valid on Windows).
df1 = pd.read_csv("data/nodes.csv")
df = pd.concat([df1, df2])
df = df.drop_duplicates(subset='author_id')  # an author may also appear among the coauthors
df.to_csv("data/nodes.csv", index=False)  # overwrite nodes.csv with authors and coauthors combined

We create a new dataframe representing the edges between author-coauthor, both ways

In [None]:
# Build the edge list: one row per (author, coauthor) pair.
author1 = []  # 1st column dataframe: the author
author2 = []  # 2nd column dataframe: one of the author's coauthors

# name_authors and name_coauthors_per_author are filled together, one entry per
# author, so they can be walked in parallel.
for author_name, coauthor_names in zip(name_authors, name_coauthors_per_author):
    for coauthor in coauthor_names:  # for every coauthor in the coauthor list
        author1.append(author_name)
        author2.append(coauthor)

data = {
    'author1': author1,
    'author2': author2
}
df3 = pd.DataFrame(data)  # create a new dataframe for the edges
# '/' is used as path separator so the file is also written correctly on
# Linux/macOS (it is equally valid on Windows).
df3.to_csv("data/edges.csv", index=False)