In [1]:
import numpy as np
import os
import pandas as pd
import json
import networkx as nx
from tqdm import tqdm
from itertools import chain
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
import community as community_louvain

import warnings
warnings.filterwarnings("ignore")

def flatten_chain(matrix):
     return list(chain.from_iterable(matrix))

def append_row(df, row):
    return pd.concat([
                df, 
                pd.DataFrame([row], columns=row.index)]
           ).reset_index(drop=True)

def div_dict(my_dict):
    sum_p = sum(my_dict.values())
    for i in my_dict:
        my_dict[i] = float(my_dict[i]/sum_p)
    return my_dict 

def dict_max(my_dict,edge_val,else_string):
    maxval = np.max(list(my_dict.values()))
    maxkey = max(my_dict, key=my_dict.get)
    if maxval > edge_val:
        return(maxkey)
    else:
        return(else_string)
    
from nltk.corpus import stopwords

s = set(stopwords.words('english'))

folder = './scopus_publication_files_16042024'
files = os.listdir(folder)
file = files[0]

author_folder = './scopus_author_files_16042024'
author_files = os.listdir(author_folder)

df_subjectAreas = pd.read_xml('scopus_subject_classification.xml')
df_subjectAreas.set_index('code',inplace=True)
df_subject_areas_color = pd.DataFrame({'subject':list(df_subjectAreas.groupby('subject-classification').sum().index), 'color':['#5FBC52','#C1A765','#56B280','#EB704F','#FF8001','#00C400','#FF3300','#9933FF','#BC7BED','#535AE7','#FAA72E','#DFC717','#E94727','#00FF99','#CF63B0','#5B99F3','#4ABFD6','#3DE3D3','#D143E9','#A6A6A6','#CF63B0','#DB53A7','#FF5B9D','#4641FF','#C46DE7','#9358F2','#CCCCFF']})

df_names = pd.read_csv('people_in_S4S_pureFiltered_withAuthorIDs.csv',index_col=[0],keep_default_na=False).set_index('scopusID')

df_keysPerSubject = df_subjectAreas
df_keysPerSubject["count_author"] = [0] * len(df_keysPerSubject)
df_keysPerSubject["count_index"] = [0] * len(df_keysPerSubject)
df_keysPerSubject["count_paper"] = [0] * len(df_keysPerSubject)

In [2]:
authors = []
df_authors = pd.DataFrame(columns=['scopusID', 'firstname','lastname','citationCount','documentCount','publicationRange','current_affiliation_institute','current_affiliation_department','current_affiliation_country','current_affiliation_city','current_affiliation_lat','current_affiliation_lon'])
# df_co_authors = pd.DataFrame(columns=['scopusId', 'firstname','lastname','citationCount','documentCount','publicationRange','current_affiliation_institute','current_affiliation_department','current_affiliation_country','current_affiliation_city','current_affiliation_lat','current_affiliation_lon'])

for file in tqdm(author_files):
    with open(author_folder + '/' + file, encoding='utf-16') as fd:
        author_dict = json.load(fd)
    df_authors = append_row(df_authors, pd.Series(author_dict).rename({'scopusid':'scopusId'}))

df_authors['fullname'] = df_authors['firstname'] + df_authors['lastname']
df_authors.set_index('fullname',inplace=True)

df_authors['keywords'] = [[]]*len(df_authors)


100%|██████████| 144/144 [00:04<00:00, 33.10it/s]


In [178]:
keyword_list = []
author_list = []
author_keywords = {}
df_papers = pd.DataFrame(columns=['eid', 'doi', 'authorCount', 'releaseDate', 'citationCount',
       'authorNames', 'authorScopusIds', 'refCount', 'authorKeywords',
       'idxterms', 'subjectAreasDetail', 'subjectAreas'])

keyword_dict = {}

subject_list = []
for file in tqdm(files):
    with open(folder + '/' +file, encoding='utf16') as fd:
        paper_dict = json.load(fd)
    # del paper_dict['references']
    df_papers = append_row(df_papers, pd.Series(paper_dict))
    
    paper_keywords = [] 
    #filtering keywords for stopwords (e.g. the netherlands vs netherlands)
    filtered_keywords = []
    if 'authorKeywords' in paper_dict.keys():
        for word in paper_dict['authorKeywords']: 
            word = word.lower()
            term = ''
            for word_part in word.split(' '):
                if word_part not in s: 
                    term += word_part + ' '
            term = term.strip()
            filtered_keywords.append(term)

        paper_keywords.append(filtered_keywords)
    

    #index terms
    if 'idxterms' in paper_dict.keys():
        if paper_dict['idxterms'] != None:
            idxterms = paper_dict['idxterms']
            filtered_idxterms = []
            for word in idxterms: 
                word = word.lower()
                if word not in s: 
                    filtered_idxterms.append(word) 
        paper_keywords.append(filtered_idxterms)

    paper_keywords = pd.Series(flatten_chain(paper_keywords))
    paper_keywords = paper_keywords.str.lower()
    paper_keywords.drop_duplicates(inplace=True)


    keyword_list.append(paper_keywords.to_list())
    # if paper_dict['authorNames'] != None:
    #     author_list.append(paper_dict['authorNames'])

    i=0
    author_sublist = []
    if paper_dict['authorNames'] != None:
        for author, scopusId in zip(paper_dict['authorNames'], paper_dict['authorScopusIds']):
            if scopusId in df_names.index.to_list():
                name = df_names['fullname'].loc[scopusId].strip()
                author_sublist.append(name)
                if name not in author_keywords.keys():
                    author_keywords[name] = []
                    author_keywords[name].append(list(paper_keywords))
                else:
                    author_keywords[name].append(list(paper_keywords))


    for key in paper_keywords:

        if key in keyword_dict.keys():
            pass
        else:
            keyword_dict[key] = {}
            keyword_dict[key]['subject_areas_code'] = []
            keyword_dict[key]['subject_areas_detailed'] = []
            keyword_dict[key]['total_citations'] = 0
            keyword_dict[key]['num_of_papers'] = 0
            keyword_dict[key]['authors'] = []
            keyword_dict[key]['scopusIds'] = []
            keyword_dict[key]['type'] = []
            keyword_dict[key]['doi'] = []
            keyword_dict[key]['title'] = []
            keyword_dict[key]['releaseDate'] = []

        keyword_dict[key]['type'].append('author' if i < len(filtered_keywords) else 'index')

        keyword_dict[key]['doi'].append(paper_dict['doi'])
        keyword_dict[key]['title'].append(paper_dict['title'])
        keyword_dict[key]['releaseDate'].append(paper_dict['releaseDate'])

        if 'subjectAreas' in paper_dict.keys():
            for subj in paper_dict['subjectAreas']:
                subj = int(subj)
                subject_list.append(subj)
                keyword_dict[key]['subject_areas_code'].append(subj)
                # keyword_dict[key]['subject_areas_detailed'].append(df_subjectAreas.loc[subj]["detail"])

                df_keysPerSubject["count_author"] .loc[subj]+= len(filtered_keywords)
                df_keysPerSubject["count_index"].loc[subj] += len(filtered_idxterms)
                df_keysPerSubject["count_paper"].loc[subj] += 1


        if paper_dict['authorNames'] != None:
            for author, scopusId in zip(paper_dict['authorNames'],paper_dict['authorScopusIds']):
                if scopusId in df_names.index.to_list():
                    keyword_dict[key]['authors'].append(df_names['fullname'].loc[scopusId].strip())  
                    keyword_dict[key]['scopusIds'].append(scopusId)  
                    # author_sublist.append(df_names['fullname'].loc[scopusId].strip())

        keyword_dict[key]['total_citations'] += int(paper_dict['citationCount']) if paper_dict['citationCount'] != None else 0
        keyword_dict[key]['num_of_papers'] += 1
        i+=1
    author_list.append(author_sublist)
        
keyword_list_flattened = flatten_chain(keyword_list)
df_keywords = pd.DataFrame.from_dict(keyword_dict,orient='index')

df_keywords['authors'] = df_keywords['authors'].apply(lambda x: Counter(x))
df_keywords['author_names'] = df_keywords['authors'].apply(lambda x: list(x.keys()))
df_keywords['num_of_authors'] = df_keywords['authors'].apply(lambda x: len(x))

df_keywords['scopusIds'] = df_keywords['scopusIds'].apply(lambda x: Counter(x))

# df_keywords['authors'] = df_keywords['authors'].apply(lambda x: str(x).replace('{','').replace('}','').replace('Counter','').replace('(','').replace(')','').replace("'",''))
# df_keywords['scopusIds'] = df_keywords['scopusIds'].apply(lambda x: str(x).replace('{','').replace('}','').replace('Counter','').replace('(','').replace(')','').replace("'",''))

# df_keywords['papers_per_author'] = df_keywords['num_of_authors']/df_keywords['num_of_papers']
# df_keywords['relevance'] = df_keywords['num_of_authors']/max(df_keywords['num_of_authors'])   / df_keywords['num_of_papers']/max(df_keywords['num_of_papers'])

# df_keywords['subjects_detail_counted'] = df_keywords['subject_areas_detailed'].apply(lambda x: Counter(x))

df_keywords['type'] = df_keywords['type'].apply(lambda x: Counter(x))


df_keywords['main_type'] = df_keywords["type"].apply(lambda x: div_dict(dict(x)))
df_keywords['main_type'] = df_keywords['main_type'].apply(lambda x: dict_max(x,0.5,'None'))

# df_keywords['type'] = df_keywords['type'].apply(lambda x: str(x).replace('{','').replace('}','').replace('Counter','').replace('(','').replace(')','').replace("'",''))


df_authors['keywords'] = author_keywords



100%|██████████| 2198/2198 [00:39<00:00, 55.56it/s]


In [179]:
df_papers['releaseDate_formatted'] = pd.to_datetime(df_papers['releaseDate'], format='mixed', dayfirst=True)


In [180]:
df_authors['documentCount'] = df_authors['publishedArticles'].apply(lambda x: len(x))

In [181]:
sum(df_authors['documentCount'])

2507

In [19]:
counted_keywords = pd.DataFrame.from_dict(dict(Counter(keyword_list_flattened)),orient='index',columns=['counted'])

keywords_below = counted_keywords[counted_keywords['counted'] < 2].index.to_list()
keyword_list_reduced = keyword_list.copy()

for key in tqdm(keywords_below):
    for i in range(len(keyword_list_reduced)):
        try:
            keyword_list_reduced[i].remove(key)
        except:
            pass
        
keyword_list_reduced_flattened = flatten_chain(keyword_list_reduced)
df_keywords = df_keywords.join(counted_keywords)

  0%|          | 0/8791 [00:00<?, ?it/s]

100%|██████████| 8791/8791 [00:14<00:00, 616.27it/s]


In [20]:
keyword_nodes = list(df_keywords[df_keywords['counted'] >= 2].index)
author_nodes = df_keywords[df_keywords['counted'] >= 2]['authors'].apply(lambda x: list(x.keys()))
authors = []
for i in author_nodes.index:
    for auth in author_nodes.loc[i]:
        if auth in authors:
            pass
        else:
            authors.append(auth)
author_nodes = authors

nodes = keyword_nodes + author_nodes



df_keyword_nodes = pd.DataFrame({'node_id':keyword_nodes, 'type':['keyword']*len(keyword_nodes)})
df_author_nodes = pd.DataFrame({'node_id':author_nodes, 'type':['author']*len(author_nodes)})

df_nodes = pd.concat([df_keyword_nodes,df_author_nodes])
df_nodes.set_index('node_id',inplace=True, drop=False)

In [21]:
adj_matrix = np.zeros(shape=(len(nodes),len(nodes)))
adj_matrix = pd.DataFrame(adj_matrix, index=nodes, columns=nodes)

i=0
for key in tqdm(nodes):

    if key in keyword_nodes:
        for sublist in keyword_list_reduced:
            if key in sublist:
                for second_key in sublist:
                        adj_matrix[key][second_key] += 1
                        
    elif key in author_nodes:
        for sublist in author_list:
            if key in sublist:
                for second_key in sublist:
                        try: #this is strange
                            adj_matrix[key][second_key] += 1
                        except:
                            pass


for name in tqdm(df_authors.index):
    try:
        counted = Counter(flatten_chain(df_authors['keywords'].loc[name]))
    except:
        continue
    for key,val in counted.items():
        try:
            adj_matrix[name][key] = val
            adj_matrix[key][name] = val
        except:
            continue

np.fill_diagonal(adj_matrix.values, 0)

100%|██████████| 2152/2152 [00:11<00:00, 188.48it/s]
100%|██████████| 144/144 [00:08<00:00, 17.15it/s]


In [91]:
adj_matrix_authors = adj_matrix.loc[author_nodes[0]:,:author_nodes[0]]
df_edges_authors = adj_matrix_authors.rename_axis('source')\
  .reset_index()\
  .melt('source', value_name='weight', var_name='target')\
  .query('source != target')\
  .reset_index(drop=True)
df_edges_authors = df_edges_authors[df_edges_authors['weight']>0].reset_index()
df_edges_authors.drop('index',axis=1,inplace=True)
df_edges_authors.set_index('target',inplace=True)

In [183]:
adj_matrix_tmps = adj_matrix.loc[author_nodes[0]:,author_nodes[0]:]
df_edges_tmp = adj_matrix_tmps.rename_axis('source')\
  .reset_index()\
  .melt('source', value_name='weight', var_name='target')\
  .query('source != target')\
  .reset_index(drop=True)
df_edges_tmp = df_edges_tmp[df_edges_tmp['weight']>0].reset_index()
df_edges_tmp.drop('index',axis=1,inplace=True)
# df_edges_tmp.set_index('target',inplace=True)

In [186]:
sum(df_edges_tmp['weight'])

680.0

In [22]:
df_edges = adj_matrix.rename_axis('source')\
  .reset_index()\
  .melt('source', value_name='weight', var_name='target')\
  .query('source != target')\
  .reset_index(drop=True)
df_edges = df_edges[df_edges['weight']>0].reset_index()
df_edges.drop('index',axis=1,inplace=True)

In [23]:
grouped_source_length = df_edges.groupby(['source']).size()
grouped_target_length = df_edges.groupby(['target']).size()

added_jaccard = []
for i in tqdm(range(len(df_edges))):
    added_jaccard.append(df_edges['weight'].loc[i]/grouped_source_length.loc[df_edges['source'].loc[i]] + df_edges['weight'].loc[i]/grouped_source_length.loc[df_edges['target'].loc[i]])
df_edges['added_jaccard'] = added_jaccard

  0%|          | 0/40774 [00:00<?, ?it/s]

100%|██████████| 40774/40774 [00:02<00:00, 17935.91it/s]


In [24]:
grouped_sum = df_edges.groupby(['source']).sum()['added_jaccard']
grouped_length = df_edges.groupby(['source']).size()

i=0
df_edges['added_jaccard_sum'] = [0]*len(df_edges)
df_edges['length'] = [0]*len(df_edges)

for key in tqdm(df_edges['source']):
    df_edges['added_jaccard_sum'].loc[i] = grouped_sum[key]
    df_edges['length'].loc[i] = grouped_length[key]
    i+=1

df_edges['border_length'] = df_edges['length'].apply(lambda x: int(x) - int(x*0.3))

indices = []
for key in tqdm(list(grouped_length.index)):
    temp = df_edges[df_edges['source'] == key]
    if grouped_length[key] < 2:
        pass
    else:
        temp.sort_values('added_jaccard',inplace=True)
    indices.append(list(temp.iloc[:int(temp['border_length'].mean())].index))
    
indices = flatten_chain(indices)
df_edges_reduced = df_edges.drop(indices)
# # df_edges_reduced = df_edges.drop(df_edges.loc[(df_edges['added_jaccard'] < df_edges['added_jaccard_sum'] * 0.2)].index)

del df_edges_reduced["weight"]
df_edges_reduced.rename({"added_jaccard":'weight'}, axis=1, inplace=True)

  0%|          | 0/40774 [00:00<?, ?it/s]

100%|██████████| 40774/40774 [00:15<00:00, 2558.13it/s]
100%|██████████| 2152/2152 [00:08<00:00, 240.29it/s]


In [25]:
del df_edges["weight"]
df_edges.rename({"added_jaccard":'weight'}, axis=1, inplace=True)

In [65]:
df_names_copy = df_names.reset_index(drop=False)
df_names_copy.set_index('fullname',inplace=True)

In [67]:
df_names_copy.loc[list(df_edges_authors.loc['land use']['source'])]['scopusID']

fullname
Andries Hof        12238851300
Ine Dorresteijn    55225962900
Jana Eichel        55796642600
Name: scopusID, dtype: object

In [73]:
t = 'key chain'
key=''
for part in t.split(' '):
        part = part[0].upper() + part[1:]
        print(part)
        key += part + '+'

key = key[:-1]
print(key)

Key
Chain
Key+Chain


In [133]:
df_edges_authors.loc['land use']['source']

target
land use        Andries Hof
land use    Ine Dorresteijn
land use        Jana Eichel
Name: source, dtype: object

In [136]:
isinstance(df_edges_authors.loc['integrated assessment modelling']['source'], pd.Series)

False

In [135]:
keyword_nodes[3]

'integrated assessment modelling'

In [162]:
def f2(x):
    if isinstance(df_edges_authors.loc[x]['source'],  pd.Series) == True:
        y = df_names_copy.loc[list(df_edges_authors.loc[x]['source'])]['scopusID'].to_list()
    else:
        y = []
        tmp = df_names_copy.loc[df_edges_authors.loc[x]['source']]['scopusID']#
        y.append(tmp)
    return y
    

def query_keyword(key, list1):
    #list1 : list of author scopus ids
    #key: keyword
    key1=''
    if key != '':
        for part in key.split(' '):
            part = part[0].upper() + part[1:]
            key1 += part + '+'

        key1 = key1[:-1]
    else:
        key1 = ''
    string = f"https://www.scopus.com/results/results.uri?sort=plf-f&src=s&sid=f5247eb0cdb48d4556ef7a2ddb1892f5&sot=a&sdt=cl&cluster=scopubyr%2C%222020%22%2Ct%2C%222021%22%2Ct%2C%222022%22%2Ct%2C%222023%22%2Ct%2C%222024%22%2Ct%2Bscoexactkeywords%2C%22{key1}%22%2Ct&sl=40&s="
    for auth in list1:
        string += f"AU-ID%28{auth}%29+OR+"
    string = string[:-4]
    return string

In [None]:
%2c%222024%22%2ct
%2c%222023%22%2ct
%2c%222022%22%2ct
%2c%222021%22%2ct
%2c%222020%22%2ct

In [166]:
df_nodes['View_author_profile'] = ['--']*len(df_nodes)
df_nodes['View_author_profile'].loc[author_nodes] = df_authors['scopusID'].loc[author_nodes].apply(lambda x: f'https://www.scopus.com/authid/detail.uri?authorId={x}')


df_nodes['View_publications'] = ['--']*len(df_nodes)
df_nodes['View_publications'].loc[author_nodes] = df_authors['scopusID'].loc[author_nodes].apply(lambda x: f'https://www.scopus.com/results/results.uri?sort=plf-f&src=s&nlo=&nlr=&nls=&sid=38e25376e6563d5f8d3298de27536bcc&sot=aut&sdt=cl&cluster=scopubyr%2c%222024%22%2ct%2c%222023%22%2ct%2c%222022%22%2ct%2c%222021%22%2ct%2c%222020%22%2ct&sl=17&s=AU-ID%28{x}%29&origin=resultslist&zone=leftSideBar&editSaveSearch=&txGid=1ca2a667cd972701106c4e8248d84d66')

test  = pd.Series(keyword_nodes).apply(lambda x:f2(x))
urls = []
i=0
for keyword in keyword_nodes:
    urls.append(query_keyword(keyword, test[i]))
    i+=1
df_nodes['View_publications'].loc[keyword_nodes] = urls 



def f1(d1):  
    try:
        v = list(d1.values())
        k = list(d1.keys())
        maxkey = int(k[v.index(max(v))])
    except:
        maxkey=1000
    return maxkey

main_subject_detailed_authors = df_subjectAreas.loc[list(df_authors['subjectAreaCount_detailed'].loc[author_nodes].apply(lambda x: f1(x)))]['detail']
main_subject_general_authors = df_subjectAreas.loc[list(df_authors['subjectAreaCount_detailed'].loc[author_nodes].apply(lambda x: f1(x)))]['subject-classification']

main_subject_detailed_keywords = df_subjectAreas.loc[list(df_keywords['subject_areas_code'].loc[keyword_nodes].apply(lambda x: f1(Counter(x))))]['detail']
main_subject_general_keywords = df_subjectAreas.loc[list(df_keywords['subject_areas_code'].loc[keyword_nodes].apply(lambda x: f1(Counter(x))))]['subject-classification']


df_nodes['main_subject'] = [0]*len(df_nodes)
df_nodes['main_subject'].loc[author_nodes] = list(main_subject_general_authors)
df_nodes['main_subject'].loc[keyword_nodes] = list(main_subject_general_keywords)

df_nodes['detailed_main_subject'] = [0]*len(df_nodes)
df_nodes['detailed_main_subject'].loc[author_nodes] = list(main_subject_detailed_authors)
df_nodes['detailed_main_subject'].loc[keyword_nodes] = list(main_subject_detailed_keywords)

df_nodes['total_citations'] = [0]*len(df_nodes)
df_nodes['total_citations'].loc[keyword_nodes] = df_keywords['total_citations'].loc[keyword_nodes]
df_nodes['total_citations'].loc[author_nodes] = df_authors['citationCount'].loc[author_nodes]

df_nodes['num_of_papers'] = [0]*len(df_nodes)
df_nodes['num_of_papers'].loc[keyword_nodes] = df_keywords['num_of_papers'].loc[keyword_nodes]
df_nodes['num_of_papers'].loc[author_nodes] = df_authors['publishedArticles'].apply(lambda x: len(x)).loc[author_nodes]

df_nodes['num_of_authors'] = [0]*len(df_nodes)
df_nodes['num_of_authors'].loc[keyword_nodes] = df_keywords['num_of_authors'].loc[keyword_nodes]


In [169]:
network_graph = nx.from_pandas_edgelist(df_edges, source='source', target='target', edge_attr=['weight'])
network_graph.add_nodes_from((n, dict(d)) for n, d in df_nodes.iterrows())

#community detection using louvain algorithm
partion = community_louvain.best_partition(network_graph)
nx.set_node_attributes(network_graph, partion, 'community')

nx.write_gexf(network_graph,'network_testing_18042024.gexf')
