# Module 2 Assignment

## Processing Data

In this section we load the data and prepare it for network analysis (NA) and natural language processing (NLP).

### Importing Packages and Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the two Scopus exports; both are comma-separated CSV files
scopus2022 = pd.read_csv('scopus 2022 2021.csv', sep=',')
scopus = pd.read_csv('scopus.csv', sep=',')

In [None]:
# Stack the two Scopus exports into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
data = pd.concat([scopus2022, scopus], ignore_index=True)

In [None]:
# Summary of the combined frame: column names, non-null counts and dtypes
data.info()

In [None]:
# List the available column names (used to pick the NA and NLP subsets below)
data.columns

In [None]:
# Keep only the columns that the network analysis works with
data_NA = data[[
    'Authors',
    'Author(s) ID',
    'Title',
    'Year',
    'Affiliations',
    'Cited by',
    'References',
    'Source title',
]]

# Preview the first rows of the network-analysis subset
data_NA.head()

In [None]:
# Non-null counts and dtypes of the network-analysis subset
data_NA.info()

In [None]:
# Show the rows whose 'Author(s) ID' value is missing
data_NA[data_NA['Author(s) ID'].isna()]

In [None]:
# Drop rows with a missing author ID.
# The result is reassigned instead of using inplace=True: data_NA was sliced
# out of `data`, and mutating such a slice in place raises
# SettingWithCopyWarning. Reassigning also keeps the cell safe to re-run.
data_NA = data_NA.dropna(subset=['Author(s) ID'])

In [None]:
# Keep only the columns that the natural language processing part works with
data_NLP = data[[
    'Authors',
    'Author(s) ID',
    'Title',
    'Abstract',
    'Year',
    'Source title',
    'Author Keywords',
]]


In [None]:
# Build the NLP input text as "<Title>. <Abstract>".
# .assign returns a new frame, so we never write into a slice of `data`
# (direct column assignment on that slice raises SettingWithCopyWarning).
# NOTE(review): rows where Title or Abstract is missing end up with a missing
# 'text' value, because string concatenation propagates NaN - confirm that is acceptable.
data_NLP = data_NLP.assign(text=data_NLP['Title'] + '. ' + data_NLP['Abstract'])

In [None]:
# Preview the NLP subset, including the new 'text' column
data_NLP.head()

## Network Analysis

In this section we look into the networks between authors. The aim is to identify co-authorships, locate important authors, and detect communities.

It is an undirected network, since authors work on papers together. The underlying data is bipartite, as the nodes can be authors or papers; we project it onto the authors, so that two authors are connected if they have worked on the same paper. Furthermore, the network should not contain self-loops, since authors cannot co-author with themselves.

### Importing packages and preparing data for Network Analysis

In [None]:
# pip install nxviz

In [None]:
# pip install networkx

In [None]:
#pip install python-louvain

In [None]:
# pip install holoviews

In [None]:
# pip install bokeh

In [None]:
#pip install CircosPlot

In [None]:
# pip install -qq holoviews

In [None]:
# pip install -qq -U bokeh

In [None]:
# pip install -qq datashader

In [None]:
# pip install xarray

In [None]:
# pip install datashader

In [None]:
#pip install scikit-image

In [None]:
#Importing packages for network analysis
import networkx as nx
import nxviz as nv
import matplotlib.pyplot as plt 

#Import the libraries and link to the bokeh backend
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show

#Library for community creation
from community import community_louvain 

In [None]:
# Split the comma-separated author list so that each (author, title) pair gets its own row
data_NA = (
    data_NA
    .assign(Authors=data_NA['Authors'].str.split(','))
    .explode('Authors', ignore_index=True)
)

# Splitting on ',' leaves a leading blank on every author except the first
# ("Doe A., Roe B." -> "Doe A.", " Roe B."). Without stripping, the same person
# is treated as two different authors depending on their position in the list,
# and would later receive two different Author_ID values.
# Tokens that are empty after stripping are turned into NaN so the later
# dropna on 'Authors' removes them.
data_NA['Authors'] = data_NA['Authors'].str.strip().replace('', np.nan)

In [None]:
# Inspect the 'Authors' column after the split: one author name per row
data_NA['Authors']

In [None]:
# Give every distinct author name an integer code (first element of the factorize result)
data_NA['Author_ID'] = data_NA['Authors'].factorize()[0]

In [None]:
# The 20 author names that occur in the most rows, i.e. on the most titles
data_NA['Authors'].value_counts(ascending=False).nlargest(20)

In [None]:
# Remove rows without a usable author name: the Scopus placeholder string and a
# single-blank name are both mapped to NaN, then every NaN author row is dropped
data_NA['Authors'] = data_NA['Authors'].replace(
    {'[No author name available]': np.nan, ' ': np.nan}
)
data_NA.dropna(subset=['Authors'], inplace=True)

In [None]:
# The 20 titles with the most author rows (one row per author after the split above)
data_NA['Title'].value_counts(ascending=False).nlargest(20)

### Co-authorship network

Here we specify a network of co-authorships, specifying edges and nodes. The network shows which authors have been working on papers together. 

In [None]:
# Creating edges between authors working on the same title: the self-merge pairs
# every author row with every author row that shares its Title.
# NOTE(review): distinct papers that happen to share a title are merged into one
# here - confirm whether a unique paper identifier should be used as the key instead.
data_NA_select = data_NA[['Authors', 'Author_ID', 'Title']]
edges = pd.merge(data_NA_select, data_NA_select, on='Title')

# Keep every unordered author pair exactly once. The strict '<' removes
# self-loops (x == y) and also the mirrored duplicate (y, x) of each pair, which
# would otherwise double every edge count derived from this frame. The
# undirected graph built from the edge list is unaffected.
edges = edges[edges.Author_ID_x < edges.Author_ID_y]

# Checking the edges
edges.head()

In [None]:
# Collapse repeated author pairs into one row each; the group size (stored in
# the column named 0 after reset_index) is the number of shared titles
edges = (
    edges
    .groupby(['Author_ID_x', 'Author_ID_y'])
    .size()
    .reset_index()
)

# Preview the aggregated edge list
edges.head()

In [None]:
# Distribution of the pair counts in column 0: how many author pairs have
# worked together on 1, 2, 3, ... titles
edges[0].value_counts()

In [None]:
# Give the pair-count column (named 0 by reset_index) the name 'weight'
edges = edges.rename(columns={0: 'weight'})

In [None]:
# Number of rows in the aggregated edge list
len(edges)

In [None]:
# Build an undirected graph from the edge list, carrying 'weight' as an edge attribute
G = nx.from_pandas_edgelist(
    edges,
    source='Author_ID_x',
    target='Author_ID_y',
    edge_attr='weight',
    create_using=nx.Graph(),
)

In [None]:
# Node attribute lookup of the form {Author_ID: {'Authors': name}}
node_attributes = (
    data_NA_select[['Author_ID', 'Authors']]
    .set_index('Author_ID')
    .drop_duplicates()
    .to_dict(orient='index')
)

In [None]:
# Store each node's degree (its number of distinct co-authors) as the 'degree'
# node attribute. set_node_attributes needs a {node: value} mapping plus the
# attribute name; a dict keyed by the DegreeView object itself matches no node,
# so nothing would be stored.
# The degree is taken from the graph as it is at this point in the notebook.
nx.set_node_attributes(G, dict(G.degree()), 'degree')

In [None]:
# Attach the author name to every node from the {Author_ID: {'Authors': name}} lookup
nx.set_node_attributes(G, values=node_attributes)

In [None]:
# Number of nodes (authors) in the graph
G.number_of_nodes()

In [None]:
# Number of edges (co-author pairs) in the graph
G.number_of_edges()

In [None]:
# G.degree()

In [None]:
# Subset the graph, keeping only nodes with degree > 1.
# .copy() materialises the result as a regular graph. A bare subgraph is a
# frozen, filtering view that re-checks node membership on every access, which
# slows down the centrality and community computations that follow.
G = G.subgraph([n for n, d in G.degree() if d > 1]).copy()

In [None]:
# Number of nodes left after the degree filter
G.number_of_nodes()

In [None]:
# Number of edges left after the degree filter
G.number_of_edges()

#### Centrality

In [None]:
# Three centrality scores per node, each returned as a {node: score} dict.
# Only the eigenvector variant is given the edge weights; the degree and
# betweenness calls use their defaults.
centrality_dgr = nx.degree_centrality(G)
centrality_eig = nx.eigenvector_centrality_numpy(G, weight='weight')
centrality_bet = nx.betweenness_centrality(G)


In [None]:
# Store each centrality score on the nodes under its short attribute name
for attr_name, scores in (
    ('dgr', centrality_dgr),
    ('eig', centrality_eig),
    ('bet', centrality_bet),
):
    nx.set_node_attributes(G, scores, attr_name)

In [None]:
# One row per node, one column per node attribute
nodes_df = pd.DataFrame.from_dict(
    {node: attrs for node, attrs in G.nodes(data=True)},
    orient='index',
)

In [None]:
# Preview the node table built from the graph's node attributes
nodes_df.head()

##### Degree Centrality
Degree centrality is a measure of how many ties a node has. An author who has co-authored with more authors therefore has a higher degree centrality.

In [None]:
# The ten nodes with the highest degree centrality
nodes_df.sort_values('dgr', ascending=False).head(10)

##### Eigenvector Centrality
Eigenvector centrality is a measure of the influence a node has on a network - it takes into account the centrality of the author's connections.

In [None]:
# The ten nodes with the highest eigenvector centrality
nodes_df.sort_values('eig', ascending=False).head(10)

##### Betweenness Centrality 
Betweenness centrality is a measure of how often an author lies on the shortest path between two other authors. It reflects how much information passing between others flows through that author.

In [None]:
# The ten nodes with the highest betweenness centrality
nodes_df.sort_values('bet', ascending=False).head(10)

#### Communities

In [None]:
# Detect communities with the Louvain method and store the community id on each node.
# The algorithm is randomised; fixing random_state makes the communities (and
# every table and plot derived from them) reproducible across kernel restarts.
partition = community_louvain.best_partition(G, random_state=42)
nx.set_node_attributes(G, partition, 'partition')

In [None]:
# Add the community id to the node attributes.
# NOTE: this repeats the assignment already made in the preceding cell; it
# writes the same values again and is therefore redundant but harmless.
nx.set_node_attributes(G, partition, 'partition')

In [None]:
# Rebuild the node table so that it includes the new 'partition' attribute
nodes_df = pd.DataFrame.from_dict(
    {node: attrs for node, attrs in G.nodes(data=True)},
    orient='index',
)
nodes_df.head()

In [None]:
# Sizes of the 20 largest communities
nodes_df['partition'].value_counts().head(20)

In [None]:
# Number of distinct communities found
nodes_df['partition'].nunique()

In [None]:
# Community ids of the ten largest communities
top10_com = nodes_df['partition'].value_counts().head(10).index

# Node ids (index of the node table) that belong to one of those communities
top10_com_nodes = nodes_df.loc[nodes_df['partition'].isin(top10_com)].index

# Subgraph induced by those nodes.
# NOTE: g_sub is assigned again in the "Communities plot" section further down.
g_sub = G.subgraph(top10_com_nodes)

In [None]:
# Node table restricted to members of the ten largest communities
nodes_df_top10 = nodes_df.loc[nodes_df['partition'].isin(top10_com)]

In [None]:
# Display the node table of the ten largest communities
nodes_df_top10

In [None]:
# For each of the top communities, take the five nodes with the highest
# eigenvector centrality; reset_index turns the (partition, node id) index into columns
top_authors = (
    nodes_df_top10
    .groupby('partition')['eig']
    .nlargest(5)
    .reset_index()
)

In [None]:
# Display the top authors per community (before the author names are merged in)
top_authors

In [None]:
# 'level_1' holds the node id taken from the node table's index; name it
# Author_ID so it can be used as the join key
top_authors = top_authors.rename(columns={'level_1': 'Author_ID'})

# Look up the author name for every Author_ID
top_authors = pd.merge(
    top_authors,
    data_NA_select[['Authors', 'Author_ID']].drop_duplicates(),
    on='Author_ID',
    how='inner',
)

In [None]:
# Display the top authors per community, now including the author names
top_authors

#### Visualisation of Co-author network

##### Communities plot

In [None]:
# Nodes whose eigenvector centrality lies above the 99th percentile
top_central_nodes = nodes_df.loc[
    nodes_df['eig'].gt(nodes_df['eig'].quantile(0.99))
].index

In [None]:
# Subgraph induced by the most central nodes (this is the graph that gets plotted)
g_sub = G.subgraph(top_central_nodes)

In [None]:
# Create and save a layout. spring_layout starts from random positions, so a
# fixed seed is passed to get the same picture on every run.
g_layout = nx.layout.spring_layout(g_sub, seed=42)

# Graph element coloured by community, plus author-name labels at the node positions
g_plot = hv.Graph.from_networkx(g_sub, g_layout).opts(tools=['hover'], node_color='partition')
labels = hv.Labels(g_plot.nodes, ['x', 'y'], 'Authors')

In [None]:
# Import the libraries and link to the bokeh backend
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show
# Default canvas for node and graph elements: 800x800 with both axes hidden
kwargs = {'width': 800, 'height': 800, 'xaxis': None, 'yaxis': None}
opts.defaults(
    opts.Nodes(**kwargs),
    opts.Graph(**kwargs),
)

In [None]:
# pip install scipy

In [None]:
# Make the plot: apply holoviews' bundle_graph operation to the community graph.
# (datashade is imported alongside it but is not used in this cell.)
from holoviews.operation.datashader import datashade, bundle_graph
bundled = bundle_graph(g_plot)

In [None]:
# Overlay the author labels on the bundled graph and render it with bokeh
show(
    hv.render(
        bundled * labels.opts(text_font_size='6pt', text_color='white', bgcolor='gray')
    )
)

##### CircosPlot

In [None]:
# Keep only the author pairs that have worked together on at least 2 titles (weight > 1)
sub_edges = edges.loc[edges['weight'].gt(1)]

In [None]:
# Number of edge rows with weight > 1
len(sub_edges)

In [None]:
# Create network object from pandas edgelist
# G = nx.from_pandas_edgelist(sub_edges, source='Author_ID_x', target='Author_ID_y', edge_attr='weight', create_using=nx.Graph())

In [None]:
# Subset the graph keeping only nodes with degree > 1
# G = nx.subgraph(G, [n for n,d in G.degree() if d > 1])

In [None]:
#Making a circos plot for the general network with weight above 1
#Drawing a network 
# cp = nv.CircosPlot(G)
# cp.draw
# plt.show()


##### Centrality plots

In [None]:
# Default canvas for the centrality plots: 750x750, 10% padding, both axes hidden
defaults = {
    'width': 750,
    'height': 750,
    'padding': 0.1,
    'xaxis': None,
    'yaxis': None,
}
hv.opts.defaults(
    opts.EdgePaths(**defaults),
    opts.Graph(**defaults),
    opts.Nodes(**defaults),
)

In [None]:
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show

In [None]:
# Kamada-Kawai node positions, computed once and shared by the centrality plots
G_layout = nx.kamada_kawai_layout(G)

In [None]:
# Degree centrality plot: node size taken from the 'dgr' attribute, node colour from the author name
g_plot_dgr = hv.Graph.from_networkx(G, G_layout).opts(
    tools=['hover'],
    directed=False,
    edge_alpha=0.25,
    node_size='dgr',
    node_color='Authors',
    cmap='Set1',
    legend_position='right',
)


In [None]:
# Render the degree centrality plot with bokeh and display it
show(hv.render(g_plot_dgr))

In [None]:
# Eigenvector centrality plot: node size taken from the 'eig' attribute, node colour from the author name
g_plot_eig = hv.Graph.from_networkx(G, G_layout).opts(
    tools=['hover'],
    directed=False,
    edge_alpha=0.25,
    node_size='eig',
    node_color='Authors',
    cmap='Set1',
    legend_position='right',
)


In [None]:
# Render the eigenvector centrality plot with bokeh and display it
show(hv.render(g_plot_eig))

In [None]:
# Betweenness centrality plot: node size taken from the 'bet' attribute, node colour from the author name
g_plot_bet = hv.Graph.from_networkx(G, G_layout).opts(
    tools=['hover'],
    directed=False,
    edge_alpha=0.25,
    node_size='bet',
    node_color='Authors',
    cmap='Set1',
    legend_position='right',
)


In [None]:
# Render the betweenness centrality plot with bokeh and display it
show(hv.render(g_plot_bet))

In [None]:
# Fruchterman-Reingold (force-directed) node positions for the second set of plots.
# The layout starts from random positions, so a fixed seed is passed to get the
# same picture on every run.
G_layout2 = nx.layout.fruchterman_reingold_layout(G, seed=42)

In [None]:
# Multiply every degree centrality score by 100, updating the dict in place.
# NOTE: re-running this cell scales the values again.
centrality_dgr.update({node: score * 100 for node, score in centrality_dgr.items()})

In [None]:
# Write the current degree centrality scores back onto the nodes first.
# node_size='dgr' reads the node attribute, not the centrality_dgr dict, so
# without this step the rescaling of the dict has no effect on the plot.
nx.set_node_attributes(G, centrality_dgr, 'dgr')

# Degree centrality plot on the force-directed layout, node size taken from 'dgr'
g_plot_dgr2 = hv.Graph.from_networkx(G, G_layout2).opts(
    tools=['hover'],
    node_size='dgr',
)

show(hv.render(g_plot_dgr2))

In [None]:
# Multiply every eigenvector centrality score by 100, updating the dict in place.
# NOTE: re-running this cell scales the values again.
centrality_eig.update({node: score * 100 for node, score in centrality_eig.items()})

In [None]:
# Write the current eigenvector centrality scores back onto the nodes first.
# node_size='eig' reads the node attribute, not the centrality_eig dict, so
# without this step the rescaling of the dict has no effect on the plot.
nx.set_node_attributes(G, centrality_eig, 'eig')

# Eigenvector centrality plot, node size taken from 'eig'
g_plot_eig2 = hv.Graph.from_networkx(G, G_layout).opts(
    tools=['hover'],
    node_size='eig',
)

show(hv.render(g_plot_eig2))

In [None]:
# Multiply every betweenness centrality score by 100, updating the dict in place.
# NOTE: re-running this cell scales the values again.
centrality_bet.update({node: score * 100 for node, score in centrality_bet.items()})

In [None]:
# Write the current betweenness scores onto the nodes so node_size='bet' picks them up
nx.set_node_attributes(G, values=centrality_bet, name='bet')

# Betweenness centrality plot, node size taken from 'bet'
g_plot_bet2 = hv.Graph.from_networkx(G, G_layout).opts(
    tools=['hover'],
    node_size='bet',
)

show(hv.render(g_plot_bet2))

##### Community plot 

In [None]:
# Make sure every node carries its community id before plotting
nx.set_node_attributes(G, values=partition, name='partition')

# Community plot on the force-directed layout, nodes coloured by community id
g_plot_par = hv.Graph.from_networkx(G, G_layout2).opts(
    tools=['hover'],
    #node_size='cent_degree',
    node_color='partition',
    cmap=plt.cm.Set1,
    legend_position='right',
)

show(hv.render(g_plot_par))

#### Assortativity

In [None]:
#Assortiativity
# G_friendship = nx.from_pandas_adjacency(mat_friendship, create_using=nx.DiGraph)
# nx.attribute_assortativity_coefficient(G_friendship, 'seniority')

### The network over time (evolution of the network)