In [1]:
API_KEY = "PWs3U9oSnBvIsW6tn8uuadoqMRHbUFfRDpAeyI3IFUR6STKBF0tD"

# Search Parameters

In [2]:
seed = 70 # seed for NMF topic model
num_topics = 12
labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
vis_seed = 6 # seed for t-SNE visualization
vis_angle = 135 # rotation angle for visualization

# Import Modules

In [3]:
import requests
import time
import gensim
from gensim.utils import simple_preprocess
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from IPython.display import display
from collections import defaultdict
import json

import seaborn as sns
sns.set('paper')

import logging
logging.getLogger().setLevel(logging.INFO)

# some python 3 trickery
import sys
if sys.version_info[0] >= 3:
    unicode = str

# Useful Functions

In [4]:
def prepare_fig(w=1, h=None):
    if h is None: h = w
    figsize = (6 * w, 3 * h)
    sns.set(rc={'figure.figsize': figsize})
    fig = plt.figure(figsize=figsize)
    plt.clf()
    return fig

def top_k(mapping, k=10):
    return sorted(mapping.keys(), key=lambda x: mapping[x])[::-1][:k]

pd.set_option('display.max_rows', 250)

def plot_statistic(fun):
    count = defaultdict(int)

    for row in fun:
        if row:
            count[unicode(row)] += 1

    top_keys = top_k(count, 50)

    prepare_fig(1, 4)
    plt.xlabel("No. publications")
    plt.barh(
        range(len(top_keys)),
        [count[a] for a in top_keys])
    plt.yticks(
        range(len(top_keys)), 
        [key[:50] for key in top_keys])
    plt.show()

# Perform Query

In [5]:
# Some query examples
# "protein AND (molecular dynamics) AND year_published:[2000 TO 2018]"
# "author.display_name:(Jingjie AND Yeo) AND year_published:[2000 TO 2018]"
# "(autonomous driving) OR (self-driving car) OR (robotic car)"
# "(silk) OR (collagen) AND (biomaterial) AND (molecular dynamics)"
# "(community participatory research) AND year_published:[2017 TO 2018]"

url = 'https://api.lens.org/scholarly/search'

request_body = '''{
	"query": {
		"query_string": {
			"query": "(LCA) AND (biomaterial)",
            "default_operator": "and"
		}
	},
    "languages": "en",
    "scroll": "1m",
    "size": 500,
    "sort": [
            {
            "year_published": "desc"
            }
     ]
}'''

headers = {'Authorization': API_KEY, 'Content-Type': 'application/json'}

# Recursive function to scroll through paginated results
def scroll(scroll_id, url, request_body, headers, df_raw):
    print(df_raw.head())
    # Change the request_body to prepare for next scroll api call
    # Make sure to append the include fields to make faster response
    if scroll_id is not None:
        request_body = '''{"scroll_id": "%s"}''' % (scroll_id)

    # make api request
    response = requests.post(url, data=request_body, headers=headers)

    # If rate-limited, wait for n seconds and proceed the same scroll id
    # Since scroll time is 1 minutes, it will give sufficient time to wait and proceed
    if response.status_code == requests.codes.too_many_requests:
        time.sleep(8)
        return scroll(scroll_id, url, request_body, headers, df_raw)

    # If the response is not ok here, better to stop here and debug it
    elif response.status_code != requests.codes.ok:
        print(response.json())

    # If the response is ok, do something with the response, take the new scroll id and iterate
    else:
        json = response.json()

        # End recursion once end of search is reached
        if not json['data']:
            return df_raw

        scroll_id = json['scroll_id'] # Extract the new scroll id from response

        # Store the data into a dataframe
        df_raw = pd.concat([df_raw,pd.DataFrame.from_dict(json['data'])])

        # Keep scrolling
        return scroll(scroll_id, url, request_body, headers, df_raw)

# start recursive scrolling
df_raw = pd.DataFrame()
df_raw = scroll(None, url, request_body, headers, df_raw)
# Raw Data
df_raw = df_raw.reset_index() # make sure indexes pair with number of rows

# Filter journal articles only
df = df_raw[df_raw['publication_type'].str.contains('journal article', na=False)]
df = df.reset_index() # make sure indexes pair with number of rows

Empty DataFrame
Columns: []
Index: []
               lens_id                                              title  \
0  043-381-014-847-221  FEK self-assembled peptide hydrogels facilitat...   
1  106-348-768-834-151  Sustainability assessment of palm oil-based re...   
2  009-418-160-779-435  Impact of nanoparticles on the environmental s...   
3  001-235-163-928-974  A Flexible Anti-Biofilm Hygiene Coating for Wa...   
4  039-765-729-676-647  Life cycle assessment of auto-tropically culti...   

  publication_type  year_published                    date_published  \
0  journal article          2022.0  2022-03-18T00:00:00.000000+00:00   
1  journal article          2022.0  2022-12-01T00:00:00.000000+00:00   
2  journal article          2022.0                               NaN   
3  journal article          2022.0  2022-07-31T00:00:00.000000+00:00   
4  journal article          2022.0  2022-08-25T00:00:00.000000+00:00   

  date_published_parts                           created  \
0     

In [6]:
# Post-process data to obtain author information        
from itertools import combinations

# Get each manuscript's author names as a Pandas Series
authors = pd.DataFrame()
for a in df['authors'].dropna():
    try:
        author_list = []
        for b in a:
            try:
                author_list.append(b['last_name'] + ', ' + b['first_name'])
            except Exception:
                continue
        
        author = []
        author.append(author_list)
        authors = pd.concat([authors,pd.Series(author)], ignore_index=True)
    except Exception:
        continue

# We will extract author connections
# First, convert to Pandas Series
authors = authors.squeeze()

# Create a list of these authors
authors_flat = [
    author
    for authors in list(authors.dropna())
    for author in authors
]

# Permute all combinations of author pairs in each manuscript
author_connections = list(
    map(lambda x: list(combinations(x[::-1], 2)), authors)
)

# Flatten into a list
flat_connections = [item for sublist in author_connections for item in sublist]

# Create a dataframe with the connections
df_connect = pd.DataFrame(flat_connections, columns=["From", "To"])
df_graph = df_connect.groupby(["From", "To"]).size().reset_index()
df_graph.columns = ["From", "To", "Count"]


In [7]:
# Create graphs to connect each manuscript's authors
import networkx as nx
from collections import Counter

G = nx.from_pandas_edgelist(
    df_graph, source="From", target="To", edge_attr="Count"
)

# Limit to top authors, please edit accordingly
top_authors = pd.DataFrame.from_records(
    Counter(authors_flat).most_common(20), columns=["Name", "Count"]
)

top_nodes = (n for n in list(G.nodes()) if n in list(top_authors["Name"]))

G_top = G.subgraph(top_nodes)

for n in G_top.nodes():
    G_top.nodes[n]["publications"] = int(
        top_authors[top_authors["Name"] == n]["Count"]
    )

# Plotting using Bokeh

running on colab notebooks might require one to install bokeh and networkx package <br>run "!pip install networkx==v2.7.1 bokeh==v3.0.3" in a cell

In [8]:
# networkx v2.7.1
# bokeh v3.0.3
from bokeh.io import output_notebook, show
from bokeh.plotting import from_networkx, figure
# for rendering
from bokeh.models import (BoxSelectTool, Circle, HoverTool, MultiLine, NodesAndLinkedEdges, TapTool, Range1d)
# for color
from bokeh.palettes import TolYlOrBr9, Blues9, Category10_9, Dark2_8
from bokeh.transform import linear_cmap
# for labels
from bokeh.models import ColumnDataSource, LabelSet


"""
Render an interactive using Bokeh

G: Networkx graph object to be rendered
layout: str, specify the layout of the graph. one of "spring", "circular", "random", "kamada_kawai"
color: color palette from bokeh, can be found on bokeh docs at https://docs.bokeh.org/en/latest/docs/reference/palettes.html

returns: bokeh plot object.
"""
def bokeh_graph_plot(G,layout = "spring", color_palette=Dark2_8):

     # convert the labels of nodes to integers
    G_num = nx.convert_node_labels_to_integers(G, label_attribute="Name")

    # calculate the spread of the number of publications
    num_pubs_list = [G_num.nodes().get(n_idx)['publications'] for n_idx in G_num.nodes()]
    num_pub_range = max(num_pubs_list) - min(num_pubs_list)
    graph_scale = max(2,num_pub_range)

    # to adjust node size
    adj_node_size = {n_idx:((max(2,G_num.nodes().get(n_idx)['publications']-min(num_pubs_list)))/num_pub_range)*20 for n_idx in G_num.nodes()}
    nx.set_node_attributes(G_num,name="adj_node_size",values=adj_node_size)

    #to color nodes based on connected components
    node_color = {}
    connected_components = list(nx.connected_components(G_num))
    for idx,component in enumerate(connected_components):
        color = color_palette[int((idx/len(connected_components))*len(color_palette))]
        for n in component:
            node_color[n] = color
    nx.set_node_attributes(G_num,name="node_color",values=node_color)

    # to annotate nodes with fname
    lname = {n_idx:G_num.nodes().get(n_idx)['Name'].split(',')[0] for n_idx in G_num.nodes()}
    nx.set_node_attributes(G_num,name="lname",values=lname)

    # to adjust the width of edges
    num_citations_list = [G_num.edges().get(n_idx)["Count"] for n_idx in G_num.edges()]
    num_citations_range = max(num_citations_list)-min(num_citations_list)
    adj_edge_size = {n_idx:((max(1,G_num.edges().get(n_idx)["Count"]-min(num_citations_list)))/num_citations_range)*5 for n_idx in G_num.edges()}
    nx.set_edge_attributes(G_num, name="adj_edge_width", values=adj_edge_size)

    # basic graph renderer
    if layout == "spring":
        graph_renderer = from_networkx(G_num, nx.spring_layout, scale=graph_scale, center=(0,0), k=0.5)
    elif layout == "circular":
        graph_renderer = from_networkx(G_num, nx.circular_layout, scale=2, center=(0,0))
    elif layout == "kamada_kawai":
        graph_renderer = from_networkx(G_num, nx.kamada_kawai_layout, scale=2, center=(0,0))
    else:
        graph_renderer = from_networkx(G_num, nx.random_layout,center=(0,0))

    # setup plot

    plot = figure(title="Publication network analysis",
                  min_width = 500, min_height = 500,
                  # x_range=Range1d(-2.5,2.5), y_range=Range1d(-2.5,2.5),
                  tools="pan,save,reset,wheel_zoom,box_zoom", active_scroll = "wheel_zoom")
    # for tool tip of node
    plot.add_tools(HoverTool(tooltips=[('Name','@Name'),('Publications','@publications')],
                             renderers=[graph_renderer.node_renderer]))
    # for tool tip of edge
    plot.add_tools(HoverTool(tooltips=[('Count','@Count')],
                             renderers=[graph_renderer.edge_renderer],
                             line_policy="interp"))
    # for selection options (mouse click and lasso select)
    plot.add_tools(TapTool(), BoxSelectTool())


    # nodes

    node_size_attrib = "adj_node_size"
    node_color_attrib = "node_color"

    # render the nodes
    graph_renderer.node_renderer.glyph = Circle(size=node_size_attrib, fill_color=node_color_attrib)
    graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color='red')
    graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color='black')

    # Edges

    edge_width_attrib = "adj_edge_width"

    # render the edges
    graph_renderer.edge_renderer.glyph = MultiLine(line_color = "#CCCCCC", line_alpha=0.5, line_width=edge_width_attrib)
    graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color = 'red', line_alpha=0.5, line_width=edge_width_attrib)
    graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color = 'black', line_alpha=0.5, line_width=edge_width_attrib)

    # selection and insepection methods, gets the node and the edges assoc with the node
    graph_renderer.selection_policy = NodesAndLinkedEdges()
    graph_renderer.inspection_policy = NodesAndLinkedEdges()


    # add labels
    label_attrib = "lname"
    x, y = zip(*graph_renderer.layout_provider.graph_layout.values())
    node_labels = list(G_num.nodes())
    source = ColumnDataSource({'x':x,'y':y,
                               'name':[n_data[1][label_attrib] for n_data in G_num.nodes().data()]})
    labels = LabelSet(x='x', y='y', text='name', source=source, background_fill_color='white', text_font_size='10px', background_fill_alpha=.7)
    plot.renderers.append(labels)

    # show plot
    plot.renderers.append(graph_renderer)

    return plot

Similarly, the plot uses networkx layouts, where the included options are spring, circular, kamada_kawai and random. Nodes that belong to the same connected component of the graph are colored the same and the node size is reflective of the number of publications by the person. The edge width is reflective of the number of shared publications.<br><br>Hovering over nodes would display the full name and the number of publication. Hovering over edges show the number of shared publications.<br><br>There are also some tools for interactivity, which can be enabled and disabled in the tool-bar on the right side of the graph. In order from top to bottom, "pan", "box zoom", "box select", "mouse wheel zoom", "select", "download", "reset", "hover for nodes", "hover for edges".

In [9]:
from bokeh.io import output_file

plot = bokeh_graph_plot(G_top,layout="spring")

# Cant seem to get mouse wheel zoom on local jupyter notebook
# comment out line below to plot on browser to allow wheel zoom
output_notebook()

show(plot)

![](bokeh_graph_plot.png)