# Search Parameters

In [None]:
# --- Topic model settings ---
# seed for NMF topic model
seed = 70
num_topics = 12
# NOTE(review): presumably one letter is used to label each topic — confirm
labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# --- Visualization settings ---
# seed for t-SNE visualization
vis_seed = 6
# rotation angle for visualization
vis_angle = 135

# Import Modules

In [None]:
import requests
import time
import gensim
from gensim.utils import simple_preprocess
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.decomposition
import sklearn.feature_extraction
from wordcloud import WordCloud
from IPython.display import display
from collections import defaultdict

import seaborn as sns
# 'paper' is passed as the first positional argument of sns.set.
# NOTE(review): presumably this selects seaborn's "paper" plotting context — confirm
sns.set('paper')

import logging
# Set the root logger's threshold to INFO
logging.getLogger().setLevel(logging.INFO)

# Python 3 has no `unicode` builtin: alias it to `str` there so that code
# calling unicode() works on both major versions.
import sys
if sys.version_info >= (3,):
    unicode = str


# Useful Functions

In [None]:
def prepare_fig(w=1, h=None):
    """Create and return an empty figure sized in multiples of a 6 x 3 tile.

    Args:
        w: Width multiplier; the figure is ``6 * w`` wide.
        h: Height multiplier; the figure is ``3 * h`` tall. Defaults to ``w``
            when omitted.

    Returns:
        The newly created matplotlib figure, already cleared.
    """
    height_factor = w if h is None else h
    size = (6 * w, 3 * height_factor)

    # Register the same size through seaborn's rc settings as well.
    sns.set(rc={'figure.figsize': size})

    figure = plt.figure(figsize=size)
    plt.clf()
    return figure

def top_k(mapping, k=10):
    """Return up to ``k`` keys of ``mapping``, ordered by descending value.

    The keys are sorted ascending by value and the result is then reversed,
    so keys with equal values come out in the reverse of their iteration
    order in ``mapping``.
    """
    ranked = sorted(mapping.keys(), key=lambda key: mapping[key])
    ranked.reverse()
    return ranked[:k]

# Set the pandas 'display.max_rows' option to 250
pd.set_option('display.max_rows', 250)

def plot_statistic(fun):
    """Show a horizontal bar chart of the 50 most frequent values in ``fun``.

    Args:
        fun: Iterable of values. Falsy entries are ignored; every other entry
            is converted to text with ``unicode`` before being counted.
    """
    tally = defaultdict(int)
    for entry in fun:
        if not entry:
            continue
        tally[unicode(entry)] += 1

    leaders = top_k(tally, 50)
    positions = range(len(leaders))
    bar_lengths = [tally[name] for name in leaders]
    # Tick labels are cut to 50 characters.
    tick_labels = [name[:50] for name in leaders]

    prepare_fig(1, 4)
    plt.xlabel("No. publications")
    plt.barh(positions, bar_lengths)
    plt.yticks(positions, tick_labels)
    plt.show()

# Perform Query

In [None]:
# Some query examples
# "protein AND (molecular dynamics) AND year_published:[2000 TO 2018]"
# "author.display_name:(Jingjie AND Yeo) AND year_published:[2000 TO 2018]"
# "(autonomous driving) OR (self-driving car) OR (robotic car)"
# "(silk) OR (collagen) AND (biomaterial) AND (molecular dynamics)"
# "(community participatory research) AND year_published:[2017 TO 2018]"

# Endpoint that scroll() POSTs the search request to
url = 'https://api.lens.org/scholarly/search'

# JSON request body, kept as a raw string and sent unchanged on the first call.
# - "query_string.query" holds the search expression (see the examples above)
# - "scroll": "1m" is the scroll time of one minute referred to inside scroll()
# - "size": 500 — NOTE(review): presumably the number of records per page; confirm
# - "sort" orders the results by year_published, descending
request_body = '''{
	"query": {
		"query_string": {
			"query": "(silk) OR (collagen) AND (biomaterial) AND (molecular dynamics)",
            "default_operator": "and"
		}
	},
    "languages": "en",
    "scroll": "1m",
    "size": 500,
    "sort": [
            {
            "year_published": "desc"
            }
     ]
}'''

# Replace YOUR_API_KEY with a real access token before running the query
headers = {'Authorization': 'Bearer YOUR_API_KEY', 'Content-Type': 'application/json'}

# Accumulator for the downloaded records; scroll() rebinds it via `global df_raw`
df_raw = pd.DataFrame()
# Scroll through paginated results and accumulate them in the global df_raw
def scroll(scroll_id, url, request_body, headers):
    """Download every page of a search and append the records to ``df_raw``.

    The first request sends ``request_body`` as given (when ``scroll_id`` is
    None). Every later request sends only the scroll id returned by the
    previous response. Paging stops when a response carries no data or when
    a non-OK status other than "too many requests" is received.

    Args:
        scroll_id: Scroll id to resume from, or None to start a new search.
        url: Endpoint the requests are POSTed to.
        request_body: JSON string describing the initial search.
        headers: HTTP headers sent with every request.

    Returns:
        None. The module-level ``df_raw`` is rebound to the old frame plus all
        pages fetched by this call, including pages fetched before an error.
    """
    import json  # local import so this block does not depend on file-level imports

    global df_raw
    pages = []

    try:
        # Loop rather than recurse, so long result sets and repeated
        # rate-limit retries cannot exhaust the call stack.
        while True:
            # Change the request_body to prepare for next scroll api call
            if scroll_id is not None:
                request_body = json.dumps({"scroll_id": scroll_id})

            # make api request
            response = requests.post(url, data=request_body, headers=headers)

            # If rate-limited, wait and retry with the same scroll id.
            # Since scroll time is 1 minute, there is time to wait and proceed.
            if response.status_code == requests.codes.too_many_requests:
                time.sleep(8)
                continue

            # If the response is not ok here, better to stop here and debug it
            if response.status_code != requests.codes.ok:
                try:
                    print(response.json())
                except ValueError:
                    # Error body was not JSON; show it verbatim instead.
                    print(response.text)
                break

            payload = response.json()
            records = payload['data']

            # End of search reached
            if not records:
                break

            # Extract the new scroll id for the next request
            scroll_id = payload['scroll_id']

            pages.append(pd.DataFrame.from_dict(records))
            print('Fetched %d records' % len(records))
    finally:
        # Concatenate once; DataFrame.append no longer exists in pandas >= 2.0.
        if pages:
            existing = [] if df_raw.empty else [df_raw]
            df_raw = pd.concat(existing + pages)

# Download all result pages into df_raw
scroll(None, url, request_body, headers)

# Raw Data
# Renumber the rows 0..n-1; the previous index is kept as a column
df_raw = df_raw.reset_index()

# Keep only rows whose publication_type mentions 'journal article'
# (rows with a missing publication_type are dropped), then renumber again
is_journal_article = df_raw['publication_type'].str.contains('journal article', na=False)
df = df_raw[is_journal_article].reset_index()

In [None]:
# Post-process data to obtain author information
from itertools import combinations

# Collect each manuscript's author names ("Last, First") as one list per row
author_lists = []
for record in df['authors'].dropna():
    names = []
    try:
        for person in record:
            try:
                names.append(person['last_name'] + ', ' + person['first_name'])
            except (KeyError, TypeError):
                # Author entry lacks a usable first or last name: skip it
                continue
    except TypeError:
        # The 'authors' cell is not iterable: skip the whole manuscript
        continue
    author_lists.append(names)

# One list of names per manuscript. Building the Series directly keeps it a
# Series even when there are zero or one manuscripts.
authors = pd.Series(author_lists, dtype=object)

# Create a flat list of all author names (one entry per authorship)
authors_flat = [name for names in authors for name in names]

# All author pairs within each manuscript. Each pair is sorted so that the
# same two co-authors always produce the same (From, To) tuple; otherwise
# (A, B) and (B, A) would be counted separately and one count would
# overwrite the other when loaded into an undirected graph.
author_connections = [
    [tuple(sorted(pair)) for pair in combinations(names, 2)]
    for names in authors
]

# Flatten into a list
flat_connections = [pair for pairs in author_connections for pair in pairs]

# Create a dataframe with the connections and count joint manuscripts per pair
df_connect = pd.DataFrame(flat_connections, columns=["From", "To"])
df_graph = df_connect.groupby(["From", "To"]).size().reset_index(name="Count")


In [None]:
# Create graphs to connect each manuscript's authors
import networkx as nx
from collections import Counter

# One edge per author pair, carrying the pair's "Count" as an edge attribute
G = nx.from_pandas_edgelist(
    df_graph, source="From", target="To", edge_attr="Count"
)

# Limit to top authors, please edit accordingly
top_authors = pd.DataFrame.from_records(
    Counter(authors_flat).most_common(20), columns=["Name", "Count"]
)

# Name -> publication count lookup, used for both the membership test and
# the node attribute (avoids rebuilding a list for every node and calling
# int() on a one-element Series, which pandas has deprecated)
publication_counts = dict(zip(top_authors["Name"], top_authors["Count"]))

# Top authors that actually appear in the co-authorship graph
top_nodes = [n for n in G.nodes() if n in publication_counts]

G_top = G.subgraph(top_nodes)

# Attach each author's publication count as a node attribute
for n in G_top.nodes():
    G_top.nodes[n]["publications"] = int(publication_counts[n])

# Draw the subgraph with edge widths taken from the "Count" edge attribute
edges = G_top.edges()
weights = [G_top[u][v]['Count'] for u, v in edges]
nx.draw_circular(G_top, with_labels=True, width=weights)

In [None]:
# TBD: Nicer visualizations using NXVIZ
# Currently incomplete so YMMV
# Notebook shell escape (IPython/Jupyter only): installs nxviz with pip
!pip install nxviz
import nxviz as nv
from nxviz import annotate

# Circos plot of the top-author subgraph, grouped and coloured by the
# "publications" node attribute that the previous cell sets on G_top
ax = nv.circos(
    G_top,
    group_by="publications",
    node_color_by="publications"
)

# NOTE(review): presumably adds a label for each "publications" group — confirm against the nxviz docs
annotate.circos_group(G_top, group_by="publications")