In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import networkx as nx

from collections import defaultdict

import scipy.sparse as spsparse

import pyscisci.all as pyscisci


In [2]:
# Location of the MAG database on disk.
# Set the MAG_PATH environment variable to point at your own copy
# (e.g. '/home/ajgates/MAG'); when it is not set, the path below is used.
path2mag = os.environ.get('MAG_PATH', '/users/hgt6rn/Documents/DataSets/MAG')


mymag = pyscisci.MAG(path2mag, database_extension='hdf', keep_in_memory=False, show_progress=True)
# keep_in_memory=False loads each database every time it is needed;
# keep_in_memory=True keeps each database in memory after it is loaded

# a global_filter (not used here) will only load publications that match the filtering criteria

In [3]:
# We focus on a seminal paper that both measures and embodies interdisciplinarity:
#
#   "A general framework for analysing diversity in science, technology and society"
#   Andrew Stirling

focus_publication_id = 2110158660


# The idea here is to understand the interdisciplinary impact of the focus publication.

# To do this, we take all publications that cite the focus publication and ask how they
# are related to each other by making the co-citation network.
# Clusters/communities in this network reflect different home disciplines of the publications
# and reflect the interdisciplinary nature of our original focus publication.

In [5]:
# Step 1: pull every reference record whose cited publication is the focus publication
filter_dict = {'CitedPublicationId': np.sort([focus_publication_id])}
citing_focus = mymag.load_references(filter_dict=filter_dict)

# sorted, de-duplicated ids of the publications that cite the focus publication
focus_citing_pubs = np.sort(citing_focus['CitingPublicationId'].unique())

# Step 2: pull every reference record whose cited publication is one of those citing publications
pub2ref = mymag.load_references(filter_dict={'CitedPublicationId': focus_citing_pubs})

# summarize how many distinct publications sit on each side of the citation table
n_citing = pub2ref['CitingPublicationId'].nunique()
n_cited = pub2ref['CitedPublicationId'].nunique()
print(f"{n_citing} citing publications produce {n_cited} co-cited publications")

Loading pub2ref:   0%|          | 0/186 [00:00<?, ?it/s]

Loading pub2ref:   0%|          | 0/186 [00:00<?, ?it/s]

11516 citing publications produce 427 co-cited publications


In [6]:
# fetch the publication records for every cited publication found in pub2ref
cited_publication_ids = np.sort(pub2ref['CitedPublicationId'].unique())
pub = mymag.load_publications(filter_dict={'PublicationId': cited_publication_ids})
pub

Loading Publications:   0%|          | 0/132 [00:00<?, ?it/s]

Unnamed: 0,PublicationId,Year,JournalId,FamilyId,Doi,Title,Date,Volume,Issue,FirstPage,LastPage,DocSubTypes,DocType
351727,2356497335,2016.0,138541872.0,,10.3743/KOSIM.2016.33.1.007,topic modeling based interdisciplinarity measu...,2016-03-30,33,1,7,32,,j
731605,2921285431,2019.0,148561398.0,,10.1007/S11192-019-03067-2,finding high impact interdisciplinary users ba...,2019-05-01,119,2,1017,1035,,j
1658733,2094373579,2013.0,,,10.1109/SOCIALCOM.2013.49,religious politicians and creative photographe...,2013-09-08,,,303,310,,c
769247,2086986353,2013.0,39307421.0,,10.1016/J.TECHFORE.2012.12.005,social capital absorptive capability and firm ...,2013-09-01,80,7,1261,1270,,j
1236249,3101642834,2019.0,196734849.0,3.101643e+09,10.1038/S41598-019-38869-0,assessing diversity in multiplex networks,2019-03-14,9,1,4511,4511,,j
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073442,3123394243,2017.0,,3.123394e+09,10.1002/ASI.23631,strategic intelligence on emerging technologie...,2017-01-01,68,1,214,233,,c
1084441,2175567070,2016.0,9731383.0,,10.1016/J.RESPOL.2015.06.014,internal or external spillovers which kind of ...,2016-02-01,45,1,27,41,,j
189762,1923918141,2008.0,,,,powering our lives sustainable energy manageme...,2008-01-01,,,,,,
363717,2027366007,2009.0,67638930.0,,10.1016/J.STRUECO.2008.10.001,multilevel assessment of diversity innovation ...,2009-03-01,20,1,50,60,,j


In [46]:
# Build the co-word network from the titles of the publications loaded in `pub`
# (the publications that cited our focus publication).
comentionnet, word2int = pyscisci.coword_network(
    pub,
    text_column='Title',
    stop_words='english',
    strip_accents='ascii',
    lowercase=True,
    threshold=5,
    vocabulary=None,
    show_progress=False,
)

# eliminate self-loops by zeroing the diagonal
comentionnet.setdiag(0)

# zero out every stored entry with weight <= 1, then drop the explicit zeros from the matrix
weak_entries = comentionnet.data <= 1
comentionnet.data[weak_entries] = 0
comentionnet.eliminate_zeros()

# comentionnet is our network adjacency matrix
# word2int maps the words to the row/col of the adjacency matrix
# NOTE(review): the edge count printed below is the number of stored non-zero entries (nnz);
# if the matrix is symmetric each undirected edge is counted twice -- confirm.
print(f"{len(word2int)} nodes, and {comentionnet.nnz} edges")




144 nodes, and 1494 edges


In [48]:
# Make an interactive visualization of the co-word network using pyvis.
# The import lives in this cell because no earlier cell imports pyvis; without it
# `Network` is undefined and the cell raises NameError on a fresh kernel.
from pyvis.network import Network

# turn the sparse adjacency matrix into a weighted networkx graph
g = nx.from_scipy_sparse_array(comentionnet, edge_attribute='weight')
# relabel the integer node ids with the words they stand for (inverse of word2int)
g = nx.relabel_nodes(g, {i: w for w, i in word2int.items()})
# cast edge weights to built-in int (presumably so pyvis can serialize them -- the
# matrix entries come out of scipy as numpy scalars)
nx.set_edge_attributes(g, {e: int(w) for e, w in nx.get_edge_attributes(g, 'weight').items()}, "weight")

# cdn_resources='remote': pyvis warns that its default ('local') resources have problems
# on chrome/safari when rendered inside a jupyter notebook
net = Network(notebook=True, cdn_resources='remote')
net.repulsion()
net.from_nx(g)
net.show("CoWordMention.html")

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
