In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery

In [8]:
# Sanity-check the batch query helper with a single title.
# NOTE(review): the output below reports the page title 'PubMed' for the query
# 'PubMed Identifier', so the helper appears to resolve the requested title to a
# different page title (presumably a redirect) -- confirm in WikiMultiQuery.
wiki_multi_query(["PubMed Identifier"])

[{'pageid': '503009',
  'title': 'PubMed',
  'links': ['Aalborg University',
   'Academic library',
   'Admission note',
   'Annals of Surgery',
   "Anne O'Tate",
   'Arrowsmith System',
   'BMJ',
   'Bibcode',
   'Bibliographic database',
   'Blue Button',
   'Correction (newspaper)',
   'De-identification',
   'Decision aids',
   'Dialog (online database)',
   'Digital object identifier',
   'Doctor–patient relationship',
   'E-patient',
   'EBSCO Publishing',
   'EHealth',
   'ETBLAST',
   'Electronic health record',
   'Embase',
   'Entrez',
   'Health 2.0',
   'Health Insurance Portability and Accountability Act',
   'Health education',
   'Health informatics',
   'Health information on Wikipedia',
   'Health information on the Internet',
   'In absentia health care',
   'Index Medicus',
   'Information retrieval',
   'JournalReview.org',
   'Kenneth H. Wolfe',
   'Knowledge Finder',
   'Knowledge translation',
   'Letter to the editor',
   'List of open-source health software',
 

In [3]:
class GraphCreator:
    """Grow a directed graph of Wikipedia articles by following their links.

    Nodes are article titles. For every queried article an edge is added from
    the article to each title in its ``links`` and from each title in its
    ``linkshere`` to the article. Titles found in ``links`` are queued in
    ``next_links`` so that ``expand_network`` can query them later.

    Attributes:
        graph: the ``nx.DiGraph`` being built.
        visited: set of titles that have already been handed to a query.
        next_links: list of titles waiting to be queried. It may contain
            duplicates and already-visited titles; those are filtered when
            the title is dequeued.
        skipped: list of ``(title, error_repr)`` pairs for queries that
            raised or timed out during ``expand_network``.
    """

    def __init__(self, entry):
        """Create the graph and immediately query the ``entry`` article.

        Args:
            entry: title of the article the crawl starts from.
        """
        self.graph = nx.DiGraph()

        self.visited = {entry}
        self.next_links = []
        self.skipped = []

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError("article query timed out")

        # SIGALRM does not exist on every platform, and signal.signal() raises
        # ValueError outside the main thread. In either case run without a
        # timeout instead of failing to construct the object.
        self._timeout_enabled = False
        if hasattr(signal, "SIGALRM"):
            try:
                signal.signal(signal.SIGALRM, handle_alarm)
                self._timeout_enabled = True
            except ValueError:
                pass

        self.query_articles([entry])

    def _arm_timeout(self, seconds):
        """Arm the SIGALRM timer (``0`` cancels it); no-op when timeouts are unavailable."""
        if self._timeout_enabled:
            signal.alarm(seconds)

    def has_key(self, key, resp, pageid):
        """Return True when ``resp['query']['pages'][pageid]`` has a truthy value for ``key``."""
        return bool(resp['query']["pages"][pageid].get(key))

    def add_edges(self, articles):
        """Add the link edges of every article to the graph.

        Args:
            articles: iterable of dicts with a ``'title'`` and, optionally,
                ``'links'`` (outgoing titles) and ``'linkshere'`` (incoming
                titles). A missing or empty list contributes no edges.
        """
        for article in articles:
            title = article['title']
            outgoing = article.get('links') or []
            incoming = article.get('linkshere') or []
            self.graph.add_edges_from((title, target) for target in outgoing)
            self.graph.add_edges_from((source, title) for source in incoming)

    def plot_graph(self):
        """Draw the current graph with networkx's default layout and show it."""
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        """Return ``(node, degree)`` pairs sorted by degree, highest first."""
        return sorted(self.graph.degree(), key=lambda pair: pair[1], reverse=True)

    def get_centrality(self, sort_results=False):
        """Compute the eigenvector centrality of every node.

        Args:
            sort_results: when True, return a list of ``(node, score)`` pairs
                sorted by score, highest first; otherwise return the mapping
                produced by ``nx.eigenvector_centrality``.
        """
        scores = nx.eigenvector_centrality(self.graph)
        if not sort_results:
            return scores
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    def expand_network(self, timeout=5):
        """Query every link that is queued when the call starts.

        Each unvisited title is printed as ``<index> <title>``, marked as
        visited and queried. A query that raises or exceeds ``timeout`` prints
        ``==SKIPPED==``, is recorded in ``self.skipped`` and the crawl moves
        on. Links discovered by these queries stay queued for the next call.

        Args:
            timeout: whole seconds allowed per query; ``0`` or ``None``
                disables the limit. Ignored when SIGALRM is unavailable.
        """
        # next step is to make it so each .query_article call works on multiple articles at a time.
        num_links = len(self.next_links)
        seconds = timeout or 0
        processed = 0

        try:
            for i in range(num_links):
                # Read by index and trim once at the end: queries append to the
                # tail of the list, so the first num_links entries never move.
                link = self.next_links[i]
                processed = i + 1
                if link in self.visited:
                    continue

                print(i, link)
                self.visited.add(link)
                try:
                    self._arm_timeout(seconds)
                    self.query_articles([link])
                except Exception as err:
                    # Exception (not a bare except) so that Ctrl-C / kernel
                    # interrupt still stops the crawl.
                    self.skipped.append((link, repr(err)))
                    print("==SKIPPED==")
                finally:
                    # Always cancel, so a pending alarm cannot fire later.
                    self._arm_timeout(0)
        finally:
            # Drop exactly the entries that were dequeued, even when interrupted.
            del self.next_links[:processed]

    def update_next_links(self, articles):
        """Queue the outgoing links of every article for a later expansion.

        Args:
            articles: iterable of dicts; each one's ``'links'`` list (if any)
                is appended to ``self.next_links``.
        """
        for article in articles:
            self.next_links.extend(article.get('links') or [])

    def query_articles(self, titles, generate_graph=True):
        """Query ``titles`` and fold the result into the crawl state.

        Args:
            titles: list of article titles passed to ``wiki_multi_query``.
            generate_graph: when True (default) add the articles' edges to
                the graph; when False only queue their links.

        Returns:
            The articles returned by ``wiki_multi_query``.
        """
        articles = wiki_multi_query(titles)

        self.update_next_links(articles)
        if generate_graph:
            self.add_edges(articles)
        return articles


## TESTS

In [4]:
# Seed a graph with a single entry article. The constructor queries the entry
# right away, so the graph and the link queue are populated once this cell finishes.
test = GraphCreator('Decision tree')

In [5]:
# Crawl one level out: query every link that is queued right now. Each attempt
# prints "<index> <title>"; "==SKIPPED==" marks a query that raised or timed out.
test.expand_network()

0 Algorithm
1 Argument map
2 Association rule learning
3 Behavior tree (artificial intelligence, robotics and control)
4 Boosting (machine learning)
5 Business decision mapping
6 Causal
7 Causal model
8 CiteSeerX
==SKIPPED==
9 Cladistics
10 Cognitive map
11 Concept lattice
12 Concept map
13 Conceptual graph
14 Conditional probability
15 DRAKON
16 Data visualization
17 Decision analysis
18 Decision cycle
19 Decision list
20 Decision support system
21 Decision table
22 Decision tree learning
23 Decision tree model
24 Dendrogram
25 Design rationale
26 Diagrammatic reasoning
27 Digital object identifier
==SKIPPED==
28 Diminishing returns
29 Entity–relationship model
30 Expected utility
31 Expected value
32 Flowchart
33 Geovisualization
34 Goal
35 Graph drawing
36 Graphic communication
37 Hyperbolic tree
38 Hypertext
39 ID3 algorithm
40 Influence diagram
41 Influence diagrams
42 Infographic
43 Information design
44 Information gain in decision trees
45 Information mapping
46 International S

In [11]:
# Eigenvector centrality of every node as (title, score) pairs, highest score first.
test.get_centrality(sort_results=True)

[('Topic map', 0.32157174975200986),
 ('Flowchart', 0.268518876167679),
 ('Time', 0.2076387997870216),
 ('Causality', 0.18916096048199302),
 ('Semantic Web', 0.1615223304764255),
 ('Timeline', 0.15908451733340012),
 ('Graph drawing', 0.1310950789431242),
 ('Semantic network', 0.11974144543575406),
 ('Hypertext', 0.11817750890465005),
 ('International Standard Book Number', 0.11631755448356124),
 ('Mind map', 0.11359166866747386),
 ('Data visualization', 0.11139028754944111),
 ('Infographic', 0.10744429907619288),
 ('Digital object identifier', 0.10695199969443064),
 ('Information design', 0.10463064478888211),
 ('Visual analytics', 0.10343726219458092),
 ('Geovisualization', 0.10201819258028276),
 ('Wicked problem', 0.09890768719493147),
 ('Design rationale', 0.09588864988150472),
 ('Algorithm', 0.07565548377866424),
 ('Entity–relationship model', 0.07417138345296499),
 ('Visualization (graphics)', 0.07329370388059653),
 ('Concept map', 0.06743649402732377),
 ('Argument map', 0.0665512

In [None]:
# Another expansion pass: processes the links that earlier queries left in the queue.
test.expand_network()

In [None]:
# Inspect the pending crawl queue. update_next_links() requires the queried
# articles as an argument and is already invoked by query_articles(), so calling
# it bare here raised TypeError; there is nothing to call before looking.
# Only the head of the queue is shown because the full list is long.
test.next_links[:20]

In [None]:
# Re-rank the nodes by eigenvector centrality (highest first) on the current graph.
test.get_centrality(sort_results=True)


In [None]:
# Query several articles in one call. GraphCreator has no get_article_info
# method; query_articles is the method that takes a list of titles and the
# generate_graph keyword.
test.query_articles(['Decision tree', 'Cluster analysis', 'Bayes estimator'], generate_graph=False)

In [None]:

# Start a fresh graph from a second article and scrape the same page's intro.
gc = GraphCreator("Random forest")
wis = WikiIntroScrapper("https://en.wikipedia.org/wiki/Random_forest")
# Trailing ';' keeps the cell from echoing the call's return value.
# NOTE(review): presumably this fills wis.intro_link_titles, which later cells read -- confirm in wiki_intro_scrapper.
wis.parse_intro_links();

In [None]:
# Titles of the articles linked from the intro, as collected by the scraper.
wis.intro_link_titles

In [None]:
# Centrality of the entry article and of every article linked from its intro.
# A scraped title is not guaranteed to be a node in the graph (for example when
# the queried page is stored under a different title), so look scores up with
# .get(): a missing title prints None instead of aborting the cell with KeyError.
centrality = gc.get_centrality()
print(wis.title, centrality.get(wis.title))
for title in wis.intro_link_titles:
    print(title, centrality.get(title))

In [None]:
# Crawl one level out from the entry article before re-computing centrality.
gc.expand_network()

In [None]:
# Rank the entry article and its intro links by eigenvector centrality.
centrality = gc.get_centrality()

# Titles that are not nodes in the graph get NaN rather than raising KeyError;
# sort_values() places NaN rows last.
ranked_titles = [wis.title, *wis.intro_link_titles]
rankings = [(title, centrality.get(title, float("nan"))) for title in ranked_titles]

ranking_df = pd.DataFrame(rankings, columns=["title", "centrality"])
ranking_df.sort_values("centrality", ascending=False).reset_index(drop=True)