In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query
from graph_helpers import create_dispersion_df, sort_dict_values

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery

In [87]:
class GraphCreator:
    """Build and analyse a directed link graph around one Wikipedia article.

    Nodes are article titles. For every queried article an edge
    ``title -> link`` is added for each entry of its ``links`` and an edge
    ``linkshere -> title`` for each entry of its ``linkshere``.

    Attributes:
        graph: ``networkx.DiGraph`` holding every edge collected so far.
        entry: Title of the seed article.
        intro_nodes: ``intro_link_titles`` as reported by ``WikiIntroScrapper``
            for the seed article.
        visited: Set of titles that were already handed to ``query_articles``.
        next_links: List used as a queue of outgoing link titles that
            ``expand_network`` has not looked at yet.
    """

    def __init__(self, entry):
        """Seed the graph with ``entry`` and the links of that article.

        Args:
            entry: Article title. It is interpolated into an
                ``https://en.wikipedia.org/wiki/...`` URL for the intro scrape
                and passed to ``wiki_multi_query`` for the link query.
        """
        self.graph = nx.DiGraph()
        self.entry = entry

        wis = WikiIntroScrapper(f"https://en.wikipedia.org/wiki/{entry}")
        wis.parse_intro_links()
        self.intro_nodes = wis.intro_link_titles

        self.visited = {entry}
        self.next_links = []

        self.query_articles([entry])

        # As before, constructing the object arms SIGALRM to raise RuntimeError;
        # the installation is now guarded so it cannot crash the constructor.
        self._install_alarm_handler()

    @staticmethod
    def _handle_alarm(signum, frame):
        """SIGALRM handler: abort whatever is running with a RuntimeError."""
        raise RuntimeError("wiki query timed out")

    @classmethod
    def _install_alarm_handler(cls):
        """Install the SIGALRM handler; return True when alarms can be used."""
        if not hasattr(signal, "SIGALRM"):
            # SIGALRM does not exist on every platform.
            return False
        try:
            signal.signal(signal.SIGALRM, cls._handle_alarm)
        except ValueError:
            # signal.signal() may only be called from the main thread.
            return False
        return True

    def add_edges(self, articles):
        """Add the outgoing and incoming link edges of each article to the graph.

        Args:
            articles: Iterable of mappings with the keys ``'title'``,
                ``'links'`` and ``'linkshere'``.
        """
        for article in articles:
            title = article['title']
            self.graph.add_edges_from((title, link) for link in article['links'])
            self.graph.add_edges_from(
                (linkhere, title) for linkhere in article['linkshere'])

    def plot_graph(self):
        """Draw the whole graph with networkx/matplotlib and show the figure."""
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        """Return the node/degree table produced by ``sort_dict_values``."""
        return sort_dict_values(dict(self.graph.degree()), ["node", "degree"], "degree")

    def get_centrality(self):
        """Return the node/eigenvector-centrality table produced by ``sort_dict_values``."""
        return sort_dict_values(nx.eigenvector_centrality(self.graph), ["node", "centrality"], "centrality")

    def get_dispersion(self, comparison_node=None):
        """Return the dispersion of ``comparison_node`` against the other nodes.

        Args:
            comparison_node: Node passed as ``u`` to ``nx.dispersion``; any
                falsy value falls back to the seed article.
        """
        comparison_node = comparison_node or self.entry
        return sort_dict_values(nx.dispersion(self.graph, u=comparison_node), ['node', 'dispersion'], 'dispersion')

    def get_pageranks(self):
        """Return a DataFrame of ``node``/``page_rank`` rows, highest rank first."""
        ranks = nx.algorithms.link_analysis.pagerank(self.graph)
        page_ranks = sorted(ranks.items(), key=lambda item: item[1], reverse=True)
        return pd.DataFrame(page_ranks, columns=["node", "page_rank"])

    def get_reciprocity(self):
        """Return the per-node reciprocity table produced by ``sort_dict_values``."""
        return sort_dict_values(nx.algorithms.reciprocity(self.graph, self.graph.nodes), ['node', 'reciprocity'], 'reciprocity')

    def get_adjusted_reciprocity(self):
        """Return ``reciprocity * degree`` per node, highest value first.

        The result keeps only the ``node`` and ``adjusted_reciprocity`` columns
        and carries a fresh 0..n-1 index.
        """
        r = self.get_reciprocity()
        d = self.get_degrees()

        r_d = r.merge(d, on="node", how="inner")
        r_d['adjusted_reciprocity'] = r_d.reciprocity * r_d.degree

        adjusted_reci = r_d.sort_values("adjusted_reciprocity", ascending=False)
        return adjusted_reci.reset_index(drop=True).drop(["degree", "reciprocity"], axis=1)

    def get_shortes_path(self, source=None, ascending=False):
        """Return shortest-path lengths from ``source`` to every reachable node.

        Args:
            source: Start node; any falsy value falls back to the seed article.
            ascending: Forwarded to ``sort_dict_values``; the default lists the
                farthest nodes first.
        """
        source = source or self.entry

        paths = nx.algorithms.single_source_shortest_path_length(self.graph, source)
        return sort_dict_values(paths, ["node", "shortest_path_length_from_source"], "shortest_path_length_from_source", ascending=ascending)

    # Correctly spelled alias. The misspelled name stays because existing
    # notebook cells call it.
    get_shortest_path = get_shortes_path

    def get_dominator_counts(self, source=None):
        """Count, for every node, how many nodes it immediately dominates.

        Args:
            source: Start node handed to ``immediate_dominators``; any falsy
                value falls back to the seed article.

        Nodes of the graph that never appear as an immediate dominator are
        listed with a count of 0.
        """
        source = source or self.entry

        dom_dict = nx.algorithms.dominance.immediate_dominators(self.graph, start=source)

        dom_counts = {}
        for dominator in dom_dict.values():
            dom_counts[dominator] = dom_counts.get(dominator, 0) + 1
        for node in self.graph.nodes:
            dom_counts.setdefault(node, 0)

        return sort_dict_values(dom_counts, ['node', 'immediate_dominator_count'], 'immediate_dominator_count')

    def create_ego(self, node=None):
        """Return the ego graph of ``node`` (default: the seed article), named after it."""
        node = node or self.entry

        ego = nx.ego_graph(self.graph, node)
        ego.name = node
        return ego

    def expand_network(self, group_size=10, timeout=10):
        """Query every link currently queued in ``next_links``, in batches.

        Only the links queued when the call starts are consumed; links that are
        discovered while querying are appended to ``next_links`` and wait for
        the next call. Links already in ``visited`` are skipped. A batch is
        marked visited before it is queried, so a batch that fails or times out
        is not retried.

        Args:
            group_size: Number of titles sent to ``query_articles`` per batch.
            timeout: Whole seconds allowed per batch before SIGALRM aborts it
                (only enforced where SIGALRM can be installed).
        """
        use_alarm = self._install_alarm_handler()
        num_links = len(self.next_links)
        link_group = []

        for i in range(num_links):
            link = self.next_links.pop(0)
            # Also skip titles already waiting in the current batch.
            if link not in self.visited and link not in link_group:
                link_group.append(link)

            # The flush decision is taken for every link, not only unvisited
            # ones, so a trailing partial batch is never lost when the final
            # queued link happens to be visited already.
            is_last = i == num_links - 1
            if not link_group:
                continue
            if len(link_group) < group_size and not is_last:
                continue

            print("{:.2%}".format(i/num_links))
            self.visited.update(link_group)
            try:
                if use_alarm:
                    signal.alarm(timeout)
                self.query_articles(link_group)
            except Exception as error:
                # Best effort: report the batch and move on. KeyboardInterrupt
                # and SystemExit are deliberately not caught.
                print("skipped {}: {!r}".format(link_group, error))
            finally:
                # Always disarm, so a stale alarm cannot fire outside the try.
                if use_alarm:
                    signal.alarm(0)
            link_group = []

    def update_next_links(self, articles):
        """Queue every outgoing link of ``articles`` for a later expansion."""
        for article in articles:
            self.next_links.extend(article['links'])

    def query_articles(self, titles, generate_graph=True):
        """Fetch ``titles`` via ``wiki_multi_query`` and fold them into the graph.

        Args:
            titles: Article titles to query.
            generate_graph: Accepted for backward compatibility; it is not
                read, edges are always added.
        """
        articles = wiki_multi_query(titles)

        self.update_next_links(articles)
        self.add_edges(articles)

## TESTS

In [90]:
# Seed a GraphCreator with the "Decision tree" article; the constructor
# already queries that article and adds its link edges to the graph.
gc = GraphCreator("Decision tree")

In [91]:
# Query the links queued by the seed article in batches of 2 titles, with a
# 5-second alarm per batch; each printed percentage marks one batch.
gc.expand_network(group_size=2, timeout=5)

1.14%
3.41%
5.68%
7.95%
10.23%
12.50%
14.77%
17.05%
19.32%
21.59%
23.86%
26.14%
28.41%
30.68%
32.95%
35.23%
37.50%
39.77%
42.05%
44.32%
46.59%
48.86%
51.14%
53.41%
55.68%
57.95%
60.23%
62.50%
64.77%
67.05%
69.32%
71.59%
73.86%
76.14%
78.41%
80.68%
82.95%
85.23%
87.50%
89.77%
92.05%
94.32%
96.59%
98.86%


In [92]:
# Number of nodes in the graph after the expansion above.
len(gc.graph.nodes)

19751

In [93]:
# First 25 rows of the node/degree table.
gc.get_degrees().head(25)

Unnamed: 0,node,degree
0,Algorithm,3377
1,Time,2899
2,Causality,2281
3,Probability,1866
4,Operations research,1853
5,Semantic Web,1387
6,Flowchart,1289
7,Topic map,1232
8,Utility,1221
9,Expected value,969


In [94]:
# First 25 rows of the node/eigenvector-centrality table.
gc.get_centrality().head(25)

Unnamed: 0,node,centrality
0,Topic map,0.324204
1,Flowchart,0.270651
2,Time,0.216061
3,Causality,0.196146
4,Timeline,0.162259
5,Semantic Web,0.160591
6,Graph drawing,0.128667
7,Semantic network,0.118826
8,Hypertext,0.117325
9,Mind map,0.111488


In [95]:
# First 25 rows of shortest-path lengths from the seed article; the method's
# default ascending=False lists the farthest nodes first.
gc.get_shortes_path().head(25)

Unnamed: 0,node,shortest_path_length_from_source
0,Trickle,4
1,Garrett Birkhoff,4
2,Correspondence analysis,4
3,Digital Bibliography & Library Project,4
4,Dual (math),4
5,Extension (semantics),4
6,Formal logic,4
7,Galois connection,4
8,Heterogeneous relation,4
9,Lake,4


In [96]:
# First 25 rows of dispersion scores measured against the seed article.
gc.get_dispersion().head(25)

Unnamed: 0,node,dispersion
0,Cognitive map,5.0
1,Argument map,4.978261
2,Morphological analysis (problem-solving),4.244444
3,Mind map,4.177778
4,Topic map,3.933333
5,Decision tree learning,3.666667
6,Graph drawing,3.545455
7,Mental model,3.521739
8,Hypertext,3.340909
9,Issue tree,3.295455


In [97]:
# First 25 rows of the PageRank table (sorted highest first by get_pageranks).
gc.get_pageranks().head(25)

Unnamed: 0,node,page_rank
0,Algorithm,0.048295
1,Time,0.04063
2,Probability,0.027246
3,Causality,0.026733
4,Operations research,0.021237
5,Semantic Web,0.016562
6,Expected value,0.014995
7,Topic map,0.013707
8,Flowchart,0.013081
9,Utility,0.011603


In [98]:
# First 25 rows of reciprocity multiplied by degree, highest first.
gc.get_adjusted_reciprocity().head(25)

Unnamed: 0,node,adjusted_reciprocity
0,Time,1084.0
1,Topic map,1032.0
2,Causality,962.0
3,Semantic Web,764.0
4,Flowchart,720.0
5,Design rationale,488.0
6,Utility,470.0
7,Information design,460.0
8,Timeline,408.0
9,Operations research,336.0


In [99]:
# First 25 rows of immediate-dominator counts, computed from the seed article.
gc.get_dominator_counts().head(25)

Unnamed: 0,node,immediate_dominator_count
0,Decision tree,2680
1,Causality,498
2,Operations management,482
3,Time,447
4,Semantic Web,421
5,Mental model,401
6,Markov chain,337
7,Algorithm,255
8,Operations research,236
9,Entity–relationship model,185


In [None]:
# Ego network around "Regression analysis"; undirected=True makes networkx
# follow both incoming and outgoing links when collecting neighbours.
ego = nx.ego_graph(gc.graph, "Regression analysis", undirected=True)
ego.name = "Regression analysis"
# nx.info() was removed in NetworkX 3.0, so the summary is printed by hand and
# the cell runs on old and new releases alike.
print(f"{ego.name}: {ego.number_of_nodes()} nodes, {ego.number_of_edges()} edges")

In [None]:
# Number of nodes in the ego network built in the previous cell.
len(ego.nodes)

In [None]:
# Draw the ego network with networkx's default layout and show the figure.
nx.draw(ego)
plt.show()

In [None]:
# Rank the ego network's nodes by eigenvector centrality, highest first.
sorted(nx.eigenvector_centrality(ego).items(), key=lambda pair: pair[1], reverse=True)

In [None]:
def sort_dict_values(dict, columns, sort_column, ascending=False):
    """Turn a mapping into a two-column DataFrame sorted by one of its columns.

    NOTE: defining this here shadows the ``sort_dict_values`` imported from
    ``graph_helpers`` at the top of the notebook.

    Args:
        dict: Mapping to tabulate; every ``(key, value)`` pair becomes one row.
            The name shadows the builtin and is kept only so that keyword
            callers keep working.
        columns: The two column labels, for keys and values respectively.
        sort_column: Label to sort by.
        ascending: Sort direction; the default sorts largest first.

    Returns:
        The sorted DataFrame with a fresh 0..n-1 index.
    """
    rows = list(dict.items())
    table = pd.DataFrame(rows, columns=columns).sort_values(sort_column, ascending=ascending)
    # drop=True discards the old index directly. Materialising it and then
    # dropping a column called "index" deleted real data whenever one of the
    # requested columns was itself named "index".
    return table.reset_index(drop=True)

In [None]:
# Betweenness centrality of every node in the ego network, tabulated and
# sorted highest first (sort_dict_values defaults to descending order).
regression_ego = sort_dict_values(nx.betweenness_centrality(ego), ["node", "betweenness"], "betweenness")
regression_ego
# To look up a single node instead of the whole table:
# regression_ego[regression_ego.node == "Random forest"]