In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query
from graph_helpers import create_dispersion_df, sort_dict_values

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery

In [186]:
class GraphCreator:
    """Build and analyse a directed link graph around one Wikipedia article.

    The graph starts from ``entry`` and is grown breadth-first: every call to
    :meth:`expand_network` queries the links collected by the previous round.
    Article records are expected to be mappings with the keys ``'title'``,
    ``'links'`` and ``'linkshere'`` (that is how this class reads the result
    of ``wiki_multi_query``).

    Attributes:
        graph: ``nx.DiGraph`` whose edges point from an article to the
            articles it links to.
        entry: Title of the seed article.
        intro_nodes: ``intro_link_titles`` taken from ``WikiIntroScrapper``
            after parsing the entry article's intro links.
        visited: Set of titles that have already been sent to a query.
        next_links: List of titles queued for the next expansion round.

    Note:
        Timeouts rely on ``signal.SIGALRM``; the handler is installed by
        ``__init__`` and replaces any handler registered before.
    """

    def __init__(self, entry):
        """Seed the graph with ``entry`` and install the timeout handler.

        Args:
            entry: Title of the Wikipedia article to start from.
        """
        self.graph = nx.DiGraph()
        self.entry = entry

        wis = WikiIntroScrapper(f"https://en.wikipedia.org/wiki/{entry}")
        wis.parse_intro_links()
        self.intro_nodes = wis.intro_link_titles

        self.visited = {entry}
        self.next_links = []

        self.query_articles([entry])

        # Used by expand_network to abort queries that take too long.
        signal.signal(signal.SIGALRM, self._handle_alarm)

    @staticmethod
    def _handle_alarm(signum, frame):
        """SIGALRM handler: abort the running query by raising RuntimeError."""
        raise RuntimeError("Wikipedia query timed out")

    def add_edges(self, articles):
        """Add the outgoing and incoming links of each article to the graph.

        Args:
            articles: Iterable of article records with ``'title'``,
                ``'links'`` (outgoing) and ``'linkshere'`` (incoming) entries.
        """
        for article in articles:
            title = article['title']
            self.graph.add_edges_from((title, link) for link in article['links'])
            self.graph.add_edges_from(
                (linkhere, title) for linkhere in article['linkshere'])

    def plot_graph(self):
        """Draw the whole graph with networkx's default layout."""
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        """Return the nodes with their degree, sorted via ``sort_dict_values``."""
        return sort_dict_values(dict(self.graph.degree()), ["node", "degree"], "degree")

    def get_centrality(self):
        """Return the nodes with their eigenvector centrality, sorted."""
        return sort_dict_values(
            nx.eigenvector_centrality(self.graph), ["node", "centrality"], "centrality")

    def get_dispersion(self, comparison_node=None, max_nodes=25_000):
        """Return the dispersion of ``comparison_node`` towards other nodes.

        Args:
            comparison_node: Node to compute dispersion for; defaults to the
                entry article.
            max_nodes: If the graph has more nodes than this, the calculation
                runs on the entry node's ego graph instead of the full graph.
                ``None`` always uses the full graph.

        Returns:
            The result of ``sort_dict_values`` with the columns ``node`` and
            ``dispersion``.

        NOTE(review): the ego graph is always built around ``self.entry``,
        even when ``comparison_node`` is a different node - confirm that this
        is intended.
        """
        if not comparison_node:
            comparison_node = self.entry

        if max_nodes is None or len(self.graph.nodes) <= max_nodes:
            print("FULL")
            target = self.graph
        else:
            print("EGO")
            # if the network is too large, perform calculation on ego graph of entry node
            target = self.create_ego()

        return sort_dict_values(
            nx.dispersion(target, u=comparison_node), ['node', 'dispersion'], 'dispersion')

    def get_pageranks(self):
        """Return a DataFrame of nodes and PageRank scores, highest first."""
        scores = nx.algorithms.link_analysis.pagerank(self.graph)
        page_ranks = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return pd.DataFrame(page_ranks, columns=["node", "page_rank"])

    def get_reciprocity(self):
        """Return the per-node reciprocity, sorted via ``sort_dict_values``."""
        return sort_dict_values(
            nx.algorithms.reciprocity(self.graph, self.graph.nodes),
            ['node', 'reciprocity'], 'reciprocity')

    def get_adjusted_reciprocity(self):
        """Return reciprocity weighted by degree, highest first.

        The adjusted value is ``reciprocity * degree`` for every node that
        appears in both tables; only ``node`` and ``adjusted_reciprocity``
        are kept in the result.
        """
        r = self.get_reciprocity()
        d = self.get_degrees()

        r_d = r.merge(d, on="node", how="inner")
        r_d['adjusted_reciprocity'] = r_d.reciprocity * r_d.degree

        adjusted_reci = r_d.sort_values("adjusted_reciprocity", ascending=False)
        return adjusted_reci.reset_index(drop=True).drop(columns=["degree", "reciprocity"])

    def get_shortest_path(self, source=None, ascending=False):
        """Return shortest-path lengths from ``source`` to every reachable node.

        Args:
            source: Start node; defaults to the entry article.
            ascending: Sort order passed on to ``sort_dict_values``.
        """
        if not source:
            source = self.entry

        paths = nx.algorithms.single_source_shortest_path_length(self.graph, source)
        return sort_dict_values(
            paths, ["node", "shortest_path_length_from_source"],
            "shortest_path_length_from_source", ascending=ascending)

    # Original (misspelled) name, kept so existing callers keep working.
    get_shortes_path = get_shortest_path

    def get_dominator_counts(self, source=None):
        """Count, per node, how many nodes it is the immediate dominator of.

        Args:
            source: Start node of the dominator computation; defaults to the
                entry article.

        Returns:
            The result of ``sort_dict_values`` with the columns ``node`` and
            ``immediate_dominator_count``; nodes that dominate nothing are
            listed with a count of 0.
        """
        if not source:
            source = self.entry

        dom_dict = nx.algorithms.dominance.immediate_dominators(self.graph, start=source)

        dom_counts = {}
        for dominator in dom_dict.values():
            dom_counts[dominator] = dom_counts.get(dominator, 0) + 1
        for node in self.graph.nodes:
            dom_counts.setdefault(node, 0)

        return sort_dict_values(
            dom_counts, ['node', 'immediate_dominator_count'], 'immediate_dominator_count')

    def create_ego(self, node=None):
        """Return the ego graph of ``node`` (default: the entry article).

        The returned graph's ``name`` is set to the node it is centred on.
        """
        if not node:
            node = self.entry

        ego = nx.ego_graph(self.graph, node)
        ego.name = node
        return ego

    def expand_network(self, group_size=10, timeout=10):
        """Query every currently queued link and add the results to the graph.

        Only the links queued when the call starts are processed; links
        discovered while expanding stay in ``next_links`` for the next call.
        Unvisited links are sent in batches of ``group_size``. A batch that
        times out or fails is skipped (best-effort crawl) and its links stay
        marked as visited.

        Args:
            group_size: Number of titles per query.
            timeout: Whole seconds allowed per query (``signal.alarm``);
                ``0`` disables the timeout.
        """
        num_links = len(self.next_links)
        link_group = []
        consumed = 0

        try:
            for i in range(num_links):
                # Queries only append to next_links, so the first num_links
                # positions stay stable and can be read by index.
                link = self.next_links[i]
                consumed = i + 1

                if link not in self.visited:
                    # Mark at queue time so a title that occurs twice in the
                    # queue is not put into the same batch twice.
                    self.visited.add(link)
                    link_group.append(link)

                # The flush check runs for every link, so a trailing partial
                # batch is still sent when the last link was already visited.
                is_last = i == num_links - 1
                if link_group and (len(link_group) == group_size or is_last):
                    print("{:.2%}".format(i / num_links))
                    self._query_with_timeout(link_group, timeout)
                    link_group = []
        finally:
            # Drop what was consumed in one step (instead of pop(0) per link)
            # and make sure no alarm outlives this call.
            del self.next_links[:consumed]
            signal.alarm(0)

    def _query_with_timeout(self, titles, timeout):
        """Run ``query_articles(titles)`` under a SIGALRM timeout, best-effort."""
        signal.alarm(timeout)
        try:
            self.query_articles(titles)
        except Exception:
            # Timeouts (RuntimeError from the alarm handler) and failed
            # requests only skip this batch. KeyboardInterrupt is not an
            # Exception, so the crawl can still be interrupted.
            pass
        finally:
            # Always cancel, so a pending alarm cannot fire outside the try.
            signal.alarm(0)

    def update_next_links(self, articles):
        """Queue the outgoing links of ``articles`` for the next expansion."""
        for article in articles:
            self.next_links.extend(article['links'])

    def query_articles(self, titles, generate_graph=True):
        """Fetch ``titles`` and merge the result into the queue and the graph.

        Args:
            titles: List of article titles passed to ``wiki_multi_query``.
            generate_graph: Currently unused; kept for backward compatibility.
        """
        articles = wiki_multi_query(titles)

        self.update_next_links(articles)
        self.add_edges(articles)

## TESTS

In [192]:
# Seed the link graph with the "Linear regression" article (the constructor queries it).
gc = GraphCreator("Linear regression")

In [193]:
# Number of outgoing links queued for the next expansion round.
len(gc.next_links)

489

In [194]:
# Query the queued links two at a time, allowing 5 seconds per query.
# Each printed percentage is the queue position at which a batch was sent.
gc.expand_network(group_size=2, timeout=5)

0.20%
0.61%
1.02%
1.43%
1.84%
2.25%
2.66%
3.07%
3.48%
3.89%
4.29%
4.70%
5.11%
5.52%
5.93%
6.34%
6.75%
7.16%
7.57%
7.98%
8.38%
8.79%
9.20%
9.61%
10.02%
10.43%
10.84%
11.25%
11.66%
12.07%
12.47%
12.88%
13.29%
13.70%
14.11%
14.52%
14.93%
15.34%
15.75%
16.16%
16.56%
16.97%
17.38%
17.79%
18.20%
18.61%
19.02%
19.43%
19.84%
20.25%
20.65%
21.06%
21.47%
21.88%
22.29%
22.70%
23.11%
23.52%
23.93%
24.34%
24.74%
25.15%
25.56%
25.97%
26.38%
26.79%
27.20%
27.61%
28.02%
28.43%
28.83%
29.24%
29.65%
30.06%
30.47%
30.88%
31.29%
31.70%
32.11%
32.52%
32.92%
33.33%
33.74%
34.15%
34.56%
34.97%
35.38%
35.79%
36.20%
36.61%
37.01%
37.42%
37.83%
38.24%
38.65%
39.06%
39.47%
39.88%
40.29%
40.70%
41.10%
41.51%
41.92%
42.33%
42.74%
43.15%
43.56%
43.97%
44.38%
44.79%
45.19%
45.60%
46.01%
46.42%
46.83%
47.24%
47.65%
48.06%
48.47%
48.88%
49.28%
49.69%
50.10%
50.51%
50.92%
51.33%
51.74%
52.15%
52.56%
52.97%
53.37%
53.78%
54.19%
54.60%
55.01%
55.42%
55.83%
56.24%
56.65%
57.06%
57.46%
57.87%
58.28%
58.69%
59.10%
59.51%
59

In [195]:
# Dispersion for the entry node; prints "EGO" when the graph has more than
# max_nodes nodes and the ego-graph branch is used.
gc.get_dispersion(max_nodes=25_000)

EGO


Unnamed: 0,node,dispersion
0,Polynomial regression,27.698895
1,Least squares,27.400000
2,Regression analysis,26.691689
3,Nonlinear regression,25.674221
4,Logistic regression,25.563889
5,Bayesian linear regression,24.808451
6,Analysis of variance,24.224189
7,Robust regression,23.688385
8,List of statistics articles,23.623342
9,Structural equation modeling,23.294294


In [117]:
# Total number of nodes currently in the graph.
len(gc.graph.nodes)

69421

In [118]:
# First 25 rows of the degree table.
gc.get_degrees().head(25)

Unnamed: 0,node,degree
0,Mathematical Reviews,7626
1,Clinical trial,6917
2,Artificial intelligence,6717
3,Charles Darwin,5324
4,Statistics,4941
5,Epidemiology,3371
6,List of statistics articles,3264
7,Bioinformatics,2809
8,Machine learning,2610
9,Cartography,2376


In [119]:
# First 25 rows of the eigenvector-centrality table.
gc.get_centrality().head(25)

Unnamed: 0,node,centrality
0,Statistics,0.067949
1,Regression analysis,0.062837
2,Probability distribution,0.062091
3,Variance,0.061433
4,Bayesian linear regression,0.060931
5,Linear regression,0.060685
6,Logistic regression,0.060681
7,Statistical model,0.060623
8,Ordinary least squares,0.06039
9,Econometrics,0.059945


In [120]:
# Shortest-path lengths from the entry node; the default ascending=False
# puts the longest lengths first.
gc.get_shortes_path().head(25)

Unnamed: 0,node,shortest_path_length_from_source
0,The Stationery Office,4
1,Differential diagnosis,4
2,Disability-adjusted life year,4
3,Disease (disambiguation),4
4,Disease burden,4
5,Disease mongering,4
6,Diseases of affluence,4
7,Diseases of poverty,4
8,Disseminated disease,4
9,Distress (medicine),4


In [161]:
# Dispersion and degree tables for the current graph.
dis = gc.get_dispersion()
deg = gc.get_degrees()

# Keep only nodes present in both tables, then list them alphabetically by node.
merged = deg.merge(dis, on="node", how="inner")
merged.sort_values("node")


Unnamed: 0,node,degree,dispersion
0,Algorithm,3377,0.833333
33,Argument map,277,4.978261
34,Association rule learning,272,2.142857
75,"Behavior tree (artificial intelligence, roboti...",25,0.000000
32,Boosting (machine learning),285,1.333333
58,Business decision mapping,94,2.880952
86,Causal,1,0.000000
59,Causal model,89,1.600000
83,CiteSeerX,13,0.000000
78,Cladistics,21,0.000000


In [159]:
# Ego graph of the entry node (create_ego defaults to gc.entry).
ego = gc.create_ego()
# Raw dispersion dict for "Decision tree" inside that ego graph.
nx.dispersion(ego, "Decision tree")

{'Algorithm': 0.8333333333333334,
 'Argument map': 4.978260869565218,
 'Association rule learning': 2.142857142857143,
 'Behavior tree (artificial intelligence, robotics and control)': 0.0,
 'Boosting (machine learning)': 1.3333333333333333,
 'Business decision mapping': 2.880952380952381,
 'Causal': 0.0,
 'Causal model': 1.6,
 'CiteSeerX': 0.0,
 'Cladistics': 0.0,
 'Cognitive map': 5.0,
 'Concept lattice': 0.0,
 'Concept map': 3.1627906976744184,
 'Conceptual graph': 3.2954545454545454,
 'Conditional probability': 0.25,
 'DRAKON': 0.0,
 'Data visualization': 0.4444444444444444,
 'Decision analysis': 2.4444444444444446,
 'Decision cycle': 0.0,
 'Decision list': 0.0,
 'Decision support system': 1.5,
 'Decision table': 0.0,
 'Decision tree learning': 3.6666666666666665,
 'Decision tree model': 0.0,
 'Dendrogram': 1.6,
 'Design rationale': 3.1136363636363638,
 'Diagrammatic reasoning': 0.7027027027027027,
 'Digital object identifier': 0.0,
 'Diminishing returns': 0.6666666666666666,
 'Ent

In [122]:
# Top 25 nodes by PageRank (get_pageranks sorts highest first).
gc.get_pageranks().head(25)

Unnamed: 0,node,page_rank
0,Mathematical Reviews,0.029393
1,Charles Darwin,0.025593
2,Artificial intelligence,0.022372
3,Clinical trial,0.021576
4,Statistics,0.013144
5,Disease,0.008233
6,Epidemiology,0.007922
7,Bioinformatics,0.007318
8,Cartography,0.006375
9,Machine learning,0.00607


In [123]:
# Top 25 nodes by reciprocity * degree.
gc.get_adjusted_reciprocity().head(25)

Unnamed: 0,node,adjusted_reciprocity
0,Artificial intelligence,1420.0
1,Charles Darwin,1230.0
2,Reliability engineering,1070.0
3,Epidemiology,1062.0
4,Regression analysis,990.0
5,National accounts,956.0
6,Probabilistic design,926.0
7,Probability distribution,920.0
8,Randomized controlled trial,902.0
9,Multivariate normal distribution,884.0


In [124]:
# First 25 rows of the immediate-dominator counts, computed from the entry node.
gc.get_dominator_counts().head(25)

Unnamed: 0,node,immediate_dominator_count
0,Linear regression,6994
1,Artificial intelligence,1140
2,List of statistics articles,954
3,Charles Darwin,699
4,Tobacco smoking,630
5,Carl Friedrich Gauss,411
6,Bioinformatics,345
7,Paul Krugman,341
8,Reliability engineering,317
9,Disease,280


In [None]:
# Ego graph around "Regression analysis", built with undirected=True.
ego = nx.ego_graph(gc.graph, "Regression analysis", undirected=True)
ego.name = "Regression analysis"
# NOTE(review): nx.info is believed to be removed in NetworkX 3.x - confirm
# against the installed version before re-running this cell.
print(nx.info(ego))

In [None]:
# Number of nodes in the ego graph.
len(ego.nodes)

In [None]:
# Draw the ego graph with networkx's default layout.
nx.draw(ego)
plt.show()

In [None]:
# (node, eigenvector centrality) pairs of the ego graph, highest centrality first.
sorted([(key, value) for key, value in nx.eigenvector_centrality(ego).items()], key=lambda x: x[1], reverse=True)

In [None]:
def sort_dict_values(dict, columns, sort_column, ascending=False):
    """Turn a mapping into a two-column DataFrame sorted by one column.

    Args:
        dict: Mapping whose items become the rows (key, value).
        columns: Names for the key column and the value column.
        sort_column: Column name to sort by.
        ascending: Sort direction; descending by default.

    Returns:
        A DataFrame sorted by ``sort_column`` with a fresh 0..n-1 index.
    """
    rows = list(dict.items())
    frame = pd.DataFrame(rows, columns=columns)
    ordered = frame.sort_values(sort_column, ascending=ascending)
    # reset_index() keeps the old index as an "index" column; drop it again.
    renumbered = ordered.reset_index()
    return renumbered.drop(columns=["index"])

In [None]:
# Betweenness centrality of every node in the ego graph, highest first.
regression_ego = sort_dict_values(nx.betweenness_centrality(ego), ["node", "betweenness"], "betweenness")
regression_ego
# Optional lookup of a single node's row (left disabled):
# regression_ego[regression_ego.node == "Random forest"]