In [2]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery

In [3]:
class GraphCreator:

    def __init__(self, entry):
        self.graph = nx.DiGraph()

        self.visited = {entry}
        self.next_links = []
        
        self.query_articles([entry])
        

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError

        signal.signal(signal.SIGALRM, handle_alarm)

    def add_edges(self, articles):
        for article in articles:
            self.graph.add_edges_from([(article['title'], link) for link in article['links']])
            self.graph.add_edges_from([(linkhere, article['title']) for linkhere in article['linkshere']])

    def plot_graph(self):
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        return sorted([(key, val) for key, val in dict(self.graph.degree()).items()], key=lambda x: x[1], reverse=True)

    def get_centrality(self, sort_results=False):
        if sort_results:
            return sorted([(key, val) for key, val in dict(nx.eigenvector_centrality(self.graph)).items()], key=lambda x: x[1], reverse=True)
        else:
            return nx.eigenvector_centrality(self.graph)

    def expand_network(self, group_size=10, timeout=10):
        
        num_links = len(self.next_links)
        
        link_group = []
        
        for i in range(num_links):
            link = self.next_links.pop(0)
            if not link in self.visited:
                
                link_group.append(link)
                
                if len(link_group) == group_size or (i == num_links - 1 and len(link_group) > 0):
#                     print("{:.2%}".format(i/num_links))
                    try:
                        signal.alarm(timeout)
                        self.visited.update(link_group)
                        self.query_articles(link_group)
                        signal.alarm(0)
                        link_group = []
                    except:
                        print("==SKIPPED==")
                        link_group = []
                        continue
        signal.alarm(0)
    
    def update_next_links(self, articles):
        for article in articles:
            for link in article['links']:
                self.next_links.append(link)
            
        
    def query_articles(self, titles, generate_graph=True):            
        articles = wiki_multi_query(titles)
        
        self.update_next_links(articles)
        self.add_edges(articles)


## TESTS

In [18]:
gc = GraphCreator("Random forest")
wis = WikiIntroScrapper("https://en.wikipedia.org/wiki/Random_forest")
wis.parse_intro_links();

In [19]:
wis.intro_link_titles

['Ensemble learning',
 'Statistical classification',
 'Regression analysis',
 'Decision tree learning',
 'Mode (statistics)',
 'Overfitting',
 'Test set',
 'Tin Kam Ho',
 'Random subspace method',
 'Leo Breiman',
 'Trademark',
 'Minitab',
 'Bootstrap aggregating',
 'Donald Geman']

In [20]:
centrality = gc.get_centrality()
print (wis.title, centrality[wis.title])
for title in wis.intro_link_titles:
    print(title, centrality[title])

Random forest 0.6037188521312119
Ensemble learning 0.06835910518451588
Statistical classification 0.06835910518451588
Regression analysis 0.06835910518451588
Decision tree learning 0.06835910518451588
Mode (statistics) 0.06835910518451588
Overfitting 0.06835910518451588
Test set 0.06835910518451588
Tin Kam Ho 0.06835910518451588
Random subspace method 0.06835910518451588
Leo Breiman 0.06835910518451588
Trademark 0.06835910518451588
Minitab 0.06835910518451588
Bootstrap aggregating 0.06835910518451588
Donald Geman 0.06835910518451588


In [21]:
gc.expand_network(group_size=5, timeout=20)

==SKIPPED==
==SKIPPED==
==SKIPPED==
==SKIPPED==
==SKIPPED==


In [22]:
centrality = gc.get_centrality()

rankings = []

rankings.append((wis.title, centrality[wis.title]))
for title in wis.intro_link_titles:
    rankings.append((title, centrality[title]))
    
ranking_df = pd.DataFrame(rankings, columns=["title", "centrality"])
ranking_df.sort_values("centrality", ascending=False).reset_index().drop('index', axis=1)

Unnamed: 0,title,centrality
0,Regression analysis,0.175859
1,Statistical classification,0.16398
2,Mode (statistics),0.099105
3,Decision tree learning,0.081395
4,Bootstrap aggregating,0.079864
5,Random forest,0.078881
6,Ensemble learning,0.078719
7,Overfitting,0.026219
8,Leo Breiman,0.007834
9,Test set,0.004663


In [23]:
nx.dispersion(gc.graph, u="Decision tree", v="Probability")

0.0

In [24]:
dispersion_df = pd.DataFrame([("Decision tree", node, nx.dispersion(gc.graph, "Decision tree", node)) for node in ranking_df.title], columns=["node1", "node2", "dispersion"])
dispersion_df.sort_values("dispersion")

Unnamed: 0,node1,node2,dispersion
7,Decision tree,Test set,0.0
14,Decision tree,Donald Geman,0.0
13,Decision tree,Bootstrap aggregating,0.166667
1,Decision tree,Ensemble learning,0.285714
5,Decision tree,Mode (statistics),0.5
11,Decision tree,Trademark,0.5
12,Decision tree,Minitab,0.5
8,Decision tree,Tin Kam Ho,0.666667
10,Decision tree,Leo Breiman,0.666667
3,Decision tree,Regression analysis,1.0


In [16]:
nx.degree_pearson_correlation_coefficient(gc.graph, nodes=list(ranking_df.title))

0.23983522533074186