In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query
from graph_helpers import create_dispersion_df, sort_dict_values

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery

In [113]:
class GraphCreator:
    """Grow a directed link graph outwards from a single Wikipedia article.

    Article records returned by ``wiki_multi_query`` are expected to be
    mappings with the keys ``'title'``, ``'links'`` and ``'linkshere'``
    (those are the only keys this class reads).

    Attributes:
        graph: ``nx.DiGraph`` whose edges point from a linking article to the
            article it links to.
        entry: title of the seed article.
        intro_nodes: ``intro_link_titles`` reported by ``WikiIntroScrapper``
            for the seed article.
        visited: set of titles that were already sent to ``query_articles``
            (or are queued in the group currently being built).
        next_links: FIFO list of outgoing link titles still to be expanded.
    """

    def __init__(self, entry):
        """Scrape the intro links of ``entry`` and query the entry article.

        Side effect: installs a process-wide ``SIGALRM`` handler that raises
        ``RuntimeError``; ``expand_network`` relies on it for its timeout.
        """
        self.graph = nx.DiGraph()
        self.entry = entry

        wis = WikiIntroScrapper(f"https://en.wikipedia.org/wiki/{entry}")
        wis.parse_intro_links()
        self.intro_nodes = wis.intro_link_titles

        self.visited = {entry}
        self.next_links = []

        self.query_articles([entry])

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError("Wikipedia query timed out")

        signal.signal(signal.SIGALRM, handle_alarm)

    def add_edges(self, articles):
        """Add ``title -> link`` and ``linkhere -> title`` edges for each article."""
        for article in articles:
            title = article['title']
            self.graph.add_edges_from((title, link) for link in article['links'])
            self.graph.add_edges_from((linkhere, title) for linkhere in article['linkshere'])

    def plot_graph(self):
        """Draw the whole graph with networkx's default layout."""
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        """Return the node degrees via ``sort_dict_values`` (columns node/degrees)."""
        return sort_dict_values(dict(self.graph.degree()), ["node", "degrees"], "degrees")

    def get_centrality(self):
        """Return eigenvector centrality via ``sort_dict_values`` (columns node/centrality)."""
        return sort_dict_values(nx.eigenvector_centrality(self.graph), ["node", "centrality"], "centrality")

    def get_dispersion(self, nodes=None, comparison_node=None):
        """Return ``create_dispersion_df`` for this instance's graph.

        Args:
            nodes: nodes to evaluate; a falsy value selects every graph node.
            comparison_node: reference node; a falsy value selects ``self.entry``.
        """
        if not nodes:
            nodes = self.graph.nodes
        if not comparison_node:
            comparison_node = self.entry

        # Use this instance's graph. The previous code read a notebook-level
        # global, which failed (or analysed the wrong graph) for any instance
        # not bound to that exact name.
        return create_dispersion_df(self.graph, comparison_node, nodes)

    def get_pageranks(self):
        """Return a DataFrame (columns node/page_rank) sorted by descending PageRank."""
        ranks = nx.algorithms.link_analysis.pagerank(self.graph)
        page_ranks = sorted(ranks.items(), key=lambda pair: pair[1], reverse=True)
        return pd.DataFrame(page_ranks, columns=["node", "page_rank"])

    def get_reciprocity(self, nodes=None):
        """Return per-node reciprocity via ``sort_dict_values``.

        Args:
            nodes: nodes to evaluate; a falsy value selects every graph node.
        """
        if not nodes:
            nodes = self.graph.nodes  # all the nodes
        return sort_dict_values(nx.algorithms.reciprocity(self.graph, nodes), ['node', 'reciprocity'], 'reciprocity')

    def create_ego(self, node=None):
        """Return ``nx.ego_graph`` around ``node`` (default: the entry article).

        The returned graph's ``name`` is set to the centre node.
        """
        if not node:
            node = self.entry

        ego = nx.ego_graph(self.graph, node)
        ego.name = node
        return ego

    def _query_with_timeout(self, titles, timeout):
        """Query one group of titles under a ``timeout``-second alarm.

        Best effort: a timeout or any other ``Exception`` skips the group
        (reported as a ``RuntimeWarning``) instead of aborting the crawl.
        Returns True when the group was queried successfully.
        """
        try:
            signal.alarm(timeout)
            self.query_articles(titles)
            return True
        except Exception as exc:  # not bare: Ctrl-C must still interrupt the crawl
            warnings.warn(
                f"Skipped {len(titles)} article(s) after {exc!r}", RuntimeWarning
            )
            return False
        finally:
            # Always cancel the alarm so it cannot fire outside this guard.
            signal.alarm(0)

    def expand_network(self, group_size=10, timeout=10):
        """Query every link that is queued at the time of the call.

        Unvisited links are sent to ``query_articles`` in groups of
        ``group_size``; a trailing partial group is queried as well. Links
        discovered while expanding are appended to ``next_links`` and are left
        for the next call. Progress is printed before each group.

        Args:
            group_size: number of titles per query group.
            timeout: seconds allowed per group before it is skipped.
        """
        num_links = len(self.next_links)
        link_group = []
        consumed = 0

        try:
            for i in range(num_links):
                # New links are only appended at the end, so indices below
                # num_links stay stable while we iterate.
                link = self.next_links[i]
                consumed = i + 1

                if link in self.visited:
                    continue

                # Mark immediately so a title queued twice is queried once.
                self.visited.add(link)
                link_group.append(link)

                if len(link_group) >= group_size:
                    print("{:.2%}".format(i / num_links))
                    self._query_with_timeout(link_group, timeout)
                    link_group = []

            # Flush the remainder even when the last queued link was already
            # visited (previously such a partial group was silently dropped).
            if link_group:
                print("{:.2%}".format((num_links - 1) / num_links))
                self._query_with_timeout(link_group, timeout)
        finally:
            # Drop the processed prefix in one step instead of pop(0) per link.
            del self.next_links[:consumed]

    def update_next_links(self, articles):
        """Queue every outgoing link of ``articles`` for a later expansion."""
        for article in articles:
            self.next_links.extend(article['links'])

    def query_articles(self, titles, generate_graph=True):
        """Fetch ``titles`` with ``wiki_multi_query``; queue their links and add their edges.

        ``generate_graph`` is accepted for interface compatibility and is
        currently not used.
        """
        articles = wiki_multi_query(titles)

        self.update_next_links(articles)
        self.add_edges(articles)


## TESTS

In [114]:
# Seed the graph with the "Overfitting" article; the constructor already
# queries that article and queues its outgoing links.
gc = GraphCreator("Overfitting")

In [107]:
# Expand one level: query the queued links two titles at a time, allowing
# 5 seconds per group. The percentages printed below are crawl progress.
gc.expand_network(group_size=2, timeout=5)

1.61%
4.84%
8.06%
11.29%
14.52%
17.74%
20.97%
24.19%
27.42%
30.65%
33.87%
37.10%
40.32%
43.55%
46.77%
50.00%
53.23%
56.45%
59.68%
62.90%
66.13%
69.35%
72.58%
75.81%
79.03%
82.26%
85.48%
88.71%
91.94%
95.16%
98.39%


In [108]:
# First ten rows of the degree table.
gc.get_degrees().head(10)

Unnamed: 0,node,degrees
0,HarperCollins,6951
1,Algorithm,3376
2,Machine learning,2610
3,Stony Brook University,1854
4,Occam's razor,1420
5,Linear regression,1293
6,Logistic regression,1222
7,Statistical model,1152
8,Polynomial,1112
9,Statistical inference,1037


In [115]:
# With no arguments, dispersion is computed for every graph node against
# the entry article.
gc.get_dispersion().head(10)

Unnamed: 0,entry,node,dispersion
0,Overfitting,Overfitting,30.5
1,Overfitting,Algorithm,0.0
2,Overfitting,American Journal of Epidemiology,0.0
3,Overfitting,Andrew Gelman,0.0
4,Overfitting,Bias–variance tradeoff,0.0
5,Overfitting,BioData Mining,0.0
6,Overfitting,Brian Christian,0.0
7,Overfitting,Cambridge University Press,0.0
8,Overfitting,Causal relation,0.0
9,Overfitting,Coefficient of determination,0.0


In [110]:
# Ten highest PageRank scores (get_pageranks sorts in descending order).
gc.get_pageranks().head(10)

Unnamed: 0,node,page_rank
0,HarperCollins,0.120678
1,Algorithm,0.049821
2,Stony Brook University,0.030973
3,Machine learning,0.030712
4,Polynomial,0.017745
5,Occam's razor,0.017272
6,Oxford Dictionaries,0.008957
7,Statistical model,0.008158
8,Linear regression,0.007719
9,Logistic regression,0.006475


In [111]:
# Reciprocity for every node, then keep only the row of the entry article.
reciprocity_df = gc.get_reciprocity(nodes=gc.graph.nodes)
reciprocity_df[reciprocity_df.node == "Overfitting"]

Unnamed: 0,node,reciprocity
1720,Overfitting,0.182796


In [None]:
# Ego network around "Regression analysis"; undirected=True is passed here,
# unlike GraphCreator.create_ego, which uses the default.
# NOTE(review): nx.info appears to have been removed in networkx 3.0 —
# confirm the installed version still provides it.
ego = nx.ego_graph(gc.graph, "Regression analysis", undirected=True)
ego.name = "Regression analysis"
print(nx.info(ego))

In [None]:
# Number of nodes in the ego network.
len(ego.nodes)

In [None]:
# Draw the ego network with networkx's default layout.
nx.draw(ego)
plt.show()

In [None]:
# (node, eigenvector centrality) pairs of the ego network, highest first.
sorted(
    nx.eigenvector_centrality(ego).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
def sort_dict_values(dict, columns, sort_column, ascending=False):
    """Turn a mapping into a two-column DataFrame sorted by one of its columns.

    Args:
        dict: mapping whose ``(key, value)`` pairs become the rows. The name
            shadows the builtin; it is kept so keyword callers keep working.
        columns: the two column labels, for the key and the value.
        sort_column: label of the column to sort by.
        ascending: sort direction; defaults to descending.

    Returns:
        The sorted DataFrame with a fresh 0..n-1 index.
    """
    rows = list(dict.items())
    frame = pd.DataFrame(rows, columns=columns)
    # drop=True discards the old index directly, so a column that is itself
    # called "index" no longer collides with the re-inserted index.
    return frame.sort_values(sort_column, ascending=ascending).reset_index(drop=True)

In [None]:
# Betweenness centrality of every node in the ego network, as a sorted table.
regression_ego = sort_dict_values(nx.betweenness_centrality(ego), ["node", "betweenness"], "betweenness")
regression_ego
# regression_ego[regression_ego.node == "Random forest"]