In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pdb
import requests
import re

import numpy as np
import pandas as pd
from functools import reduce
from collections import Counter

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query
from graph_helpers import create_dispersion_df, sort_dict_values, format_categories, compare_categories, rank_order

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery
%aimport graph_helpers

In [55]:
class GraphCreator:
    """Build and analyse a directed link graph of Wikipedia articles around one entry article.

    Attributes:
        graph: ``nx.DiGraph`` whose nodes are article titles and whose edges are links.
        entry: title of the article the graph was seeded with.
        intro_nodes: ``intro_link_titles`` reported by ``WikiIntroScrapper`` for the entry.
        visited: titles that have already been queried or scheduled for a query.
        next_links: queue of outgoing link titles waiting for the next ``expand_network`` call.
        categories: title -> value returned by ``format_categories`` for that article.
        redirect_targets: titles of articles that have at least one redirect pointing at them.
        redirect_sources: redirect title -> index into ``redirect_targets``.
    """

    # Category names matching this pattern are dropped before the categories are stored.
    # presumably these are Wikipedia maintenance categories -- verify against real responses.
    _IGNORED_CATEGORY = re.compile(r"(articles)|(uses)|(commons)|(category\:use)", re.I)

    def __init__(self, entry):
        """Scrape the entry's intro links and run the first query for ``entry``.

        Args:
            entry: article title, interpolated as-is into the en.wikipedia.org URL.
        """
        self.graph = nx.DiGraph()
        self.entry = entry

        wis = WikiIntroScrapper(f"https://en.wikipedia.org/wiki/{entry}")
        wis.parse_intro_links()
        self.intro_nodes = wis.intro_link_titles

        self.visited = {entry}
        self.next_links = []
        self.categories = {}
        self.redirect_targets = []
        self.redirect_sources = {}

        self.query_articles([entry])

        # expand_network relies on SIGALRM raising RuntimeError to time out slow queries.
        self._install_alarm_handler()

    @staticmethod
    def _install_alarm_handler():
        """Make SIGALRM raise RuntimeError; does nothing where SIGALRM does not exist."""
        if not hasattr(signal, "SIGALRM"):
            return

        def handle_alarm(signum, frame):
            raise RuntimeError

        signal.signal(signal.SIGALRM, handle_alarm)

    @staticmethod
    def _set_alarm(seconds):
        """Arm (or, with 0, cancel) the alarm timer when the platform provides one."""
        if hasattr(signal, "alarm"):
            signal.alarm(seconds)

    def add_edges(self, articles):
        """Record categories and link edges for every article dict in ``articles``.

        Each article must have a 'title'. 'categories', 'links' and 'linkshere' are
        treated as empty when missing (assumes the query helper may omit them, as it
        does for 'redirects' -- TODO confirm against wiki_multi_query).
        """
        for article in articles:
            title = article['title']

            kept_categories = [
                # maxsplit + [-1] keeps the name intact even without a "Category:" prefix
                category.split("Category:", 1)[-1]
                for category in (article.get('categories') or [])
                if not self._IGNORED_CATEGORY.search(category)
            ]
            self.categories[title] = format_categories(kept_categories)

            self.graph.add_edges_from(
                (title, link) for link in (article.get('links') or []))
            self.graph.add_edges_from(
                (linkhere, title) for linkhere in (article.get('linkshere') or []))

    def update_edge_weights(self):
        """Set the 'weight' attribute of every edge to ``compare_categories(source, target)``.

        A single pass is enough: in a DiGraph ``out_edges`` and ``in_edges`` enumerate
        the same edge set, so the former second loop only recomputed identical weights.
        """
        for source, target in list(self.graph.edges):
            weight = compare_categories(source, target, self.categories)
            self.graph.add_edge(source, target, weight=weight)

    def get_edge_weights(self):
        """Return a DataFrame of (source_node, target_node, edge_weight), heaviest first.

        Edges that have no weight yet (``update_edge_weights`` not run) get a missing
        value instead of raising ``KeyError``.
        """
        edge_weights = list(self.graph.edges(data="weight"))
        frame = pd.DataFrame(edge_weights, columns=["source_node", "target_node", "edge_weight"])
        return frame.sort_values("edge_weight", ascending=False).reset_index(drop=True)

    def plot_graph(self):
        """Draw the whole graph with the default networkx layout."""
        nx.draw(self.graph)
        plt.show()

    def get_shared_categories_with_source(self):
        """Return ``compare_categories(entry, node, ..., starting_count=0)`` for every node."""
        cat_matches = {
            node: compare_categories(self.entry, node, self.categories, starting_count=0)
            for node in self.graph.nodes
        }
        return sort_dict_values(cat_matches, ['node', 'category_matches_with_source'],
                                'category_matches_with_source', ascending=False)

    def get_degrees(self):
        """Return the total degree of every node as a node/degree table."""
        return sort_dict_values(dict(self.graph.degree()), ["node", "degree"], "degree",)

    def get_edges(self):
        """Return in/out edge counts for nodes that have at least one of each.

        Nodes missing from either count table are dropped by the default (inner) merge.
        """
        in_edges = sort_dict_values(dict(Counter(target for _, target in self.graph.in_edges())),
                                    ['node', 'in_edges'], "in_edges")
        out_edges = sort_dict_values(dict(Counter(source for source, _ in self.graph.out_edges())),
                                     ["node", 'out_edges'], 'out_edges')

        return in_edges.merge(out_edges, on="node")

    def get_centrality(self):
        """Return the weighted eigenvector centrality of every node."""
        return sort_dict_values(nx.eigenvector_centrality(self.graph, weight="weight"),
                                ["node", "centrality"], "centrality")

    def get_dispersion(self, comparison_node=None, max_nodes=25_000):
        """Return dispersion of every node relative to ``comparison_node``.

        Args:
            comparison_node: reference node; defaults to the entry article.
            max_nodes: above this node count the calculation runs on the entry's ego
                graph instead of the full graph. ``None`` always uses the full graph.
        """
        if not comparison_node:
            comparison_node = self.entry

        if max_nodes is None or len(self.graph.nodes) <= max_nodes:
            print("FULL")
            return sort_dict_values(nx.dispersion(self.graph, u=comparison_node), ['node', 'dispersion'], 'dispersion')
        else:
            print("EGO")
            # if the network is too large, perform calculation on ego graph of entry node
            ego = self.create_ego()
            return sort_dict_values(nx.dispersion(ego, u=comparison_node), ['node', 'dispersion'], 'dispersion')

    def get_pageranks(self):
        """Return the weighted PageRank of every node, highest first."""
        scores = nx.pagerank(self.graph, weight='weight')
        page_ranks = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return pd.DataFrame(page_ranks, columns=["node", "page_rank"])

    def get_reciprocity(self):
        """Return the per-node reciprocity of the graph."""
        return sort_dict_values(nx.reciprocity(self.graph, self.graph.nodes),
                                ['node', 'reciprocity'], 'reciprocity')

    def get_adjusted_reciprocity(self):
        """Return reciprocity multiplied by degree for every node, highest first."""
        r_d = self.get_reciprocity().merge(self.get_degrees(), on="node", how="inner")
        r_d['adjusted_reciprocity'] = r_d.reciprocity * r_d.degree

        adjusted_reci = r_d.sort_values("adjusted_reciprocity", ascending=False)
        return adjusted_reci.reset_index(drop=True).drop(["degree", "reciprocity"], axis=1)

    def get_shortes_path(self, source=None, ascending=False):
        """Return shortest-path lengths from ``source`` (default: the entry) to reachable nodes."""
        if not source:
            source = self.entry

        paths = nx.single_source_shortest_path_length(self.graph, source)
        return sort_dict_values(paths, ["node", "shortest_path_length_from_source"],
                                "shortest_path_length_from_source", ascending=ascending)

    # Correctly spelled alias; the original misspelled name stays for existing callers.
    get_shortest_path = get_shortes_path

    def get_dominator_counts(self, source=None):
        """Count, for every node, how many nodes have it as immediate dominator.

        Dominators are computed from ``source`` (default: the entry). Nodes that
        dominate nothing are listed with a count of 0.
        """
        if not source:
            source = self.entry

        dom_dict = nx.immediate_dominators(self.graph, start=source)

        dom_counts = dict(Counter(dom_dict.values()))
        for node in self.graph.nodes:
            dom_counts.setdefault(node, 0)

        return sort_dict_values(dom_counts, ['node', 'immediate_dominator_count'], 'immediate_dominator_count')

    def get_hits(self):
        """Return HITS authority and hub scores for every node, merged on node."""
        hubs, authorities = nx.hits(self.graph, max_iter=1000)
        return (sort_dict_values(authorities, ['node', 'hits_authority'], 'hits_authority')
                .merge(sort_dict_values(hubs, ['node', 'hits_hub'], 'hits_hub'), on="node"))

    def get_features_df(self):
        """Outer-merge all per-node feature tables into a single DataFrame keyed on 'node'."""
        # NOTE(review): centrality and dispersion are ranked with ascending=True while
        # degree/page_rank/adjusted_reciprocity use ascending=False -- confirm intended.
        dfs = [
            rank_order(self.get_degrees(), 'degree', ascending=False),
            rank_order(self.get_shared_categories_with_source(), 'category_matches_with_source', ascending=False),
            self.get_edges(),
            rank_order(self.get_centrality(), 'centrality', ascending=True),
            rank_order(self.get_dispersion(), "dispersion", ascending=True),
            rank_order(self.get_pageranks(), "page_rank", ascending=False),
            rank_order(self.get_adjusted_reciprocity(), "adjusted_reciprocity", ascending=False),
            rank_order(self.get_shortes_path(), "shortest_path_length_from_source", ascending=True),
        ]

        return reduce(lambda left, right: pd.merge(left, right, on="node", how="outer"), dfs)

    def create_ego(self, node=None):
        """Return the ego graph of ``node`` (default: the entry), named after the node."""
        if not node:
            node = self.entry

        ego = nx.ego_graph(self.graph, node)
        ego.name = node
        return ego

    def expand_network(self, group_size=10, timeout=10):
        """Query every link that is queued right now, ``group_size`` titles per request.

        Links discovered by these queries are appended to ``next_links`` and are left
        for the next call. A request that fails or exceeds ``timeout`` seconds is
        reported and skipped; its titles stay marked as visited.

        Args:
            group_size: number of unvisited titles sent per query.
            timeout: seconds allowed per query before the alarm aborts it.
        """
        num_links = len(self.next_links)
        link_group = []

        try:
            for i in range(num_links):
                link = self.next_links.pop(0)
                if link not in self.visited:
                    # Marking on enqueue also keeps duplicates out of the same batch.
                    self.visited.add(link)
                    link_group.append(link)

                # Evaluated for every link, so a partial batch is still sent when the
                # last queued link turns out to be already visited.
                is_last = i == num_links - 1
                if link_group and (len(link_group) >= group_size or is_last):
                    print("{:.2%}".format(i/num_links))
                    self._query_group(link_group, timeout)
                    link_group = []
        finally:
            self._set_alarm(0)

    def _query_group(self, link_group, timeout):
        """Run one batched query under the alarm timeout, skipping the batch on failure."""
        try:
            self._set_alarm(timeout)
            self.query_articles(link_group)
        except Exception as error:
            # Best effort: one bad batch must not abort the crawl. KeyboardInterrupt
            # is deliberately not caught so the user can still stop the cell.
            print(f"Skipping {link_group}: {error!r}")
        finally:
            # Always disarm, so a pending alarm cannot fire outside this method.
            self._set_alarm(0)

    def update_redirects(self, articles):
        """Remember, for each article that has redirects, which titles redirect to it."""
        for article in articles:
            if article.get("redirects"):
                self.redirect_targets.append(article["title"])
                target_index = len(self.redirect_targets) - 1
                for redirect in article["redirects"]:
                    self.redirect_sources[redirect] = target_index

    def _resolve_redirect(self, title):
        """Return the recorded redirect target of ``title``, or ``title`` if it has none."""
        target_index = self.redirect_sources.get(title)
        return title if target_index is None else self.redirect_targets[target_index]

    def redraw_redirects(self):
        """Re-point edges that touch redirect titles to the target articles, then drop the redirect nodes.

        Both endpoints are resolved before the replacement edge is added, so an edge
        between two redirect titles ends up between their two targets instead of
        disappearing together with the redirect nodes.
        """
        # Snapshot the edges: adding edges while iterating the live view is not allowed.
        for source, target in list(self.graph.edges):
            resolved = (self._resolve_redirect(source), self._resolve_redirect(target))
            if resolved != (source, target):
                self.graph.add_edge(*resolved)

        self.remove_redirect_nodes()

    def remove_redirect_nodes(self):
        """Remove every node whose title is a known redirect source."""
        self.graph.remove_nodes_from(
            [node for node in self.graph.nodes if node in self.redirect_sources])

    def update_next_links(self, articles):
        """Queue every outgoing link of ``articles`` for a later ``expand_network`` call."""
        for article in articles:
            self.next_links.extend(article.get('links') or [])

    def query_articles(self, titles, generate_graph=True):
        """Fetch ``titles`` via ``wiki_multi_query`` and fold the results into the graph state.

        ``generate_graph`` is accepted for compatibility but is not used.
        """
        articles = wiki_multi_query(titles)

        self.update_redirects(articles)

        self.update_next_links(articles)
        self.add_edges(articles)

## Generating Graph from Entry Point

In [56]:
# Seed the graph from the "Decision tree" article; the constructor scrapes the intro
# links and runs the first query for the entry itself.
gc = GraphCreator("Decision tree")

In [57]:
# gc.get_shared_categories_with_source()

In [58]:
# Number of link titles queued for the next expand_network() call.
len(gc.next_links)

88

In [59]:
# Query every currently queued link, two titles per request, with a 5-second alarm per request.
gc.expand_network(group_size=2, timeout=5)

1.14%
3.41%
5.68%
7.95%
10.23%
12.50%
14.77%
17.05%
19.32%
21.59%
23.86%
26.14%
28.41%
30.68%
32.95%
35.23%
37.50%
39.77%
42.05%
44.32%
46.59%
48.86%
51.14%
53.41%
55.68%
57.95%
60.23%
62.50%
64.77%
67.05%
69.32%
71.59%
73.86%
76.14%
78.41%
80.68%
82.95%
85.23%
87.50%
89.77%
92.05%
94.32%
96.59%
98.86%


In [60]:
# Re-point edges that touch redirect titles to the redirect's target article and
# remove the redirect nodes from the graph.
gc.redraw_redirects()

In [61]:
# Store compare_categories(source, target) as the 'weight' attribute of every edge;
# the weighted metrics below (centrality, PageRank) read this attribute.
gc.update_edge_weights()

# Feature Set

In [62]:
# compare_categories score between the entry article and every node in the graph.
gc.get_shared_categories_with_source()

Unnamed: 0,node,category_matches_with_source
0,Decision tree,1
1,Random forest,1
2,"Behavior tree (artificial intelligence, roboti...",1
3,Decision tree model,1
4,Information gain in decision trees,1
5,Decision tree learning,1
6,Decision table,1
7,Influence diagram,1
8,ID3 algorithm,1
9,Decision analysis,1


In [63]:
# Outer-merge every per-node feature table on "node". get_dispersion prints FULL or EGO
# to show whether it ran on the whole graph or on the entry's ego graph.
features_df = gc.get_features_df()

FULL


In [68]:
def average_rank(row):
    """Collapse one node's per-feature ranks into a single score.

    The five structural ranks are averaged; that mean is then multiplied by the
    category-match rank and by the shortest-path rank.
    """
    averaged_columns = (
        "degree_ranked",
        "centrality_ranked",
        "dispersion_ranked",
        "page_rank_ranked",
        "adjusted_reciprocity_ranked",
    )
    structural_mean = np.mean([getattr(row, column) for column in averaged_columns])
    category_rank = row.category_matches_with_source_ranked
    path_rank = row.shortest_path_length_from_source_ranked
    return structural_mean * category_rank * path_rank

# Score every row with average_rank and store the result in a new column.
features_df["rank_average"] = features_df.apply(average_rank, axis=1)

# List the nodes by combined score, smallest value first.
features_df[["node", "rank_average"]].sort_values("rank_average", ascending=True)

Unnamed: 0,node,rank_average
81,Decision tree model,220.4
76,Decision table,232.8
65,ID3 algorithm,245.6
28,Diminishing returns,272.8
67,Topological combinatorics,273.6
75,Information gain in decision trees,282.0
66,Decision cycle,284.8
56,Linearization,292.8
93,"Behavior tree (artificial intelligence, roboti...",306.0
34,Random forest,306.4


In [None]:
# features_df.sort_values("dispersion", ascending=True)
# Rows whose title contains "Decision", highest PageRank first.
features_df[features_df.node.str.contains("Decision")].sort_values("page_rank", ascending=False)

In [None]:
# Fill the gaps in these two columns (presumably left by the outer merge in
# get_features_df for nodes absent from a metric table):
# dispersion becomes 0.0 and the path length becomes the sentinel -1.
features_df["dispersion"] = features_df["dispersion"].fillna(0.0)
features_df["shortest_path_length_from_source"] = (
    features_df["shortest_path_length_from_source"].fillna(-1)
)

In [None]:
# Pairwise relationship plots across the feature columns.
sns.pairplot(features_df)

In [None]:
# Correlation heatmap of the numeric features. The string "node" column is excluded
# explicitly: DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, and
# select_dtypes works on older pandas versions as well.
sns.heatmap(features_df.select_dtypes(include="number").corr())

In [None]:
# Column dtypes and non-null counts of the feature table.
features_df.info()

In [None]:
# Number of nodes currently in the graph.
len(gc.graph.nodes)

In [None]:
# intro_link_titles collected by WikiIntroScrapper for the entry article.
gc.intro_nodes

In [None]:
# With max_nodes=100 the ego-graph branch is taken whenever the graph has more than
# 100 nodes (the cell prints EGO in that case, FULL otherwise).
gc.get_dispersion(max_nodes=100)

In [None]:
# Number of nodes currently in the graph.
len(gc.graph.nodes)


In [None]:
# First 25 rows of the node/degree table.
gc.get_degrees().head(25)

In [None]:
# First 25 rows of the weighted eigenvector-centrality table.
gc.get_centrality().head(25)

In [None]:
# First 25 rows of the shortest-path-length table, measured from the entry article.
gc.get_shortes_path().head(25)

In [None]:
# The 25 nodes with the highest weighted PageRank (get_pageranks sorts descending).
gc.get_pageranks().head(25)

In [None]:
# The 25 nodes with the highest reciprocity * degree (sorted descending).
gc.get_adjusted_reciprocity().head(25)

In [None]:
# First 25 rows of the immediate-dominator count table (dominators computed from the entry).
gc.get_dominator_counts().head(25)

In [None]:
# Ego graph around "Regression analysis", following edges in both directions.
# NOTE(review): this node must already be in gc.graph; confirm it is reachable from
# the entry article used above, otherwise ego_graph raises.
ego = nx.ego_graph(gc.graph, "Regression analysis", undirected=True)
ego.name = "Regression analysis"
# nx.info() was removed in NetworkX 3.0, so the summary is printed explicitly.
print(f"{ego.name}: {ego.number_of_nodes()} nodes, {ego.number_of_edges()} edges")

In [None]:
# Number of nodes in the ego graph.
len(ego.nodes)

In [None]:
# Draw the ego graph with the default networkx layout.
nx.draw(ego)
plt.show()

In [None]:
# (node, eigenvector centrality) pairs of the ego graph, highest centrality first.
ego_centrality = nx.eigenvector_centrality(ego)
sorted(ego_centrality.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
def sort_dict_values(dict, columns, sort_column, ascending=False):
    """Turn a mapping into a two-column DataFrame sorted by ``sort_column``.

    Note: this definition shadows the ``sort_dict_values`` imported from
    ``graph_helpers`` at the top of the notebook from this cell onward.

    Args:
        dict: mapping of key -> value (the name shadows the builtin; it is kept so
            existing keyword callers continue to work).
        columns: the two column names, for the keys and the values respectively.
        sort_column: name of the column to sort by.
        ascending: sort direction; descending by default.

    Returns:
        A DataFrame with a fresh 0..n-1 index.
    """
    frame = pd.DataFrame(list(dict.items()), columns=columns)
    # drop=True discards the old index directly; the previous reset_index().drop("index")
    # removed a caller column that happened to be named "index".
    return frame.sort_values(sort_column, ascending=ascending).reset_index(drop=True)

In [None]:
# Betweenness centrality of every node in the ego graph, as a node/betweenness table.
regression_ego = sort_dict_values(nx.betweenness_centrality(ego), ["node", "betweenness"], "betweenness")
regression_ego
# regression_ego[regression_ego.node == "Random forest"]