In [29]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper

%aimport wiki_intro_scrapper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
class GraphCreator:
    """Build a directed link graph of Wikipedia articles around a seed article.

    Nodes are article titles. An edge ``A -> B`` means article ``A`` links to
    article ``B`` according to the MediaWiki ``links`` / ``linkshere``
    properties returned by the English Wikipedia API.

    Attributes:
        graph: ``networkx.DiGraph`` holding every edge collected so far.
        next_links: list of article titles queued for the next call to
            ``expand_network``.
        seen: set of titles that have already been requested (successfully
            or not) and must not be requested again.
    """

    # Per-request network timeout (seconds) so a stalled connection cannot
    # hang a fetch that is not protected by the SIGALRM timer (e.g. __init__).
    REQUEST_TIMEOUT = 30

    # SIGALRM / signal.alarm only exist on Unix-like platforms.
    _HAS_ALARM = hasattr(signal, "SIGALRM") and hasattr(signal, "alarm")

    def __init__(self, entry):
        """Fetch ``entry`` and seed the graph and the expansion queue with it.

        Args:
            entry: title of the seed article.

        Side effects:
            Performs network requests and, where available, installs a
            process-wide SIGALRM handler that raises ``RuntimeError``.
        """
        self.graph = nx.DiGraph()

        self.next_links = self.get_article_info(entry)['links']
        self.seen = {entry}

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError("timed out while fetching article info")

        if self._HAS_ALARM:
            signal.signal(signal.SIGALRM, handle_alarm)

    def has_key(self, key, resp, pageid):
        """Return True if page ``pageid`` in ``resp`` has a truthy ``key`` entry."""
        return bool(resp['query']["pages"][pageid].get(key))

    def add_edges(self, article_info):
        """Add the article's outgoing and incoming link edges to the graph.

        Args:
            article_info: dict with ``title``, ``links`` and ``linkshere``
                entries, as returned by ``get_article_info``.
        """
        title = article_info['title']
        self.graph.add_edges_from(
            (title, link) for link in article_info['links'])
        self.graph.add_edges_from(
            (linkhere, title) for linkhere in article_info['linkshere'])

    def plot_graph(self):
        """Draw the current graph with matplotlib."""
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        """Return ``(node, degree)`` pairs sorted by degree, highest first."""
        return sorted(self.graph.degree(), key=lambda pair: pair[1], reverse=True)

    def get_centrality(self, sort_results=False):
        """Compute eigenvector centrality for every node in the graph.

        Args:
            sort_results: when true, return a list of ``(node, centrality)``
                pairs sorted from highest to lowest; otherwise return the
                ``{node: centrality}`` mapping produced by networkx.
        """
        centrality = nx.eigenvector_centrality(self.graph)
        if sort_results:
            return sorted(centrality.items(), key=lambda pair: pair[1], reverse=True)
        return centrality

    def expand_network(self, timeout=3):
        """Fetch every article queued in ``next_links`` and grow the graph.

        Only the links queued when the call starts are processed; links
        discovered along the way are appended to ``next_links`` for the next
        call. A link is marked as seen before it is fetched, so an article
        that fails or times out is skipped and not retried.

        Args:
            timeout: whole seconds allowed per article (covering all of its
                continuation requests) before the SIGALRM handler aborts the
                fetch. ``0`` disables the alarm. Ignored on platforms
                without ``signal.alarm``.
        """
        num_links = len(self.next_links)
        for i in range(num_links):
            link = self.next_links.pop(0)
            if link in self.seen:
                continue

            print(i, link)
            self.seen.add(link)
            try:
                try:
                    if self._HAS_ALARM:
                        signal.alarm(timeout)
                    # Queue the article's outgoing links, not the keys of
                    # the info dict.
                    self.next_links += self.get_article_info(link)['links']
                finally:
                    # Always cancel the timer; a stale alarm would otherwise
                    # fire later inside unrelated code.
                    if self._HAS_ALARM:
                        signal.alarm(0)
            except Exception as exc:
                # Best effort: skip articles that fail or time out, but let
                # KeyboardInterrupt / SystemExit stop the expansion.
                print("  skipped {!r}: {!r}".format(link, exc))

    def get_article_info(self, title, generate_graph=True):
        """Query the Wikipedia API for one article, following continuations.

        Args:
            title: article title to look up (redirects are resolved by the
                API, but the returned ``title`` is the one passed in).
            generate_graph: when true, add the article's link edges to
                ``self.graph`` before returning.

        Returns:
            dict with keys ``pageid`` (int), ``title``, ``extract``,
            ``redirects``, ``links``, ``linkshere`` and ``categories``; all
            but the first two are lists.

        Raises:
            KeyError / IndexError: if the response has no ``query.pages``.
            requests exceptions: on network failure or timeout.
        """
        params = {
            "action": "query",
            "format": "json",

            "titles": title,

            "prop": "extracts|redirects|links|linkshere|categories",

            # extracts
            "exintro": True,
            "explaintext": True,
            "exsectionformat": "plain",

            # redirects
            "rdnamespace": 0,
            "rdlimit": "max",

            # links
            "pllimit": "max",
            "plnamespace": 0,

            # linkshere
            "lhlimit": "max",
            "lhnamespace": 0,
            "lhshow": "!redirect",

            # categories
            "cllimit": "max",

            # automatic redirect
            "redirects": 1
        }

        article_id = None
        extract = []
        redirects = []
        links = []
        linkshere = []
        categories = []

        # Loop instead of recursing: an article with many backlinks needs
        # many continuation requests.
        while True:
            resp = requests.get(
                url="https://en.wikipedia.org/w/api.php",
                params=params,
                timeout=self.REQUEST_TIMEOUT).json()

            pages = resp["query"]["pages"]
            pageid = list(pages.keys())[0]
            page = pages[pageid]

            if article_id is None:
                article_id = pageid

            if page.get("extract"):
                extract.append(page["extract"])

            redirects.extend(rd["title"] for rd in page.get("redirects") or [])
            links.extend(link["title"] for link in page.get("links") or [])
            linkshere.extend(lh["title"] for lh in page.get("linkshere") or [])

            for cat in page.get("categories") or []:
                # Drop categories whose title contains "articles", "uses"
                # or "commons" (case-insensitive).
                if not re.search(r"(articles)|(uses)|(commons)", cat["title"], re.I):
                    categories.append(cat["title"])

            continuation = resp.get("continue")
            if not continuation:
                break
            # Merge the continuation tokens into the next request.
            params.update(continuation)

        article_info = {
            "pageid": int(article_id),
            "title": title,
            "extract": extract,
            "redirects": redirects,
            "links": links,
            "linkshere": linkshere,
            "categories": categories,
        }

        if generate_graph:
            self.add_edges(article_info)

        return article_info

## TESTS

In [23]:

# Seed the link graph with the "Random forest" article (this performs
# network requests inside GraphCreator.__init__).
# NOTE(review): `gc` is also the name of a standard-library module; later
# cells rely on this variable name, so it is kept as-is.
gc = GraphCreator("Random forest")
# Presumably collects the links found in the article's intro section —
# WikiIntroScrapper is defined in another module; confirm there.
wis = WikiIntroScrapper("https://en.wikipedia.org/wiki/Random_forest")
wis.parse_intro_links();

In [24]:
# Display the titles stored on the scraper (presumably filled in by
# parse_intro_links() in the previous cell — confirm in wiki_intro_scrapper).
wis.intro_link_titles

['Ensemble learning',
 'Statistical classification',
 'Regression analysis',
 'Decision tree learning',
 'Mode (statistics)',
 'Overfitting',
 'Test set',
 'Tin Kam Ho',
 'Random subspace method',
 'Leo Breiman',
 'Trademark',
 'Minitab',
 'Bootstrap aggregating',
 'Donald Geman']

In [25]:
# Eigenvector centrality of the graph as built from the seed article only.
centrality = gc.get_centrality()

# Seed article first, then every article listed in the scraper's intro links.
print("{} {}".format(wis.title, centrality[wis.title]))
for title in wis.intro_link_titles:
    print("{} {}".format(title, centrality[title]))

Random forest 0.6037188521312119
Ensemble learning 0.06835910518451588
Statistical classification 0.06835910518451588
Regression analysis 0.06835910518451588
Decision tree learning 0.06835910518451588
Mode (statistics) 0.06835910518451588
Overfitting 0.06835910518451588
Test set 0.06835910518451588
Tin Kam Ho 0.06835910518451588
Random subspace method 0.06835910518451588
Leo Breiman 0.06835910518451588
Trademark 0.06835910518451588
Minitab 0.06835910518451588
Bootstrap aggregating 0.06835910518451588
Donald Geman 0.06835910518451588


In [26]:
# Fetch every article currently queued in gc.next_links and add its edges to
# the graph; prints "<index> <title>" for each article it requests.
gc.expand_network()

0 Annals of Mathematics and Artificial Intelligence
1 Annals of Statistics
2 Anomaly detection
3 ArXiv
4 Artificial neural network
5 Artificial neural networks
6 Association rule learning
7 Autoencoder
8 Automated machine learning
9 BIRCH
10 Bayesian network
11 Bias–variance dilemma
12 Bias–variance tradeoff
13 Boosting (machine learning)
14 Bootstrap aggregating
15 CURE data clustering algorithm
16 Canonical correlation analysis
17 CiteSeerX
18 Classification and regression tree
19 Cluster analysis
20 Computational learning theory
21 Conditional random field
22 Conference on Neural Information Processing Systems
23 Convolutional neural network
24 Correlation
25 Cross-validation (statistics)
26 DBSCAN
27 Data mining
28 Decision tree
29 Decision tree learning
30 DeepDream
31 Deep learning
32 Digital object identifier
33 Dimensionality reduction
34 Donald Geman
35 Empirical risk minimization
36 Ensemble learning
37 Expectation–maximization algorithm
38 Factor analysis
39 Feature (machine

In [40]:
# Recompute eigenvector centrality on the graph as it stands now.
centrality = gc.get_centrality()

# Seed article first, followed by each article from the scraper's intro links.
rankings = [(wis.title, centrality[wis.title])]
for title in wis.intro_link_titles:
    rankings += [(title, centrality[title])]

# Tabulate and show the articles from most to least central.
ranking_df = pd.DataFrame(rankings, columns=["title", "centrality"])
ranking_df.sort_values("centrality", ascending=False).reset_index(drop=True)

Unnamed: 0,title,centrality
0,Regression analysis,0.158336
1,Statistical classification,0.149279
2,Decision tree learning,0.083674
3,Bootstrap aggregating,0.083298
4,Ensemble learning,0.082659
5,Random forest,0.081253
6,Mode (statistics),0.081252
7,Overfitting,0.027596
8,Leo Breiman,0.010554
9,Test set,0.007041
