In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper

%aimport wiki_intro_scrapper

In [47]:
class GraphCreator:

    def __init__(self, entry):
        self.graph = nx.DiGraph()

        self.get_article_info(entry)
        self.seen = {*entry}

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError

        signal.signal(signal.SIGALRM, handle_alarm)

    def has_key(self, key, resp, pageid):
        return bool(resp['query']["pages"][pageid].get(key))

    def add_edges(self):
        for article_info in self.results:
            self.graph.add_edges_from([(article_info['title'], link)
                                       for link in article_info['links']])
            self.graph.add_edges_from(
                [(linkhere, article_info['title']) for linkhere in article_info['linkshere']])

    def plot_graph(self):
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        return sorted([(key, val) for key, val in dict(self.graph.degree()).items()], key=lambda x: x[1], reverse=True)

    def get_centrality(self, sort_results=False):
        if sort_results:
            return sorted([(key, val) for key, val in dict(nx.eigenvector_centrality(self.graph)).items()], key=lambda x: x[1], reverse=True)
        else:
            return nx.eigenvector_centrality(self.graph)

    def expand_network(self):
        # update the next_links, and reset results
        self.update_next_links()
        self.results = []
        
        num_links = len(self.next_links)

        for i in range(num_links):
            link = self.next_links.pop(0)
            if not link in self.seen:
                try:
                    signal.alarm(5)
                    print(i, link)
                    self.seen.add(link)
                    self.next_links += self.get_article_info(link)
                    signal.alarm(0)
                except:
                    continue
        signal.alarm(0)

    def insert_results(self, pageid, key, value):
        for d in self.results:
            if d['pageid'] == int(pageid):
                if key in d and type(d.get(key)) == list:
                    d[key] += value
                else:
                    d[key] = value
                break
    
    def update_next_links(self):
        self.next_links = []
        for article in self.results:
            for link in article['links']:
                self.next_links.append(link)
            
        
    def get_article_info(self, title, generate_graph=True):
        
        self.results = []
        
        if type(title) == list:
            title = "|".join(title)
            
        params = {
            "action": "query",
            "format": "json",

            "titles": title,

            # "prop": "extracts|redirects|links|linkshere|categories",
            "prop": "redirects|links|linkshere|categories",

            # extracts
            "exintro": True,
            "explaintext": True,
            "exsectionformat": "plain",

            # redirects
            "rdnamespace": 0,
            "rdlimit": "max",

            # links
            "pllimit": "max",
            "plnamespace": 0,

            # linkshere
            "lhlimit": "max",
            "lhnamespace": 0,
            "lhshow": "!redirect",

            # categories
            "cllimit": "max",

            # automatic redirect
            "redirects": 1
        }

        def query_info(title, params):
            
            resp = requests.get(
                url="https://en.wikipedia.org/w/api.php",
                params=params).json()

            pageids = list(resp["query"]['pages'].keys())
            
            self.results += [{"pageid":int(pid)} for pid in pageids]
            
            for pageid in pageids:
                print(resp['query']["pages"][pageid].keys())
                if self.has_key("title", resp, pageid):                    
                    current_title = resp['query']["pages"][pageid]['title'] 
                    self.insert_results(pageid, "title", current_title)
                else:
                    self.insert_results(pageid, "title", None)
                    
#                 if self.has_key("extract", resp, pageid):
#                     extract.append(resp['query']["pages"][pageid]['extract'])
#                 else:
#                     se
                    
                if self.has_key("redirects", resp, pageid):                    
                    current_rds = [rd["title"] for rd in resp['query']["pages"][pageid]["redirects"]]
                    self.insert_results(pageid, "redirects", current_rds)
                else:
                    self.insert_results(pageid, "redirects", [])
                    
                if self.has_key("links", resp, pageid):
                    current_links = [link['title'] for link in resp['query']["pages"][pageid]["links"]]
                    self.insert_results(pageid, 'links', current_links)
                else:
                    self.insert_results(pageid, 'links', [])
                    
                    
                if self.has_key("linkshere", resp, pageid):
                    current_lh = [lh['title'] for lh in resp['query']["pages"][pageid]["linkshere"]]
                    self.insert_results(pageid, 'linkshere', current_lh)
                else:
                    self.insert_results(pageid, 'linkshere', [])
                
                if self.has_key("categories", resp, pageid):
                    current_cats = []
                    for cat in resp['query']["pages"][pageid]["categories"]:
                        if not bool(re.findall(r"(articles)|(uses)|(commons)", cat["title"], re.I)):
                            current_cats.append(cat["title"])
                    self.insert_results(pageid, 'categories', current_cats)
                else:
                    self.insert_results(pageid, 'categories', [])
                
            
            if resp.get('continue'):
                # remove any previous continue strings
                keys = list(params.keys())
                for key in keys:
                    if "continue" in key:
                        del params[key]
                        
                params.update(resp.get("continue"))
#                 pdb.set_trace()
                query_info(title, params)

        query_info(title, params)
        
        if generate_graph:
            self.add_edges()


## TESTS

In [48]:
test = GraphCreator('Decision tree')

dict_keys(['pageid', 'ns', 'title', 'redirects', 'links', 'linkshere', 'categories'])


In [49]:
test.get_article_info(['Algorithm'])

dict_keys(['pageid', 'ns', 'title', 'redirects', 'links', 'linkshere', 'categories'])
dict_keys(['pageid', 'ns', 'title', 'linkshere'])
dict_keys(['pageid', 'ns', 'title', 'linkshere'])
dict_keys(['pageid', 'ns', 'title', 'linkshere'])
dict_keys(['pageid', 'ns', 'title', 'linkshere'])
dict_keys(['pageid', 'ns', 'title', 'linkshere'])


KeyError: 'links'

In [10]:
test.expand_network()

> <ipython-input-5-ee17619f1ca5>(41)expand_network()
-> self.update_next_links()
(Pdb) len(self.results)
2
(Pdb) len(self.next_links)
0
(Pdb) c


KeyError: 'links'

In [37]:
test.update_next_links()
test.next_links

KeyError: 'links'

In [40]:
test.get_centrality(sort_results=True)


[('Flowchart', 0.29081802618211455),
 ('Topic map', 0.24419895800211164),
 ('Software design', 0.19949239021434867),
 ('Mind map', 0.17198962562328898),
 ('Wicked problem', 0.1647560285613333),
 ('Information design', 0.16342540038341555),
 ('Design rationale', 0.158756021575052),
 ('Design', 0.14069480686412997),
 ('Semantic Web', 0.13642284436223825),
 ('International Standard Book Number', 0.13049220262449715),
 ('Digital object identifier', 0.11702443349125083),
 ('Time', 0.11486189457148767),
 ('Causality', 0.11003201590138505),
 ('Graph drawing', 0.09924857956655686),
 ('Hypertext', 0.09637542793679763),
 ('Semantic network', 0.09635006503818871),
 ('Timeline', 0.09480636758258702),
 ('Algorithm', 0.09380166760861515),
 ('Data visualization', 0.09005095662000749),
 ('Entity–relationship model', 0.08822494566395965),
 ('Knowledge management', 0.08624669111752144),
 ('Infographic', 0.0851998201187487),
 ('Operations research', 0.08473746628017241),
 ('Visual analytics', 0.081466238

In [None]:
test.get_article_info(['Decision tree', 'Cluster analysis', 'Bayes estimator'],generate_graph=False)

In [None]:

gc = GraphCreator("Random forest")
wis = WikiIntroScrapper("https://en.wikipedia.org/wiki/Random_forest")
wis.parse_intro_links();

In [None]:
wis.intro_link_titles

In [None]:
centrality = gc.get_centrality()
print (wis.title, centrality[wis.title])
for title in wis.intro_link_titles:
    print(title, centrality[title])

In [None]:
gc.expand_network()

In [None]:
centrality = gc.get_centrality()

rankings = []

rankings.append((wis.title, centrality[wis.title]))
for title in wis.intro_link_titles:
    rankings.append((title, centrality[title]))
    
ranking_df = pd.DataFrame(rankings, columns=["title", "centrality"])
ranking_df.sort_values("centrality", ascending=False).reset_index().drop('index', axis=1)