In [1]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../../utils/")

import matplotlib.pyplot as plt
%matplotlib inline
import pdb
import requests
import re

import pandas as pd

import networkx as nx

import signal

import warnings
warnings.filterwarnings("ignore")

from wiki_intro_scrapper import WikiIntroScrapper
from WikiMultiQuery import wiki_multi_query

%aimport wiki_intro_scrapper
%aimport WikiMultiQuery

In [37]:
class GraphCreator:
    """Build a directed link graph of articles around an entry article.

    Nodes are article titles. For every queried article an edge
    ``title -> link`` is added for each entry of its ``'links'`` and an edge
    ``linkhere -> title`` for each entry of its ``'linkshere'``.

    Articles are fetched through ``wiki_multi_query``; each returned article
    is expected to be a mapping with ``'title'``, ``'links'`` and
    ``'linkshere'`` keys.
    """

    def __init__(self, entry):
        """Query ``entry`` and seed the graph with its links.

        Also installs a ``SIGALRM`` handler that ``expand_network`` relies on
        to abort slow queries. ``SIGALRM`` exists only on Unix, and signal
        handlers can only be installed from the main thread.

        Args:
            entry: title of the article the graph is grown from.
        """
        self.graph = nx.DiGraph()

        # Titles that have already been queried (or attempted and skipped).
        self.visited = {entry}
        # Outgoing link titles discovered so far, waiting to be expanded.
        self.next_links = []

        self.query_articles([entry])

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError("article query timed out")

        signal.signal(signal.SIGALRM, handle_alarm)

    def add_edges(self, articles):
        """Add outgoing and incoming link edges of every article to the graph."""
        for article in articles:
            title = article['title']
            self.graph.add_edges_from([(title, link) for link in article['links']])
            self.graph.add_edges_from([(linkhere, title) for linkhere in article['linkshere']])

    def plot_graph(self):
        """Draw the whole graph with networkx/matplotlib defaults."""
        nx.draw(self.graph)
        plt.show()

    def get_degrees(self):
        """Return ``(node, degree)`` tuples sorted by degree, highest first."""
        return sorted(dict(self.graph.degree()).items(), key=lambda item: item[1], reverse=True)

    def get_centrality(self, sort_results=False):
        """Compute the eigenvector centrality of every node.

        Args:
            sort_results: when true, return a list of ``(node, centrality)``
                tuples sorted by centrality, highest first; otherwise return
                the ``{node: centrality}`` mapping produced by networkx.
        """
        centrality = nx.eigenvector_centrality(self.graph)
        if sort_results:
            return sorted(centrality.items(), key=lambda item: item[1], reverse=True)
        return centrality

    def expand_network(self, group_size=10, timeout=10):
        """Query every link currently waiting in ``next_links``.

        Only the links present when the call starts are processed; links
        discovered by these queries are left in ``next_links`` for the next
        call. Unvisited links are queried in groups of ``group_size``; a
        trailing partial group is always queried as well.

        Args:
            group_size: number of titles sent per query.
            timeout: whole seconds allowed per query before it is abandoned
                (passed to ``signal.alarm``).
        """
        # Snapshot and drain in place: avoids the O(n) cost of pop(0) per link
        # while keeping the same list object for anyone holding a reference.
        pending = list(self.next_links)
        self.next_links.clear()

        link_group = []
        for link in pending:
            if link in self.visited:
                continue
            # Mark as visited when queued so a title repeated in ``pending``
            # is not sent twice within the same group.
            self.visited.add(link)
            link_group.append(link)
            if len(link_group) == group_size:
                self._query_with_timeout(link_group, timeout)
                link_group = []

        # Flush the remainder regardless of whether the last link was visited.
        if link_group:
            self._query_with_timeout(link_group, timeout)

    def _query_with_timeout(self, link_group, timeout):
        """Query one group of titles, skipping it on error or timeout."""
        try:
            signal.alarm(timeout)
            self.query_articles(link_group)
        except Exception:
            # Best effort: a failed or timed-out group is skipped, not retried.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print("==SKIPPED==")
        finally:
            # Always cancel the alarm so it cannot fire outside this method.
            signal.alarm(0)

    def update_next_links(self, articles):
        """Queue the outgoing links of every article for a later expansion."""
        for article in articles:
            self.next_links.extend(article['links'])

    def query_articles(self, titles, generate_graph=True):
        """Fetch ``titles`` and fold the result into the graph and link queue.

        Args:
            titles: list of article titles passed to ``wiki_multi_query``.
            generate_graph: currently unused; kept so existing callers that
                pass it keep working.
        """
        articles = wiki_multi_query(titles)

        self.update_next_links(articles)
        self.add_edges(articles)


## TESTS

In [44]:
# Seed the link graph from the "Sonata form" article (the constructor queries it).
# NOTE(review): `gc` is also the name of a stdlib module — consider a more descriptive name.
gc = GraphCreator("Sonata form")
# Scraper for the same article; presumably parse_intro_links() fills
# `intro_link_titles`, which the following cells read — confirm in wiki_intro_scrapper.
wis = WikiIntroScrapper("https://en.wikipedia.org/wiki/Sonata_form")
wis.parse_intro_links();

In [45]:
# Show the titles the scraper collected; later cells look each one up in the graph.
wis.intro_link_titles

['Musical form',
 'Classical music era',
 'Movement (music)',
 'Tonality',
 'Exposition (music)',
 'Musical development',
 'Recapitulation (music)',
 'Introduction (music)',
 'Coda (music)',
 'Sonata',
 'Symphony',
 'Concerto',
 'String quartet',
 'Musical analysis']

In [46]:
# Centrality of the entry article first, then of each article linked from its intro.
centrality = gc.get_centrality()
for title in [wis.title, *wis.intro_link_titles]:
    print(title, centrality[title])

Sonata form 0.6044296161693198
Musical form 0.04517436170326216
Classical music era 0.04517436170326216
Movement (music) 0.04517436170326216
Tonality 0.04517436170326216
Exposition (music) 0.04517436170326216
Musical development 0.04517436170326216
Recapitulation (music) 0.04517436170326216
Introduction (music) 0.04517436170326216
Coda (music) 0.04517436170326216
Sonata 0.04517436170326216
Symphony 0.04517436170326216
Concerto 0.04517436170326216
String quartet 0.04517436170326216
Musical analysis 0.04517436170326216


In [47]:
# Expand the graph by one hop: queries the links queued in gc.next_links,
# 5 titles per request, each request limited to 20 seconds.
# Mutates `gc` in place, so re-running this cell grows the graph further.
gc.expand_network(group_size=5, timeout=20)

==SKIPPED==
==SKIPPED==
==SKIPPED==
==SKIPPED==
==SKIPPED==
==SKIPPED==
==SKIPPED==


In [48]:
centrality = gc.get_centrality()

# One (title, centrality) row for the entry article, then one per intro link.
rankings = [(wis.title, centrality[wis.title])]
for title in wis.intro_link_titles:
    rankings.append((title, centrality[title]))

ranking_df = pd.DataFrame(rankings, columns=["title", "centrality"])
# Displayed sorted; ranking_df itself keeps the insertion order.
ranking_df.sort_values("centrality", ascending=False).reset_index(drop=True)

Unnamed: 0,title,centrality
0,Sonata form,0.141145
1,Sonata,0.10846
2,Symphony,0.080875
3,Concerto,0.071836
4,Movement (music),0.065589
5,String quartet,0.065478
6,Musical form,0.047803
7,Coda (music),0.039774
8,Tonality,0.037013
9,Recapitulation (music),0.032453


In [57]:
# NOTE(review): u and v are the same node here, so this is the dispersion of
# "Sonata form" with itself — confirm that this is the intended baseline.
nx.dispersion(gc.graph, u="Sonata form", v="Sonata form")

50.97427652733119

In [65]:
# Dispersion between the entry article and every title in ranking_df.
# The same variable is used as the graph node and as the table label so the
# "node1" column always names the node that was actually queried.
source_node = "Sonata form"
pd.DataFrame(
    [(source_node, node, nx.dispersion(gc.graph, source_node, node)) for node in ranking_df.title],
    columns=["node1", "node2", "dispersion"],
)

Unnamed: 0,node1,node2,dispersion
0,sonata form,Sonata form,50.974277
1,sonata form,Musical form,2.166667
2,sonata form,Classical music era,0.0
3,sonata form,Movement (music),0.534884
4,sonata form,Tonality,3.0
5,sonata form,Exposition (music),2.517857
6,sonata form,Musical development,0.0
7,sonata form,Recapitulation (music),1.468085
8,sonata form,Introduction (music),2.081633
9,sonata form,Coda (music),2.358491


In [68]:
# Degree Pearson correlation restricted to the titles listed in ranking_df.
ranked_titles = ranking_df["title"].tolist()
nx.degree_pearson_correlation_coefficient(gc.graph, nodes=ranked_titles)

0.11732223870820191