In [1]:
%run "libraries.ipynb"

from IPython.display import display, HTML

from bs4 import BeautifulSoup

## loading the original namespace (the list of page names)

In [2]:
# Load the page names, one per line. The "utf-8-sig" codec drops a leading
# BOM if the file has one. The file is closed as soon as it has been read,
# and the result is a real list because `pages` is iterated several times
# further down (a lazy `map` object would be exhausted after the first pass
# on Python 3). Blank lines are skipped so they do not become empty names.
with codecs.open("data/pagenames.txt", "r", "utf-8-sig") as f:
    pages = [line.strip() for line in f if line.strip()]

## finding Wikipedia links in the page content

In [3]:
# directed graph of the links between pages: the loop further down adds an
# edge (page -> linked page) for every link that stays inside the page list
hyperlinks_graph = nx.DiGraph()

def find_hyperlinks(page):
    """Return all <a> elements found in the stored content of `page`.

    page -- page name; the content is read from "data/pages/<page>.json".

    The JSON file is expected to hold the content under
    query -> pages -> <some id> -> revisions[0] -> "*". That string is
    parsed with BeautifulSoup's 'html.parser' and the result of
    find_all('a') is returned.
    """
    with codecs.open("data/pages/%s.json" % (page), "r", "utf-8-sig") as f:
        j = json.load(f)

    # "pages" is keyed by an id that is not known in advance, so take its
    # first entry; next(iter(...)) works on Python 2 and 3, unlike keys()[0]
    page_entry = next(iter(j["query"]["pages"].values()))
    raw_content = page_entry["revisions"][0]["*"]

    soup = BeautifulSoup(raw_content, 'html.parser')

    return soup.find_all('a')

# materialise the page list once so it can be both iterated and turned into
# a set, whatever kind of iterable `pages` is
page_list = list(pages)

# the set of known pages does not change inside the loop: build it once
known_pages = set(page_list)

for p in page_list:
    # keep only the title of each link; anchors without a "title" attribute
    # give None, which never matches a page name and is dropped below
    linked_titles = set(link.get("title") for link in find_hyperlinks(p))

    # keep only linked pages that are inside the initial domain
    for target in linked_titles & known_pages:
        hyperlinks_graph.add_edge(p, target)

# single-argument print(...) gives the same output on Python 2 and 3
print("nodes: %s" % len(hyperlinks_graph.nodes()))
print("edges: %s" % len(hyperlinks_graph.edges()))

nodes: 300
edges: 3781


## storing the resulting graph

In [4]:
# write the hyperlink graph to disk in GEXF format
nx.write_gexf(hyperlinks_graph, "data/hyperlinks.gexf")

## community detection (Louvain)

In [5]:
import community
# The partition is computed on an undirected copy of the graph, so the
# direction of the links is ignored here.
# `partitions` is consumed below as a mapping: node -> community id.
# NOTE(review): the partition may differ from run to run; newer releases of
# python-louvain appear to accept a `random_state` argument to pin it --
# TODO confirm against the installed version.
partitions = community.best_partition(hyperlinks_graph.to_undirected())

In [6]:
def print_groups(communities):
    """Display the communities as an HTML table, one row per group.

    communities -- dict mapping a group id to the list of page titles in
                   that group. Each title is rendered as a link to
                   http://en.wikipedia.org/wiki/<title>.

    Returns nothing; the table is shown through display(HTML(...)).
    """
    def escape(text):
        # minimal HTML escaping, so that a title containing &, <, > or "
        # cannot break the surrounding markup
        return (text.replace(u"&", u"&amp;")
                    .replace(u"<", u"&lt;")
                    .replace(u">", u"&gt;")
                    .replace(u"\"", u"&quot;"))

    link = u"<a href=\"http://en.wikipedia.org/wiki/{0}\" target=\"_blank\">{0}</a>"

    # collect the fragments and join them once instead of growing a string
    parts = [u"<table>"]

    # sorted() gives a stable row order (by group id); items() works on
    # Python 2 and 3, unlike iteritems()
    for c, ps in sorted(communities.items()):
        parts.append(u"<tr><td style=\"width: 100px; text-align: center; \"><h3>group %s</h3></td><td>" % (c))
        # u"{0}".format(x) coerces each title to text before escaping it
        parts.append(u", ".join(link.format(escape(u"{0}".format(x))) for x in ps))
        parts.append(u"</td></tr>")

    parts.append(u"</table>")

    display(HTML(u"".join(parts)))
        
# invert the node -> group mapping into group -> [nodes];
# items() is used instead of the Python-2-only iteritems() so the cell runs
# on both Python 2 and 3
communities = {}
for k, v in partitions.items():
    communities.setdefault(v, []).append(k)


print_groups(communities)

0,1
group 0,"Bézier curve, Integral geometry, Discrete geometry, Polygon triangulation, Ehrhart polynomial, Point in polygon, 3D computer graphics, Convex hull, Digital geometry, Spline (mathematics), Geometry of numbers, Non-uniform rational B-spline, Euclidean shortest path, Minkowski's theorem, Image analysis, Delaunay triangulation, Pick's theorem, Computational geometry, B-spline, Hidden line removal, Binary space partitioning, Computer graphics, Minkowski addition, Graham scan, Constructive solid geometry, Convex geometry, Ray tracing (graphics), Point location"
group 1,"Elliptic geometry, Geometrization conjecture, Absolute geometry, Shear mapping, Ordered geometry, Line (geometry), Congruence (geometry), Annulus (mathematics), Klein geometry, Symplectic geometry, Root system, Reflection (mathematics), Erlangen program, 2D computer graphics, Contact geometry, Parallel (geometry), Minkowski space, Similarity (geometry), Riemannian geometry, Non-Euclidean geometry, Information geometry, Homothetic transformation, Coordinate rotations and reflections, Systolic geometry, Invariant (mathematics), Pseudosphere, Ruppeiner geometry, Euclidean geometry, Isometry, Affine transformation, Rotation (mathematics), Point (geometry), Hyperbolic geometry, Euclidean distance, Affine geometry, Sangaku, Translation (geometry), Pythagorean theorem, Four-dimensional space, Transformation geometry, Spherical geometry, Hadwiger's theorem, Differential geometry, Strähle construction, Parallel postulate, Hilbert's axioms"
group 2,"Triangle inequality, Spherical trigonometry, Symmedian, Squaring the circle, Isosceles trapezoid, Equilateral triangle, List of circle topics, Polar sine, Circle, Pi, Incircle and excircles of a triangle, Poncelet–Steiner theorem, Holditch's theorem, Golden angle, Tangential quadrilateral, Rectangle, Heron's formula, Circumscribed circle, Brahmagupta's formula, List of triangle inequalities, Ptolemy's theorem, Van Hiele model, Distance geometry, Power center (geometry), Nine-point circle, Angle trisection, Inscribed angle, Altitude (triangle), Dividing a circle into areas, List of interactive geometry software, Trigonometry, Quadrilateral, Curve of constant width, Kite (geometry), Isoperimetric inequality, Orthodiagonal quadrilateral, Bicentric quadrilateral, Central angle, Equidiagonal quadrilateral, Trapezoid, Pons asinorum, Sphericon, Orthocentric system, Euler line, Right triangle, Isosceles triangle, Concurrent lines, Astronomy, Taxicab geometry, Homothetic center, Integer triangle, Heronian triangle, Straightedge, Mrs. Miniver's problem, Reuleaux triangle, Ball (mathematics), Bretschneider's formula, Pedal triangle, Compass-and-straightedge construction, List of triangle topics, Parallelogram law, Thales' theorem, List of trigonometry topics, Pedoe's inequality, Angle, Acute and obtuse triangles, Pythagorean triple, Triangle, Concyclic points, Cyclic quadrilateral, Rhombus"
group 3,"Prismatoid, Kepler–Poinsot polyhedron, 2D geometric model, Point groups in three dimensions, Tetrahedron, Regular polytope, Polytope compound, Star polygon, Wallpaper group, Square, Penrose tiling, Space group, Convex uniform honeycomb, Polygon, Relative direction, Prototile, Parallelepiped, Aperiodic tiling, Honeycomb (geometry), Frieze group, Crystal, Internal and external angle, Deltahedron, Zonohedron, Regular Polytopes (book), Pyramid (geometry), Quasicrystal, Fractal, Polyhedron, Polytope, Lattice (group), Symmetry, Prism (geometry), Voronoi diagram, Wallace–Bolyai–Gerwien theorem, Glide reflection, Uniform tessellation, Pattern, Wang tile, Platonic solid, Roman surface, Dissection problem, Dihedral angle, Coxeter group, Point groups in two dimensions, Angular defect, Tessellation, Uniform polyhedron, Hilbert's third problem, Symmetry group, Chirality (mathematics), Schläfli symbol, Translational symmetry, Mirror image, Handedness, Johnson solid, Heronian tetrahedron, Archimedean solid, Crystal system, Point group"
group 4,"Steiner chain, Girard Desargues, Kissing number problem, Napkin ring problem, Eccentricity (mathematics), Dandelin spheres, Semi-major axis, Paraboloid, Pappus's centroid theorem, Descriptive geometry, Parametric surface, Mathematical morphology, Cone, Focus (geometry), Leech lattice, Cavalieri's principle, Parabolic reflector, Soddy's hexlet, Kepler conjecture, Geometry, Quadric, Analytic geometry, Conic section, Infinitesimal transformation, Coordinate-free, Sphere, Spheroid, Matrix representation of conic sections, Hyperbola, Mathematics and fiber arts, Torus, Solid geometry, Parabolic microphone, Shape, Ellipsoid, Parabola, Sphere packing, Normal (geometry), Parametric equation, Geometric shape, Locus (mathematics), The Method of Mechanical Theorems, Cross section (geometry), Hyperboloid, Ellipse"
group 5,"Incidence (geometry), Complex geometry, Oval (projective plane), Affine space, Enumerative geometry, Duality (projective geometry), Riemann sphere, Algebraic geometry, Birational geometry, Line at infinity, Group action, Topology, Inversive geometry, Parabolic geometry (differential geometry), Complex projective plane, Projective geometry, Borromean rings, Pascal's theorem, Hyperplane at infinity, Cross-ratio, Projective line over a ring, Finite geometry, Homogeneous coordinates, Projective line, Plane at infinity, N-sphere, Desargues' theorem, Lie sphere geometry, Point at infinity, 3-sphere, Quantum geometry, Monge's theorem, Incidence geometry, Toric variety, Tropical geometry, Pappus's hexagon theorem, Synthetic geometry, 3D projection, Arc (projective geometry), Projective plane, Möbius transformation, Mathematics, Noncommutative geometry, Conformal geometry, Stereographic projection, Homography, Hermite spline, Projective space, Hyperplane"
group 6,Epipolar geometry
