In [1]:
import wikipediaapi
import networkx as nx
wiki_wiki = wikipediaapi.Wikipedia('en')

# Dataset Creation

## Graph Creation

Generate a graph sampled from Wikipedia using BFS.

In [194]:
starting_node = "Koala" # Initial node to start BFS from.
hop_nb = 10 # Number of pages allowed to be visited.
graph = nx.Graph()

In [195]:
queue = [wiki_wiki.page(starting_node)]

forbidden_protocols = ["Category", "Template", "Wikipedia", "User", "Help", "Talk", "Portal"]

node2page = {queue[0].title: queue[0]}

for hop in range(hop_nb):
    if len(queue) > 0:
        page = queue.pop()
        for name, neighbor_page in page.links.items():
            if all([not name.startswith(x) for x in forbidden_protocols]):
                if name not in graph.nodes and neighbor_page not in queue: # If not yet visited & not in queue already
                    queue.append(neighbor_page)
                graph.add_edge(page.title, name)
                node2page[name] = neighbor_page

## Dataset of pairs and distances

In [233]:
import pandas as pd
import numpy as np
import spacy
import random
from numpy import dot
from numpy.linalg import norm
from math import log

In [236]:
def create_dataset(pair_nb = 100) -> pd.DataFrame:
    nlp = spacy.load("en_core_web_sm")
    samples = np.random.choice(list(node2page.keys()), pair_nb*2).tolist()
    summaries = [node2page[x].summary for x in samples]
    docs = [x for x in nlp.pipe(summaries)]
    pairs = np.array(range(len(samples))).reshape(2, -1) 
    rows = []

    for src, tgt in zip(*pairs):
        length = nx.shortest_path_length(graph, samples[src], samples[tgt])
        doc_a = docs[src]
        doc_b = docs[tgt]
        distance = 1 / (1e-1 + doc_a.similarity(doc_b))
        distance = -log(doc_a.similarity(doc_b) + 1e-3)
        rows.append({"length": length, "src": samples[src], "tgt": samples[tgt], "distance": distance})

    return pd.DataFrame(rows)

In [241]:
# train_set = create_dataset(10)
test_set = create_dataset(200)

  distance = 1 / (1e-1 + doc_a.similarity(doc_b))
  distance = -log(doc_a.similarity(doc_b) + 1e-3)
  distance = 1 / (1e-1 + doc_a.similarity(doc_b))
  distance = -log(doc_a.similarity(doc_b) + 1e-3)


In [242]:
test_set.sort_values('distance')

Unnamed: 0,length,src,tgt,distance
97,5,Noozles,Seattle,0.041987
77,2,Pathogen,Printmaking,0.061552
85,5,Diprotodontoidea,United States,0.068033
177,4,San Francisco,Occupational Safety and Health Act,0.068669
22,2,Compassion,Sigmund Freud,0.083609
...,...,...,...,...
165,3,Enterprise communities,Free-range parenting,6.907755
129,4,Mexico,Vocational Rehabilitation Act,6.907755
118,4,Nimbavombatus,Youth leadership,6.907755
83,4,Kangaroo Island,Budget Reconciliation Act,6.907755


# Model

In [152]:
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [168]:
X_train = np.array(train_set['embedding_diff'].to_list())
X_train = train_set[['distance']]
y_train = train_set['length']

X_test = np.array(test_set['embedding_diff'].to_list())
X_test = test_set[['distance']]
y_test = test_set['length']

In [169]:
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

-0.016786290624264844