In [None]:
import wikipediaapi
import networkx as nx
from tqdm import tqdm
# Shared Wikipedia API client used to fetch pages; 'en' presumably selects the English edition.
# NOTE(review): newer wikipedia-api releases appear to expect a user-agent argument as well —
# confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('en')

# Dataset Creation

## Graph Creation

Build a graph of Wikipedia pages by crawling article links with a breadth-first search (BFS) from a starting page.

In [None]:
starting_node = "Koala" # Initial node to start BFS from.
hop_nb = 100 # Number of pages allowed to be visited.
graph = nx.Graph() # Undirected link graph; edges are added while pages are crawled.

In [None]:
# Crawl Wikipedia breadth-first from `starting_node`, expanding at most `hop_nb` pages.
queue = [wiki_wiki.page(starting_node)]  # FIFO frontier of pages still to expand

# Link titles starting with one of these prefixes are skipped
# (presumably to leave out non-article namespaces such as "Category:...").
forbidden_protocols = ["Category", "Template", "Wikipedia", "User", "Help", "Talk", "Portal", "File", "Module"]

# Maps the title of every expanded page (and of the start page) to its page object.
node2page = {queue[0].title: queue[0]}

for hop in tqdm(range(hop_nb)):
    if not queue:
        break  # Frontier exhausted before the page budget was spent.

    # Take the OLDEST queued page: pop(0) gives FIFO order, i.e. an actual BFS.
    # A bare pop() would take the newest entry and turn the crawl into a DFS.
    page = queue.pop(0)
    node2page[page.title] = page

    for name, neighbor_page in page.links.items():
        if name.startswith(tuple(forbidden_protocols)):
            continue
        # Every page that gets queued also receives an edge (hence a graph node)
        # on the next line, so graph membership alone tells us whether the page
        # was already visited or is already waiting in the queue.
        if name not in graph.nodes:
            queue.append(neighbor_page)
        graph.add_edge(page.title, name)

In [None]:
# Show the titles of the pages recorded in node2page by the crawl.
node2page.keys()

## Dataset of pairs and distances

In [None]:
import pandas as pd
import numpy as np
import spacy
import random
from numpy import dot
from numpy.linalg import norm
from math import log

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Toy check of TfidfVectorizer: fit and transform three short French sentences.
tfidf = TfidfVectorizer().fit_transform(["bonjour à tous.", "Au revoir à tous.", "J'ai vu un chien"])

In [None]:
# Product of sentence 0's TF-IDF row with sentence 2's row (transposed): one scalar score for the pair.
(tfidf[0]*tfidf[2].T).toarray()[0][0]

In [None]:
def create_dataset(pair_nb: int = 100, seed = None) -> pd.DataFrame:
    """Build a table of random page pairs with their graph distance and features.

    Draws ``2 * pair_nb`` page titles (with replacement) from ``node2page``,
    pairs the first half with the second half, and computes one row per pair.

    Args:
        pair_nb: Number of (src, tgt) pairs to generate.
        seed: Optional seed for the title sampling. When given, the same pairs
            are drawn on every call; when ``None`` (default) NumPy's global
            random generator is used, as before.

    Returns:
        A DataFrame with one row per pair and the columns:
        ``length`` (shortest-path length between the two pages in ``graph``),
        ``src`` / ``tgt`` (page titles),
        ``similarity`` (spaCy similarity between the two *titles*),
        ``distance`` (``1 / (0.1 + similarity)``),
        ``common_cat`` (number of shared categories),
        ``percent_cat`` (``common_cat`` divided by the size of the category
        union plus one),
        ``degree_sum`` / ``degree_diff`` (sum and absolute difference of the
        two node degrees in ``graph``),
        ``tfidf`` (dot product of the TF-IDF rows of the two page summaries).

    Raises:
        Any exception raised by ``nx.shortest_path_length`` (for instance when
        the two pages are not connected in ``graph``) is propagated.
    """
    nlp = spacy.load("en_core_web_sm")

    # A dedicated RandomState makes the draw repeatable when a seed is supplied;
    # otherwise keep using the module-level generator so global seeding still applies.
    rng = np.random if seed is None else np.random.RandomState(seed)
    samples = rng.choice(list(node2page.keys()), pair_nb*2).tolist()

    summaries = [node2page[x].summary for x in samples]
    categories = [node2page[x].categories for x in tqdm(samples)]
    # The spaCy docs are built from the page titles, not from the summaries.
    docs = list(nlp.pipe(samples))
    # TF-IDF is fitted on the sampled summaries only (row i <-> samples[i]).
    tfidf = TfidfVectorizer().fit_transform(summaries)
    rows = []

    # Sample i of the first half is paired with sample i of the second half.
    index_pairs = zip(range(pair_nb), range(pair_nb, 2 * pair_nb))
    # zip() has no length, so tell tqdm the total to get a real progress bar.
    for src, tgt in tqdm(index_pairs, total=pair_nb):
        length = nx.shortest_path_length(graph, samples[src], samples[tgt])
        degree_a = graph.degree[samples[src]]
        degree_b = graph.degree[samples[tgt]]
        similarity = docs[src].similarity(docs[tgt])
        common_categories = len(set(categories[src]).intersection(categories[tgt]))
        # The +1 keeps the ratio below defined when neither page has a category.
        all_categories = len(set(categories[src]).union(categories[tgt])) + 1
        rows.append({
            "length": length,
            "src": samples[src],
            "tgt": samples[tgt],
            "distance": 1 / (1e-1 + similarity),
            "similarity": similarity,
            "common_cat": common_categories,
            "percent_cat": common_categories / all_categories,
            "degree_sum": degree_a + degree_b,
            "degree_diff": abs(degree_a - degree_b),
            "tfidf": (tfidf[src]*tfidf[tgt].T).toarray()[0][0]
            })

    return pd.DataFrame(rows)

In [None]:
# Build the training and evaluation tables (5000 and 2000 page pairs).
# NOTE(review): both calls sample pairs independently from the same node2page pool,
# so identical page pairs can end up in both the train and the test set.
train_set = create_dataset(5000)
test_set = create_dataset(2000)

In [None]:
# Inspect the test pairs ordered by shortest-path length.
test_set.sort_values('length')

In [None]:
# Pairwise correlations between the target (`length`) and each feature column of the test set.
test_set[['length', "common_cat", "degree_sum", "degree_diff", "similarity", "distance", "percent_cat", "tfidf"]].corr()

# Model

In [None]:
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Columns used as model inputs; the regression target is the shortest-path length.
feature_columns = ["common_cat", "degree_sum", "degree_diff", "similarity", "distance", "percent_cat", "tfidf"]

X_train, y_train = train_set[feature_columns], train_set['length']
X_test, y_test = test_set[feature_columns], test_set['length']

In [None]:
# Linear baseline: fit on the training pairs, then report the estimator's default
# `score` on the held-out test pairs.
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Non-linear baseline: a random forest trained on the same features.
# random_state is pinned so that re-running the notebook on the same data
# rebuilds the same forest and reports the same score.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Check whether PyTorch can see a usable CUDA device.
import torch 
torch.cuda.is_available()
