In [1]:
import wikipediaapi
import networkx as nx
from tqdm import tqdm
wiki_wiki = wikipediaapi.Wikipedia('en')

# Dataset Creation

## Graph Creation

Generate a graph sampled from Wikipedia using BFS.

In [2]:
starting_node = "Koala" # Initial node to start BFS from.
hop_nb = 100 # Number of pages allowed to be visited.
graph = nx.Graph()

In [3]:
queue = [wiki_wiki.page(starting_node)]

forbidden_protocols = ["Category", "Template", "Wikipedia", "User", "Help", "Talk", "Portal", "File", "Module"]

node2page = {queue[0].title: queue[0]}

for hop in tqdm(range(hop_nb)):
    if len(queue) > 0:
        page = queue.pop()
        node2page[page.title] = page
        for name, neighbor_page in page.links.items():
            if all([not name.startswith(x) for x in forbidden_protocols]):
                if name not in graph.nodes and neighbor_page not in queue: # If not yet visited & not in queue already
                    queue.append(neighbor_page)
                graph.add_edge(page.title, name)

100%|██████████| 100/100 [00:21<00:00,  4.75it/s]


In [4]:
node2page.keys()

dict_keys(['Koala', 'Zygomaturus trilobus', 'Zygomaturinae', 'Skill', 'U.S. Department of Labor', 'Workforce Investment Act of 1998', 'Youth council', 'Youth work', 'Youth development', 'Young adult', 'Zygote', 'Zygote (disambiguation)', 'Zygote in My Coffee', 'Online magazine', 'Zine', 'Zürich', 'Öschbrig', 'Summit', 'Zenith', 'Zenith telescope', 'Zenith camera', 'Zenit (camera)', 'Zorki', 'Zorki 4', 'Western world', 'Émile Durkheim', 'Étienne de La Boétie', "Workers' self-management", 'Étienne Cabet', 'Z Communications', 'Website', 'World Wide Web Consortium', 'XTiger', 'XSL Transformations', 'Xalan', 'UIMA', 'Watson (computer)', 'Zairja', 'Rabat', 'Écoles Belges au Maroc', 'École internationale de Casablanca', 'Lycée Pierre Mendès France (Tunisia)', 'École Canadienne de Tunis', 'Yang Guang Qing School of Beijing', 'Yew Chung International School of Beijing', "Yong'anli station", 'Yuquan Lu station', 'Shijingshan District', 'Zhangjiakou', 'Ürümqi', 'Ürümqi–Dzungaria railway', 'Zhundo

## Dataset of pairs and distances

In [6]:
import pandas as pd
import numpy as np
import spacy
import random
from numpy import dot
from numpy.linalg import norm
from math import log

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [70]:
tfidf = TfidfVectorizer().fit_transform(["bonjour à tous.", "Au revoir à tous.", "J'ai vu un chien"])

In [81]:
(tfidf[0]*tfidf[2].T).toarray()[0][0]

0.0

In [82]:
def create_dataset(pair_nb = 100) -> pd.DataFrame:
    nlp = spacy.load("en_core_web_sm")
    samples = np.random.choice(list(node2page.keys()), pair_nb*2).tolist()
    summaries = [node2page[x].summary for x in samples]
    categories = [node2page[x].categories for x in tqdm(samples)]
    docs = [x for x in nlp.pipe(samples)]
    pairs = np.array(range(len(samples))).reshape(2, -1) 
    tfidf = TfidfVectorizer().fit_transform(summaries)
    rows = []

    for src, tgt in tqdm(zip(*pairs)):
        length = nx.shortest_path_length(graph, samples[src], samples[tgt])
        doc_a = docs[src]
        doc_b = docs[tgt]
        degree_a = graph.degree[samples[src]]
        degree_b = graph.degree[samples[tgt]]
        similarity = doc_a.similarity(doc_b)
        common_categories = len(set(categories[src]).intersection(categories[tgt]))
        all_categories = len(set(categories[src]).union(categories[tgt])) + 1
        rows.append({
            "length": length, 
            "src": samples[src], 
            "tgt": samples[tgt], 
            "distance": 1 / (1e-1 + similarity), 
            "similarity": similarity, 
            "common_cat": common_categories, 
            "percent_cat": common_categories / all_categories, 
            "degree_sum": degree_a + degree_b, 
            "degree_diff": abs(degree_a - degree_b),
            "tfidf": (tfidf[src]*tfidf[tgt].T).toarray()[0][0]
            })

    return pd.DataFrame(rows)

In [83]:
train_set = create_dataset(5000)
test_set = create_dataset(2000)

100%|██████████| 10000/10000 [00:00<00:00, 2468108.74it/s]
  similarity = doc_a.similarity(doc_b)
5000it [00:02, 1828.22it/s]
100%|██████████| 4000/4000 [00:00<00:00, 2036564.21it/s]
  similarity = doc_a.similarity(doc_b)
2000it [00:01, 1802.84it/s]


In [84]:
test_set.sort_values('length')

Unnamed: 0,length,src,tgt,distance,similarity,common_cat,percent_cat,degree_sum,degree_diff,tfidf
908,0,Zhongding Logistics Park railway station,Zhongding Logistics Park railway station,0.909091,1.000000,0,0.000000,2,0,0.000000
680,0,Đại Việt,Đại Việt,0.909091,1.000000,24,0.960000,1298,0,1.000000
1598,0,Đống Đa District,Đống Đa District,0.909091,1.000000,13,0.928571,6,0,1.000000
557,0,Youth development,Youth development,0.909091,1.000000,5,0.833333,2,0,1.000000
471,0,Đội Bình,Đội Bình,0.909091,1.000000,0,0.000000,2,0,0.000000
...,...,...,...,...,...,...,...,...,...,...
423,6,Youth development,"Xianhe, Hubei",2.288811,0.336908,0,0.000000,2,0,0.000000
1695,6,Zouping County,Zygomaturus trilobus,1.219473,0.720026,2,0.086957,2,0,0.012183
931,6,Zhangqiu railway station,Zygomaturinae,1.691901,0.491051,0,0.000000,86,84,0.000000
1747,6,Zygote in My Coffee,"Đồng Tiến, Ứng Hòa",2.026662,0.393422,0,0.000000,6,4,0.000000


In [86]:
test_set[['length', "common_cat", "degree_sum", "degree_diff", "similarity", "distance", "percent_cat", "tfidf"]].corr()

Unnamed: 0,length,common_cat,degree_sum,degree_diff,similarity,distance,percent_cat,tfidf
length,1.0,-0.364047,-0.371183,-0.245349,-0.209196,0.161999,-0.40366,-0.433249
common_cat,-0.364047,1.0,0.342992,0.102992,0.083655,-0.045474,0.732516,0.643871
degree_sum,-0.371183,0.342992,1.0,0.845315,-0.116407,0.099379,0.109575,0.248019
degree_diff,-0.245349,0.102992,0.845315,1.0,-0.151259,0.135838,-0.02322,0.103951
similarity,-0.209196,0.083655,-0.116407,-0.151259,1.0,-0.911565,0.165424,0.13484
distance,0.161999,-0.045474,0.099379,0.135838,-0.911565,1.0,-0.091783,-0.058501
percent_cat,-0.40366,0.732516,0.109575,-0.02322,0.165424,-0.091783,1.0,0.858004
tfidf,-0.433249,0.643871,0.248019,0.103951,0.13484,-0.058501,0.858004,1.0


# Model

In [35]:
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [87]:
# X_train = np.array(train_set['embedding_diff'].to_list())
X_train = train_set[["common_cat", "degree_sum", "degree_diff", "similarity", "distance", "percent_cat", "tfidf"]]
y_train = train_set['length']

# X_test = np.array(test_set['embedding_diff'].to_list())
X_test = test_set[["common_cat", "degree_sum", "degree_diff", "similarity", "distance", "percent_cat", "tfidf"]]
y_test = test_set['length']

In [92]:
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.3121730778105397

In [91]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7720872571233237