<a href="https://colab.research.google.com/github/RoiGerber/EcoSystem-simulator/blob/main/SemanticWikiGame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install sentence-transformers



In [3]:
import sys
import torch
from sentence_transformers import SentenceTransformer, util
import requests
from bs4 import BeautifulSoup
import random
from typing import List, Dict
import numpy as np

# Print debugging information about the interpreter and the torch build in use.
for label, value in (
    ("Python executable:", sys.executable),
    ("Python path:", sys.path),
    ("Torch version:", torch.__version__),
):
    print(label, value)

# Load the pre-trained sentence-embedding model once, at module level.
model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange


Python executable: /usr/bin/python3
Python path: ['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/root/.ipython']
Torch version: 2.3.0+cu121


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
# Module-level memo tables shared by the code below.
# url -> (text, links) tuple, filled in by WikiPage.__init__
page_cache: dict = {}
# text -> embedding, filled in by get_embedding
embedding_cache: dict = {}
# url -> distance to the goal, filled in by find_goal_wikipedia
distances: dict = {}

class WikiPage:
    """A Wikipedia article reduced to a short text snippet and a sample of links.

    On construction the page's ``text`` and ``links`` are taken from the
    module-level ``page_cache`` when the URL has been seen before; otherwise
    they are fetched and stored there.

    Attributes:
        title: Article title as supplied by the caller.
        url: Full URL of the article.
        text: First 30 words of the article's leading paragraphs.
        links: Random sample of article URLs linked from the page.
    """

    BASE_URL = 'https://en.wikipedia.org'
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, title: str, url: str):
        self.title = title
        self.url = url
        # Parsed HTML, downloaded lazily and shared by get_text/get_links so
        # each page is requested at most once per instance.
        self._soup = None
        if url in page_cache:
            self.text, self.links = page_cache[url]
        else:
            self.text = self.get_text()
            self.links = self.get_links(100)  # Initially fetch 100 links
            page_cache[url] = (self.text, self.links)

    def _get_soup(self):
        """Return the parsed page, downloading it on first use only."""
        soup = getattr(self, '_soup', None)
        if soup is None:
            response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.content, 'html.parser')
            self._soup = soup
        return soup

    @staticmethod
    def _article_url(href: str):
        """Map a raw ``href`` to a full article URL, or ``None`` if it is filtered out.

        Only ``/wiki/`` paths without a colon and without the substring
        ``identifier`` are accepted. Any ``#fragment`` is dropped so that
        section links resolve to the same URL as the article itself.
        """
        if not href.startswith('/wiki/') or ':' in href or 'identifier' in href:
            return None
        path = href.split('#', 1)[0]
        if path == '/wiki/':
            return None
        return WikiPage.BASE_URL + path

    def get_text(self) -> str:
        """Return the first 30 words taken from the page's first 10 ``<p>`` elements."""
        paragraphs = self._get_soup().find_all('p')
        text = ' '.join(para.get_text() for para in paragraphs[:10])
        return ' '.join(text.split()[:30])  # Get first 30 words

    def get_links(self, max_links: int) -> List[str]:
        """Return up to ``max_links`` distinct article URLs in random order.

        Args:
            max_links: Upper bound on the number of URLs returned.

        Returns:
            A shuffled list of unique article URLs found on the page,
            truncated to ``max_links`` entries.
        """
        seen = set()
        links = []
        for anchor in self._get_soup().find_all('a', href=True):
            url = self._article_url(anchor['href'])
            # Deduplicate so repeated links do not crowd out other articles
            # in the truncated sample.
            if url is not None and url not in seen:
                seen.add(url)
                links.append(url)
        random.shuffle(links)
        return links[:max_links]

def get_embedding(text: str):
    """Return the embedding of ``text``, encoding it only on first request.

    Results are memoized in the module-level ``embedding_cache`` keyed by the
    exact text; a miss is encoded with ``model.encode(..., convert_to_tensor=True)``.
    """
    try:
        return embedding_cache[text]
    except KeyError:
        pass
    vector = model.encode(text, convert_to_tensor=True)
    embedding_cache[text] = vector
    return vector

def distance_to_max_links(distance: float) -> int:
    """Map a semantic distance to the number of links to sample from a page.

    The smaller the distance, the more links are sampled:

        distance >= 0.75          -> 20
        0.65 <= distance < 0.75   -> 25
        0.55 <= distance < 0.65   -> 30
        0.40 <= distance < 0.55   -> 50
        otherwise                 -> 100

    Args:
        distance: Distance value to classify.

    Returns:
        The link budget for that distance.
    """
    # Thresholds are tested from highest to lowest, so each branch only needs
    # its lower bound.
    if distance >= 0.75:
        return 20
    if distance >= 0.65:
        return 25
    if distance >= 0.55:
        return 30
    if distance >= 0.40:
        return 50
    return 100


def find_goal_wikipedia(start: WikiPage, goal_title: str) -> List[str]:
    """Greedily follow links from ``start`` towards the article ``goal_title``.

    At each step the function checks whether the current page links directly
    to the goal. If not, it embeds the text of every unvisited linked page,
    moves to the one with the smallest cosine distance to the goal's text,
    and re-samples that page's links with a budget chosen by
    ``distance_to_max_links``.

    Side effects: prints progress messages and records each visited URL's
    distance in the module-level ``distances`` dict (the start is recorded
    as 1 and the goal as 0).

    Args:
        start: Page to begin the walk from.
        goal_title: Title of the target article, as it appears in its URL.

    Returns:
        The list of URLs from the start to the goal (inclusive), or an empty
        list if the walk reaches a page with no usable unvisited link.
    """
    distances[start.url] = 1

    print("Starting from "+str(start.title))
    goal_url = f"https://en.wikipedia.org/wiki/{goal_title}"
    distances[goal_url] = 0

    # Nothing to search for when the walk starts on the goal itself.
    if start.url == goal_url:
        return [start.url]

    goal_page = WikiPage(goal_title, goal_url)
    goal_embedding = get_embedding(goal_page.text)

    current_page = start
    path = [current_page.url]
    visited = {current_page.url}

    while True:
        # A direct link to the goal ends the search immediately.
        if goal_url in current_page.links:
            path.append(goal_url)
            return path

        # Pick the unvisited neighbour whose text is semantically closest
        # to the goal.
        min_distance = float('inf')
        next_page = None
        for link in current_page.links:
            if link in visited:
                continue
            candidate = WikiPage(link.split('/wiki/')[1], link)
            similarity = util.cos_sim(get_embedding(candidate.text), goal_embedding).item()
            distance = 1 - similarity
            # Distances of exactly 0 (or below) are skipped to avoid self-loops.
            if 0 < distance < min_distance:
                min_distance = distance
                next_page = candidate

        if next_page is None:
            print("Stuck, unable to find a valid next page. Terminating search.")
            break

        # Reuse the candidate object instead of constructing the page again.
        current_page = next_page
        path.append(current_page.url)
        visited.add(current_page.url)

        max_links = distance_to_max_links(min_distance)
        print(f"Moved to {current_page.title}  |  Distance: {min_distance}  |  RandomLinks:{max_links}")
        distances[current_page.url] = min_distance

        # Re-sample the page's links with a budget that depends on how close
        # the page is to the goal.
        current_page.links = current_page.get_links(max_links)

    return []


def get_title_from_url(url: str) -> str:
    """Return the part of ``url`` after its last ``/wiki/`` segment.

    When ``url`` contains no ``/wiki/`` segment it is returned unchanged.
    """
    _, _, title = url.rpartition('/wiki/')
    return title

# Example usage: walk from the start article towards the goal article, then
# print each URL on the returned path with the distance recorded for it.
start_title = "2024_United_States_presidential_election"
goal_title = "LinkedIn"
start_url = "https://en.wikipedia.org/wiki/" + start_title

start_page = WikiPage(start_title, start_url)
path = find_goal_wikipedia(start_page, goal_title)

for url in path:
    dist, title = distances[url], get_title_from_url(url)
    print(f"vector distance = {dist:.2f}   |   {title}")
    print('V')
print("Done!")


Starting from 2024_United_States_presidential_election
Moved to FiveThirtyEight  |  Distance: 0.7328414916992188  |  RandomLinks:25
Moved to WHOIS  |  Distance: 0.7304809987545013  |  RandomLinks:25
Moved to World_Wide_Web  |  Distance: 0.6202999353408813  |  RandomLinks:30
Moved to HReview  |  Distance: 0.6110538244247437  |  RandomLinks:30
Moved to Solid_(web_decentralization_project)  |  Distance: 0.4586864113807678  |  RandomLinks:50
Moved to Distributed_social_network  |  Distance: 0.5660229921340942  |  RandomLinks:30
Moved to Social_web  |  Distance: 0.4944614768028259  |  RandomLinks:50
Moved to Mobile_social_network  |  Distance: 0.5505470931529999  |  RandomLinks:30
Moved to Twitter  |  Distance: 0.5633731484413147  |  RandomLinks:30
Moved to Yahoo!_Kickstart  |  Distance: 0.4338139295578003  |  RandomLinks:50
vector distance = 1.00   |   2024_United_States_presidential_election
|
V
vector distance = 0.73   |   FiveThirtyEight
|
V
vector distance = 0.73   |   WHOIS
|
V
vector