In [1]:
!pip install requests beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup
import re

In [3]:
def fetch_wiki_text(url, timeout=10):
  """Download a page and return its paragraph text, normalized for word comparison.

  The page is requested with ``requests.get``; the text of every ``<p>``
  element is joined with single spaces, each run of non-word characters is
  replaced by one space, and the result is lower-cased.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, ``requests`` waits indefinitely on a stalled connection.

  Returns:
    The normalized text, or an empty string when the request fails
    (connection error, timeout, or a status code other than 200).
  """
  try:
    response = requests.get(url, timeout=timeout)
  except requests.RequestException as exc:
    # Connection errors and timeouts are reported the same way as a bad
    # status code, so one unreachable page does not abort the whole run.
    print(f"Failed to fetch {url}: {exc}")
    return ""

  if response.status_code != 200:
    print(f"Failed to fetch {url}")
    return ""

  soup = BeautifulSoup(response.content, "html.parser")

  # Only <p> elements are read; text outside paragraphs is ignored.
  paragraphs = soup.find_all('p')
  text = " ".join(para.get_text() for para in paragraphs)

  # Collapse punctuation and whitespace into single spaces, then lower-case
  # so that later word comparison is case-insensitive.
  return re.sub(r'\W+', ' ', text).lower()

In [4]:
def tokenize_text(text):
  """Return the set of distinct whitespace-separated words in ``text``."""
  words = text.split()
  return {word for word in words}

In [5]:
def jaccard_similarity(set1, set2):
  """Return the Jaccard similarity of two sets.

  The similarity is the size of the intersection divided by the size of
  the union, a value between 0 and 1.

  Args:
    set1: First set of items.
    set2: Second set of items.

  Returns:
    The ratio ``len(set1 & set2) / len(set1 | set2)``. When both sets are
    empty the ratio is undefined; 0.0 is returned by convention instead of
    raising ``ZeroDivisionError``.
  """
  union = len(set1.union(set2))
  if union == 0:
    # Both inputs are empty (e.g. two texts with no words), so there is
    # nothing to compare; report no similarity rather than dividing by zero.
    return 0.0
  intersection = len(set1.intersection(set2))
  return intersection/union

In [8]:
# Pages whose word sets are compared pairwise further down.
urls = [
    "https://en.wikipedia.org/wiki/Apollo_8",
    "https://en.wikipedia.org/wiki/Machine_learning",
    "https://en.wikipedia.org/wiki/Artificial_intelligence",
]

# Download the normalized text of each page, then reduce each text to its
# set of distinct words. Both lists keep the same order as `urls`.
texts = list(map(fetch_wiki_text, urls))
tokens = list(map(tokenize_text, texts))

In [10]:
# Report the similarity of every unordered pair of pages exactly once;
# pages are numbered from 1 in the order they appear in `tokens`.
print("The Jaccard similarity between pages: ")
page_count = len(tokens)
for i, first_page in enumerate(tokens):
  for j in range(i+1, page_count):
    similarity = jaccard_similarity(first_page, tokens[j])
    print(f"Page {i+1} and Page {j+1}: {similarity:.2f}")

The Jaccard similarity between pages: 
Page 1 and Page 2: 0.18
Page 1 and Page 3: 0.19
Page 2 and Page 3: 0.28
