In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
# Section 1: Web Crawler

# Seed URL
seed_url = ("https://www.airbnb.com/")


output_dir = "crawled_pages"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

visited_urls = set()

url_queue = [seed_url]

# Delay between requests to avoid overloading the server
delay = 2  # seconds

def save_page_content(url, content):
    filename = os.path.join(output_dir, f"{len(visited_urls)}.txt")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)

# Function to crawl the web
def crawl_web():
    while len(visited_urls) < 500 and url_queue:
        url = url_queue.pop(0)
        if url not in visited_urls:
            try:
                print(f"Crawling: {url}")
                response = requests.get(url)
                if response.status_code == 200:
                    content = response.text
                    save_page_content(url, content)
                    visited_urls.add(url)
                    soup = BeautifulSoup(content, "html.parser")
                    for link in soup.find_all("a", href=True):
                        full_url = urljoin(url, link["href"])
                        if full_url.startswith(seed_url) and full_url not in visited_urls:
                            url_queue.append(full_url)
                    time.sleep(delay)
            except Exception as e:
                print(f"Error crawling {url}: {e}")

crawl_web()

Crawling: https://www.airbnb.com/
Crawling: https://www.airbnb.com/#site-content
Crawling: https://www.airbnb.com/host/homes
Crawling: https://www.airbnb.com/#simple-header-locale-menu-combined
Crawling: https://www.airbnb.com/#simple-header-profile-menu
Crawling: https://www.airbnb.com/signup_login
Crawling: https://www.airbnb.com/login
Crawling: https://www.airbnb.com/giftcards
Crawling: https://www.airbnb.com/host/experiences?from_nav=1
Crawling: https://www.airbnb.com/help
Crawling: https://www.airbnb.com/rooms/24991868?adults=1&check_in=2025-03-30&check_out=2025-04-04&children=0&infants=0&pets=0&photo_id=896147277&source_impression_id=p3_1740041506_P3zMQNd02qeHnIQg&previous_page_section_name=1000&federated_search_id=197f4e79-1f67-4cb2-be1e-375ad5a344ca
Crawling: https://www.airbnb.com/rooms/43855768?adults=1&check_in=2025-03-02&check_out=2025-03-07&children=0&infants=0&pets=0&photo_id=1085548392&source_impression_id=p3_1740041506_P3k67qJOseSnOIGh&previous_page_section_name=1000&fe

In [6]:
# Section 2: Indexing
import nltk

nltk.download('punkt_tab') # Download 'punkt_tab' data package
nltk.download('stopwords')

import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def tokenize_and_normalize(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

# Inverted index
inverted_index = defaultdict(list)

# Function to build inverted index
def build_inverted_index():
    for doc_id in range(len(visited_urls)):
        filename = os.path.join(output_dir, f"{doc_id}.txt")
        with open(filename, "r", encoding="utf-8") as file:
            content = file.read()
            tokens = tokenize_and_normalize(content)
            for token in tokens:
                inverted_index[token].append(doc_id)

# Build the inverted index
build_inverted_index()

# Save the inverted index to a file
with open("inverted_index.txt", "w", encoding="utf-8") as file:
    for token, doc_ids in inverted_index.items():
        file.write(f"{token}: {', '.join(map(str, doc_ids))}\n")

print("Crawling and indexing completed.")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Crawling and indexing completed.


In [7]:
from google.colab import files

# Download the inverted index file
if os.path.exists("inverted_index.txt"):
    files.download("inverted_index.txt")
else:
    print("inverted_index.txt does not exist.")

# Download the crawled pages (zip the directory first)
if os.path.exists("crawled_pages"):
    !zip -r crawled_pages.zip crawled_pages
    files.download("crawled_pages.zip")
else:
    print("crawled_pages directory does not exist.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: crawled_pages/ (stored 0%)
  adding: crawled_pages/252.txt (deflated 82%)
  adding: crawled_pages/492.txt (deflated 88%)
  adding: crawled_pages/463.txt (deflated 88%)
  adding: crawled_pages/182.txt (deflated 81%)
  adding: crawled_pages/476.txt (deflated 88%)
  adding: crawled_pages/56.txt (deflated 88%)
  adding: crawled_pages/335.txt (deflated 88%)
  adding: crawled_pages/230.txt (deflated 89%)
  adding: crawled_pages/386.txt (deflated 88%)
  adding: crawled_pages/444.txt (deflated 91%)
  adding: crawled_pages/153.txt (deflated 81%)
  adding: crawled_pages/42.txt (deflated 87%)
  adding: crawled_pages/455.txt (deflated 88%)
  adding: crawled_pages/14.txt (deflated 81%)
  adding: crawled_pages/170.txt (deflated 81%)
  adding: crawled_pages/139.txt (deflated 90%)
  adding: crawled_pages/407.txt (deflated 88%)
  adding: crawled_pages/296.txt (deflated 82%)
  adding: crawled_pages/416.txt (deflated 88%)
  adding: crawled_pages/419.txt (deflated 88%)
  adding: crawled_pages/97

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>