# Simple crawler

# Installing requirements

In [20]:
!pip install requests beautifulsoup4 markdownify lxml pandas

Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.23.2 (from pandas)
  Using cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m971.5 kB/s[0m eta [36m0:00:00[0m:01[0m0:00:01[0m
[?25hUsing cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl (16.9 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4

# Understaning the HTML structure

## Extracting html content

In [1]:
import requests
from IPython.display import HTML, display

url = 'https://quotes.toscrape.com/'

response = requests.get(url)
response.raise_for_status()
with open("quotes.html", "w", encoding="utf-8") as f:
    f.write(response.text)


## displaying HTML

In [2]:
display(HTML(response.text))

## Convert HTML to Markdown

In [3]:
from markdownify import markdownify as md
md_content = md(response.text)
with open("quotes.md", "w", encoding='utf-8') as f:
    f.write(md_content)

# Parsing HTML with BeautifulSoup

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

## Extract all links from the page

In [5]:
import json
links = []
for a in soup.find_all('a', href=True):
    link_text = a.get_text(strip=True)
    link_url = a['href']
    links.append({'text': link_text, 'url': link_url})
with open("links.json", "w") as f:
    json.dump(links, f, ensure_ascii=False, indent=2)


## extract absolute urls
from urllib.parse import urljoin

absolute_links = []
base_url = response.url if hasattr(response, 'url') else "https://quotes.toscrape.com/"
for a in soup.find_all('a', href=True):
    link_text = a.get_text(strip=True)
    link_url = urljoin(base_url, a['href'])
    absolute_links.append({'text': link_text, 'url': link_url})
with open("links_absolute.json", "w", encoding='utf-8') as f:
    json.dump(absolute_links, f, ensure_ascii=False, indent=2)


## Extracting quotes

In [49]:
quotes_data = []
for quote_div in soup.find_all("div", class_="quote"):
    # Extract quote text
    text_span = quote_div.find("span", class_="text")
    quote_text = text_span.get_text(strip=True) if text_span else ""
    # Extract author
    author_small = quote_div.find("small", class_="author")
    author = author_small.get_text(strip=True) if author_small else ""
    # Extract tags
    tags_div = quote_div.find("div", class_="tags")
    tags = []
    if tags_div:
        tags = [a.get_text(strip=True) for a in tags_div.find_all("a", class_="tag")]
    quotes_data.append({
        "text": quote_text,
        "author": author,
        "tags": tags
    })

with open("quotes.json", "w", encoding="utf-8") as f:
    json.dump(quotes_data, f, ensure_ascii=False, indent=2)


## Extract the Top ten tags

In [50]:
# Find the "Top Ten tags" box in the HTML and extract the tags and their URLs
top_ten_tags = []
tags_box = soup.find("div", class_="tags-box")
if tags_box:
    for tag_item in tags_box.find_all("span", class_="tag-item"):
        tag_a = tag_item.find("a", class_="tag")
        if tag_a:
            tag_text = tag_a.get_text(strip=True)
            tag_url = tag_a["href"]
            top_ten_tags.append({"tag": tag_text, "url": tag_url})

with open("top_ten_tags.json", "w", encoding="utf-8") as f:
    json.dump(top_ten_tags, f, ensure_ascii=False, indent=2)


# Parsing by beautifulsoap in lxml mode

In [7]:
import lxml
soup = BeautifulSoup(response.text, 'lxml')

In [8]:
# Extract quotes, authors, and tags using lxml parser
quotes_data_lxml = []
quotes_lxml = soup.find_all("div", class_="quote")
for quote in quotes_lxml:
    quote_text = quote.find("span", class_="text")
    author = quote.find("small", class_="author")
    tags_div = quote.find("div", class_="tags")
    text = quote_text.get_text(strip=True) if quote_text else ""
    author_name = author.get_text(strip=True) if author else ""
    tags = []
    if tags_div:
        tags = [a.get_text(strip=True) for a in tags_div.find_all("a", class_="tag")]
    quotes_data_lxml.append({
        "text": text,
        "author": author_name,
        "tags": tags
    })

# Extract top ten tags using lxml parser
top_ten_tags_lxml = []
tags_box_lxml = soup.find("div", class_="tags-box")
if tags_box_lxml:
    for tag_item in tags_box_lxml.find_all("span", class_="tag-item"):
        tag_a = tag_item.find("a", class_="tag")
        if tag_a:
            tag_text = tag_a.get_text(strip=True)
            tag_url = tag_a["href"]
            top_ten_tags_lxml.append({"tag": tag_text, "url": tag_url})


# Working with css selectors

In [11]:
quotes = soup.select('div.quote span.text')
for quote in quotes:
    print(quote.get_text(strip=True))

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


# Working with XPath

In [10]:
from lxml import html
tree = html.fromstring(response.text)
quotes = tree.xpath('//div[@class="quote"]/span[@class="text"]/text()')
for quote in quotes:
    print(quote.strip())

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


# Crawling whole website

In [13]:
import requests
from bs4 import BeautifulSoup

base_url = "http://quotes.toscrape.com"
all_quotes = []

next_page_url = "/"
while next_page_url:
    url = base_url + next_page_url
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    quotes = soup.find_all("div", class_="quote")
    for quote in quotes:
        text = quote.find("span", class_="text").get_text(strip=True)
        author = quote.find("small", class_="author").get_text(strip=True)
        tags = [tag.get_text(strip=True) for tag in quote.find_all("a", class_="tag")]
        all_quotes.append({
            "text": text,
            "author": author,
            "tags": tags
        })
    next_btn = soup.find("li", class_="next")
    if next_btn and next_btn.a:
        next_page_url = next_btn.a["href"]
    else:
        next_page_url = None

# Save the number of quotes and a sample to files
import json

with open("all_quotes.json", "w", encoding="utf-8") as f:
    json.dump(all_quotes, f, ensure_ascii=False, indent=2)


# Crawling quotes website without using the 'more' or 'next' buttons

In [18]:
import requests
from bs4 import BeautifulSoup

base_url = "https://quotes.toscrape.com/page/"
page = 1
while True:
    response = requests.get(f"{base_url}{page}/")
    if response.status_code == 404:
        break
    soup = BeautifulSoup(response.text, 'lxml')
    quotes = soup.select('div.quote span.text')
    num_quotes = 0
    for quote in quotes:
        num_quotes += 1

    print(f"{num_quotes} found in page {page}")
    page += 1

print(f"# Pages: {page}")

10 found in page 1
10 found in page 2
10 found in page 3
10 found in page 4
10 found in page 5
10 found in page 6
10 found in page 7
10 found in page 8
10 found in page 9
10 found in page 10
0 found in page 11
0 found in page 12
0 found in page 13
0 found in page 14
0 found in page 15
0 found in page 16
0 found in page 17
0 found in page 18
0 found in page 19
0 found in page 20
0 found in page 21
0 found in page 22
0 found in page 23
0 found in page 24
0 found in page 25
0 found in page 26
0 found in page 27
0 found in page 28
0 found in page 29
0 found in page 30
0 found in page 31


KeyboardInterrupt: 

# Crawling hackernews

In [15]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

# Check robots.txt for allowed paths
robots_url = "https://news.ycombinator.com/robots.txt"
robots_txt = requests.get(robots_url).text
print("robots.txt:\n", robots_txt)

# According to robots.txt, crawling "/" is allowed for all user-agents.

base_url = "https://news.ycombinator.com/"
page_url = base_url
all_stories = []
max_pages = 30  # Limit to 3 pages to avoid overloading

for i in range(max_pages):
    resp = requests.get(page_url)
    soup = BeautifulSoup(resp.text, "lxml")
    rows = soup.find_all("tr", class_="athing")
    for row in rows:
        title = row.find("a", class_="storylink")
        if not title:
            title = row.find("span", class_="titleline")
            if title:
                title = title.find("a")
        story = {
            "title": title.get_text(strip=True) if title else "",
            "url": title["href"] if title and title.has_attr("href") else "",
            "id": row.get("id")
        }
        all_stories.append(story)
    # Find "More" link for next page
    more = soup.find("a", string="More")
    if more and more.has_attr("href"):
        page_url = urljoin(base_url, more["href"])
        time.sleep(1)  # Be polite
    else:
        break

# Save results
with open("hackernews_stories.json", "w", encoding="utf-8") as f:
    json.dump(all_stories, f, ensure_ascii=False, indent=2)

print(f"Crawled {len(all_stories)} stories from Hacker News.")


robots.txt:
 User-Agent: *
Crawl-delay: 30
Disallow: /collapse?
Disallow: /context?
Disallow: /fave?
Disallow: /flag?
Disallow: /hide?
Disallow: /login
Disallow: /logout
Disallow: /r?
Disallow: /reply?
Disallow: /submitlink?
Disallow: /vote?
Disallow: /x?

Crawled 900 stories from Hacker News.


# Introducing re

In [19]:
import re

# Example: Extract all story IDs that are purely numeric using re from the crawled data
with open("hackernews_stories.json", "r", encoding="utf-8") as f:
    stories = json.load(f)

numeric_id_stories = []
for story in stories:
    if story["id"] and re.fullmatch(r"\d+", story["id"]):
        numeric_id_stories.append(story)

print(f"Found {len(numeric_id_stories)} stories with purely numeric IDs.")

# Example: Find all stories whose title contains the word 'Python' (case-insensitive)
python_stories = [s for s in stories if re.search(r"\bpython\b", s["title"], re.IGNORECASE)]
print(f"Found {len(python_stories)} stories with 'Python' in the title.")


Found 900 stories with purely numeric IDs.
Found 6 stories with 'Python' in the title.


# Conversion to proper data formats

In [21]:
import pandas as pd

# Load the crawled data from the quotes website
with open("all_quotes.json", "r", encoding="utf-8") as f:
    quotes_data = json.load(f)

# Convert the data into a pandas DataFrame
quotes_df = pd.DataFrame(quotes_data)

# Display the first few rows of the DataFrame
print(quotes_df.head())


                                                text           author  \
0  “The world as we have created it is a process ...  Albert Einstein   
1  “It is our choices, Harry, that show what we t...     J.K. Rowling   
2  “There are only two ways to live your life. On...  Albert Einstein   
3  “The person, be it gentleman or lady, who has ...      Jane Austen   
4  “Imperfection is beauty, madness is genius and...   Marilyn Monroe   

                                             tags  
0        [change, deep-thoughts, thinking, world]  
1                            [abilities, choices]  
2  [inspirational, life, live, miracle, miracles]  
3              [aliteracy, books, classic, humor]  
4                    [be-yourself, inspirational]  


In [23]:
# Save the DataFrame to a SQLite database
import sqlite3

# Convert any columns containing lists to strings (e.g., join lists with commas)
import numpy as np

def list_to_str(val):
    if isinstance(val, list):
        return ", ".join(str(x) for x in val)
    return val

quotes_df_clean = quotes_df.applymap(list_to_str)

# Create a connection to a new SQLite database (or connect if it exists)
conn = sqlite3.connect("quotes.db")

# Save the cleaned DataFrame to the SQLite database in a table named 'quotes'
quotes_df_clean.to_sql("quotes", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

# Save the cleaned DataFrame to a CSV file
quotes_df_clean.to_csv("quotes.csv", index=False)


  quotes_df_clean = quotes_df.applymap(list_to_str)
