In [12]:
import requests
from bs4 import BeautifulSoup
import certifi

url = "https://kbtu.edu.kz/ru/"

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

response = requests.get(url, headers=headers, verify=False)

print(response)
soup = BeautifulSoup(response.text, "html.parser")

# Find elements
title = soup.find("h1").text
links = [a["href"] for a in soup.find_all("a", href=True)]
all_paragraphs = [p.text for p in soup.find_all("p")]

print(title)

<Response [200]>
События




In [14]:
print(response.text)

<!DOCTYPE html>
<html lang="ru-ru" dir="ltr">

<head>
    <meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<meta name="description" content="KBTU - Лучший Технический университет Казахстана - образование на английском языке в Алматы - диплом международного уровня - техническое образование в Алматы - Тел. ☎ +7( 727) 357 42 51">
	<meta name="generator" content="Joomla! - Open Source Content Management">
	<title>Казахстанско-Британский технический университет | КБТУ</title>
	<link href="/ru/?format=feed&amp;type=rss" rel="alternate" type="application/rss+xml" title="Казахстанско-Британский технический университет | КБТУ">
	<link href="/ru/?format=feed&amp;type=atom" rel="alternate" type="application/atom+xml" title="Казахстанско-Британский технический университет | КБТУ">
	<link href="https://kbtu.edu.kz/kz/" rel="alternate" hreflang="kz-KZ">
	<link href="https://kbtu.edu.kz/ru/" rel="alternate" hreflang="ru-RU">
	<link href="https://kbtu.edu.k

In [15]:
soup

<!DOCTYPE html>

<html dir="ltr" lang="ru-ru">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="KBTU - Лучший Технический университет Казахстана - образование на английском языке в Алматы - диплом международного уровня - техническое образование в Алматы - Тел. ☎ +7( 727) 357 42 51" name="description"/>
<meta content="Joomla! - Open Source Content Management" name="generator"/>
<title>Казахстанско-Британский технический университет | КБТУ</title>
<link href="/ru/?format=feed&amp;type=rss" rel="alternate" title="Казахстанско-Британский технический университет | КБТУ" type="application/rss+xml"/>
<link href="/ru/?format=feed&amp;type=atom" rel="alternate" title="Казахстанско-Британский технический университет | КБТУ" type="application/atom+xml"/>
<link href="https://kbtu.edu.kz/kz/" hreflang="kz-KZ" rel="alternate"/>
<link href="https://kbtu.edu.kz/ru/" hreflang="ru-RU" rel="alternate"/>
<link href="https://kbtu.edu.kz/en/

In [19]:
"""
Upgraded Web Scraper
- Scrapes a URL and saves raw HTML to disk
- Converts HTML to clean Markdown
- Organizes output into a folder structure
- Handles SSL issues (common with .kz domains)
"""

import requests
import urllib3
import re
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Suppress SSL warnings (for sites with broken certs like kbtu.edu.kz)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# ─── Config ────────────────────────────────────────────────────────────────────

OUTPUT_DIR = Path("scraped_output")  # Base folder for all saved files
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


# ─── Core Functions ─────────────────────────────────────────────────────────────

def fetch_page(url: str) -> tuple[str, str]:
    """Fetch a page and return (html_content, final_url)."""
    response = requests.get(url, headers=HEADERS, verify=False, timeout=15)
    response.raise_for_status()
    return response.text, response.url


def make_output_folder(url: str) -> Path:
    """Create a unique output folder based on domain + timestamp."""
    domain = urlparse(url).netloc.replace(".", "_") + "_" + urlparse(url).path.replace("/", "_")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    folder = OUTPUT_DIR / f"{domain}_{timestamp}"
    folder.mkdir(parents=True, exist_ok=True)
    return folder


def save_html(html: str, folder: Path) -> Path:
    """Save raw HTML to disk."""
    path = folder / "page.html"
    path.write_text(html, encoding="utf-8")
    print(f"  [✓] HTML saved → {path}")
    return path


def html_to_markdown(html: str) -> str:
    """Convert HTML content to clean Markdown."""
    soup = BeautifulSoup(html, "html.parser")

    # Remove noise tags
    for tag in soup(["script", "style", "nav", "footer", "head", "noscript", "iframe"]):
        tag.decompose()

    lines = []

    def process(element):
        if isinstance(element, str):
            text = element.strip()
            if text:
                lines.append(text)
            return

        tag = element.name

        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            level = int(tag[1])
            text = element.get_text(separator=" ", strip=True)
            if text:
                lines.append(f"\n{'#' * level} {text}\n")

        elif tag == "p":
            text = element.get_text(separator=" ", strip=True)
            if text:
                lines.append(f"\n{text}\n")

        elif tag in ("ul", "ol"):
            for i, li in enumerate(element.find_all("li", recursive=False), 1):
                text = li.get_text(separator=" ", strip=True)
                bullet = f"{i}." if tag == "ol" else "-"
                if text:
                    lines.append(f"{bullet} {text}")
            lines.append("")

        elif tag == "a":
            href = element.get("href", "")
            text = element.get_text(strip=True)
            if text and href:
                lines.append(f"[{text}]({href})")
            elif text:
                lines.append(text)

        elif tag in ("strong", "b"):
            text = element.get_text(strip=True)
            if text:
                lines.append(f"**{text}**")

        elif tag in ("em", "i"):
            text = element.get_text(strip=True)
            if text:
                lines.append(f"*{text}*")

        elif tag == "blockquote":
            text = element.get_text(separator=" ", strip=True)
            if text:
                lines.append(f"\n> {text}\n")

        elif tag in ("code", "pre"):
            text = element.get_text(strip=True)
            if text:
                lines.append(f"\n```\n{text}\n```\n")

        elif tag == "hr":
            lines.append("\n---\n")

        elif tag == "br":
            lines.append("")

        elif tag == "img":
            alt = element.get("alt", "image")
            src = element.get("src", "")
            if src:
                lines.append(f"![{alt}]({src})")

        elif tag == "table":
            rows = element.find_all("tr")
            for row_idx, row in enumerate(rows):
                cells = row.find_all(["th", "td"])
                row_text = " | ".join(c.get_text(strip=True) for c in cells)
                lines.append(f"| {row_text} |")
                if row_idx == 0:
                    separator = " | ".join(["---"] * len(cells))
                    lines.append(f"| {separator} |")
            lines.append("")

        else:
            for child in element.children:
                process(child)

    body = soup.find("body") or soup
    for child in body.children:
        process(child)

    # Clean up excessive blank lines
    markdown = "\n".join(lines)
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    return markdown.strip()


def save_markdown(markdown: str, folder: Path) -> Path:
    """Save Markdown content to disk."""
    path = folder / "page.md"
    path.write_text(markdown, encoding="utf-8")
    print(f"  [✓] Markdown saved → {path}")
    return path


def save_metadata(url: str, soup: BeautifulSoup, folder: Path) -> Path:
    """Save basic page metadata to a text file."""
    title = soup.find("title")
    description = soup.find("meta", attrs={"name": "description"})
    
    meta_lines = [
        f"URL: {url}",
        f"Scraped at: {datetime.now().isoformat()}",
        f"Title: {title.get_text(strip=True) if title else 'N/A'}",
        f"Description: {description['content'] if description and description.get('content') else 'N/A'}",
    ]
    
    path = folder / "metadata.txt"
    path.write_text("\n".join(meta_lines), encoding="utf-8")
    print(f"  [✓] Metadata saved → {path}")
    return path


# ─── Main Scraper ───────────────────────────────────────────────────────────────

def scrape(url: str):
    """Full scraping pipeline: fetch → save HTML → convert to Markdown → save."""
    print(f"\n🔍 Scraping: {url}")
    
    # 1. Fetch
    html, final_url = fetch_page(url)
    soup = BeautifulSoup(html, "html.parser")
    print(f"  [✓] Page fetched ({len(html):,} bytes)")

    # 2. Create output folder
    folder = make_output_folder(final_url)
    print(f"  [✓] Output folder → {folder}")

    # 3. Save raw HTML
    save_html(html, folder)

    # 4. Convert HTML → Markdown and save
    markdown = html_to_markdown(html)
    save_markdown(markdown, folder)

    # 5. Save metadata
    save_metadata(final_url, soup, folder)

    print(f"\n✅ Done! All files saved to: {folder.resolve()}\n")
    return folder


# ─── Run ────────────────────────────────────────────────────────────────────────

urls = [
    "https://kbtu.edu.kz/ru/internatsionalizatsiya/mobilnost-internatsionalizatsiya",
    # Add more URLs here
]

for url in urls:
    try:
        scrape(url)
    except Exception as e:
        print(f"  [✗] Failed to scrape {url}: {e}")


🔍 Scraping: https://kbtu.edu.kz/ru/internatsionalizatsiya/mobilnost-internatsionalizatsiya
  [✓] Page fetched (43,449 bytes)
  [✓] Output folder → scraped_output/kbtu_edu_kz__ru_internatsionalizatsiya_mobilnost-internatsionalizatsiya_20260225_170018
  [✓] HTML saved → scraped_output/kbtu_edu_kz__ru_internatsionalizatsiya_mobilnost-internatsionalizatsiya_20260225_170018/page.html
  [✓] Markdown saved → scraped_output/kbtu_edu_kz__ru_internatsionalizatsiya_mobilnost-internatsionalizatsiya_20260225_170018/page.md
  [✓] Metadata saved → scraped_output/kbtu_edu_kz__ru_internatsionalizatsiya_mobilnost-internatsionalizatsiya_20260225_170018/metadata.txt

✅ Done! All files saved to: /Users/nurma/vscode_projects/Uni-Agent/research/scraped_output/kbtu_edu_kz__ru_internatsionalizatsiya_mobilnost-internatsionalizatsiya_20260225_170018



In [22]:
"""
KBTU Site Crawler
- Starts from a seed URL
- Finds all internal links and crawls them recursively (BFS)
- Saves HTML + Markdown + metadata for every page
- Uses your updated folder naming: domain + path + timestamp
- Respects depth limit and max page limit to avoid infinite crawling
- Skips already-visited pages (no duplicates)
"""

import requests
import urllib3
import re
import time
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse, urljoin, urldefrag
from bs4 import BeautifulSoup
from collections import deque

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# ─── Config ────────────────────────────────────────────────────────────────────

SEED_URL       = "https://kbtu.edu.kz/ru/"
ALLOWED_DOMAIN = "kbtu.edu.kz"          # Only follow links on this domain
OUTPUT_DIR     = Path("kbtu_crawl")     # Root output folder
MAX_PAGES      = 10000                    # Safety limit — max pages to crawl
MAX_DEPTH      = 100                      # How deep to follow links (1 = seed only)
DELAY_SECONDS  = 1.0                    # Polite delay between requests (be nice to the server)
HEADERS        = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# File extensions to skip — these are not HTML pages
SKIP_EXTENSIONS = {
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    ".zip", ".rar", ".jpg", ".jpeg", ".png", ".gif", ".svg",
    ".mp4", ".mp3", ".avi", ".css", ".js", ".xml", ".json"
}


# ─── URL Helpers ───────────────────────────────────────────────────────────────

def normalize_url(url: str) -> str:
    """Remove fragments (#section) and trailing slashes for clean deduplication."""
    url, _ = urldefrag(url)
    return url.rstrip("/")


def is_valid_link(url: str) -> bool:
    """Check if a link belongs to the allowed domain and points to an HTML page."""
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False
    if ALLOWED_DOMAIN not in parsed.netloc:
        return False
    ext = Path(parsed.path).suffix.lower()
    if ext in SKIP_EXTENSIONS:
        return False
    return True


# ─── Folder Naming (your updated version) ─────────────────────────────────────

def make_output_folder(url: str) -> Path:
    """Create a unique output folder based on domain + path + timestamp."""
    parsed = urlparse(url)
    domain = parsed.netloc.replace(".", "_")
    # Clean up path: replace slashes, remove empty segments
    path_part = parsed.path.replace("/", "_").strip("_")
    folder_name = f"{domain}_{path_part}" if path_part else domain
    # Sanitize: remove unsafe characters and collapse underscores
    folder_name = re.sub(r"[^\w\-]", "_", folder_name)
    folder_name = re.sub(r"_+", "_", folder_name).strip("_")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    folder = OUTPUT_DIR / f"{folder_name}_{timestamp}"
    folder.mkdir(parents=True, exist_ok=True)
    return folder


# ─── Fetching ──────────────────────────────────────────────────────────────────

def fetch_page(url: str) -> str | None:
    """Fetch a URL and return raw HTML, or None on failure."""
    try:
        response = requests.get(url, headers=HEADERS, verify=False, timeout=15)
        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            print(f"    [~] Skipped (not HTML): {url}")
            return None
        return response.text
    except Exception as e:
        print(f"    [✗] Failed: {e}")
        return None


def extract_links(html: str, base_url: str) -> list[str]:
    """Pull all valid internal links out of a page."""
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for tag in soup.find_all("a", href=True):
        href = tag["href"].strip()
        absolute = urljoin(base_url, href)   # Handle relative URLs like /en/about
        normalized = normalize_url(absolute)
        if is_valid_link(normalized):
            links.add(normalized)
    return list(links)


# ─── HTML → Markdown ───────────────────────────────────────────────────────────

def html_to_markdown(html: str) -> str:
    """Convert HTML to clean readable Markdown."""
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup(["script", "style", "nav", "footer", "head", "noscript", "iframe"]):
        tag.decompose()

    lines = []

    def process(element):
        if isinstance(element, str):
            text = element.strip()
            if text:
                lines.append(text)
            return

        tag = element.name
        if tag is None:
            return

        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            level = int(tag[1])
            text = element.get_text(separator=" ", strip=True)
            if text:
                lines.append(f"\n{'#' * level} {text}\n")

        elif tag == "p":
            text = element.get_text(separator=" ", strip=True)
            if text:
                lines.append(f"\n{text}\n")

        elif tag in ("ul", "ol"):
            for i, li in enumerate(element.find_all("li", recursive=False), 1):
                text = li.get_text(separator=" ", strip=True)
                bullet = f"{i}." if tag == "ol" else "-"
                if text:
                    lines.append(f"{bullet} {text}")
            lines.append("")

        elif tag == "a":
            href = element.get("href", "")
            text = element.get_text(strip=True)
            if text and href:
                lines.append(f"[{text}]({href})")
            elif text:
                lines.append(text)

        elif tag in ("strong", "b"):
            text = element.get_text(strip=True)
            if text:
                lines.append(f"**{text}**")

        elif tag in ("em", "i"):
            text = element.get_text(strip=True)
            if text:
                lines.append(f"*{text}*")

        elif tag == "blockquote":
            text = element.get_text(separator=" ", strip=True)
            if text:
                lines.append(f"\n> {text}\n")

        elif tag in ("code", "pre"):
            text = element.get_text(strip=True)
            if text:
                lines.append(f"\n```\n{text}\n```\n")

        elif tag == "hr":
            lines.append("\n---\n")

        elif tag == "img":
            alt = element.get("alt", "image")
            src = element.get("src", "")
            if src:
                lines.append(f"![{alt}]({src})")

        elif tag == "table":
            rows = element.find_all("tr")
            for row_idx, row in enumerate(rows):
                cells = row.find_all(["th", "td"])
                row_text = " | ".join(c.get_text(strip=True) for c in cells)
                lines.append(f"| {row_text} |")
                if row_idx == 0:
                    separator = " | ".join(["---"] * len(cells))
                    lines.append(f"| {separator} |")
            lines.append("")

        else:
            for child in element.children:
                process(child)

    body = soup.find("body") or soup
    for child in body.children:
        process(child)

    markdown = "\n".join(lines)
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    return markdown.strip()


# ─── Saving ────────────────────────────────────────────────────────────────────

def save_html(html: str, folder: Path):
    path = folder / "page.html"
    path.write_text(html, encoding="utf-8")
    print(f"    [✓] HTML   → {path}")


def save_markdown(markdown: str, folder: Path):
    path = folder / "page.md"
    path.write_text(markdown, encoding="utf-8")
    print(f"    [✓] MD     → {path}")


def save_metadata(url: str, soup: BeautifulSoup, folder: Path, depth: int):
    title = soup.find("title")
    description = soup.find("meta", attrs={"name": "description"})
    content = "\n".join([
        f"URL:         {url}",
        f"Scraped at:  {datetime.now().isoformat()}",
        f"Depth:       {depth}",
        f"Title:       {title.get_text(strip=True) if title else 'N/A'}",
        f"Description: {description['content'] if description and description.get('content') else 'N/A'}",
    ])
    path = folder / "metadata.txt"
    path.write_text(content, encoding="utf-8")
    print(f"    [✓] Meta   → {path}")


# ─── Crawler (BFS) ─────────────────────────────────────────────────────────────

def crawl():
    """
    BFS crawler — visits pages level by level so shallower pages
    are always scraped before deeper ones.
    Queue items: (url, depth)
    """
    visited    = set()
    queue      = deque([(normalize_url(SEED_URL), 0)])
    pages_done = 0

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    crawl_log  = OUTPUT_DIR / "crawl_log.txt"
    log_lines  = [
        f"Crawl started : {datetime.now().isoformat()}",
        f"Seed URL      : {SEED_URL}",
        f"Domain filter : {ALLOWED_DOMAIN}",
        f"Max pages     : {MAX_PAGES}",
        f"Max depth     : {MAX_DEPTH}",
        ""
    ]

    print(f"\n🕷️  Starting crawl from : {SEED_URL}")
    print(f"   Domain filter  : {ALLOWED_DOMAIN}")
    print(f"   Max pages      : {MAX_PAGES}")
    print(f"   Max depth      : {MAX_DEPTH}")
    print(f"   Output folder  : {OUTPUT_DIR.resolve()}\n")

    while queue and pages_done < MAX_PAGES:
        url, depth = queue.popleft()

        if url in visited:
            continue
        if depth > MAX_DEPTH:
            continue

        visited.add(url)
        pages_done += 1

        print(f"[{pages_done}/{MAX_PAGES}] depth={depth}  {url}")

        # ── Fetch ──
        html = fetch_page(url)
        if not html:
            log_lines.append(f"FAILED   depth={depth}  {url}")
            continue

        soup = BeautifulSoup(html, "html.parser")

        # ── Save ──
        folder = make_output_folder(url)
        save_html(html, folder)
        save_markdown(html_to_markdown(html), folder)
        save_metadata(url, soup, folder, depth)
        log_lines.append(f"OK       depth={depth}  {url}  →  {folder.name}")

        # ── Enqueue new links ──
        if depth < MAX_DEPTH:
            new_links = extract_links(html, url)
            added = 0
            for link in new_links:
                if link not in visited:
                    queue.append((link, depth + 1))
                    added += 1
            print(f"    [+] Queued {added} new links\n")

        time.sleep(DELAY_SECONDS)

    # ── Final log ──
    log_lines += [
        "",
        f"Crawl finished : {datetime.now().isoformat()}",
        f"Pages scraped  : {pages_done}",
    ]
    crawl_log.write_text("\n".join(log_lines), encoding="utf-8")

    print(f"\n✅ Crawl complete!")
    print(f"   Pages scraped : {pages_done}")
    print(f"   Output folder : {OUTPUT_DIR.resolve()}")
    print(f"   Crawl log     : {crawl_log.resolve()}\n")


# ─── Run ───────────────────────────────────────────────────────────────────────
crawl()


🕷️  Starting crawl from : https://kbtu.edu.kz/ru/
   Domain filter  : kbtu.edu.kz
   Max pages      : 10000
   Max depth      : 100
   Output folder  : /Users/nurma/vscode_projects/Uni-Agent/research/kbtu_crawl

[1/10000] depth=0  https://kbtu.edu.kz/ru
    [✓] HTML   → kbtu_crawl/kbtu_edu_kz_ru_20260225_172457/page.html
    [✓] MD     → kbtu_crawl/kbtu_edu_kz_ru_20260225_172457/page.md
    [✓] Meta   → kbtu_crawl/kbtu_edu_kz_ru_20260225_172457/metadata.txt
    [+] Queued 65 new links

[2/10000] depth=1  https://kbtu.edu.kz/ru/ob-universitete/tsentr-esg-kompetentsij-kbtu
    [✓] HTML   → kbtu_crawl/kbtu_edu_kz_ru_ob-universitete_tsentr-esg-kompetentsij-kbtu_20260225_172459/page.html
    [✓] MD     → kbtu_crawl/kbtu_edu_kz_ru_ob-universitete_tsentr-esg-kompetentsij-kbtu_20260225_172459/page.md
    [✓] Meta   → kbtu_crawl/kbtu_edu_kz_ru_ob-universitete_tsentr-esg-kompetentsij-kbtu_20260225_172459/metadata.txt
    [+] Queued 47 new links

[3/10000] depth=1  https://kbtu.edu.kz/ru/student

OSError: [Errno 63] File name too long: 'kbtu_crawl/kbtu_edu_kz_ru_novosty_4806-zavershen-i-etap-vnutrivuzovskogo-konkursa-na-prisvoenie-zvaniya-luchshij-prepodavatel-vuza-2024-v-konkurse-prinyali-uchastie-6-kandidatov-iz-kotorykh-5-pretendentov-napravleny-dlya-uchastiya-v-respublikanskom-etape_20260225_184215'