# Crawling Link Website

# Install Library


In [1]:
!pip install builtwith
!pip install requests
!pip install beautifulsoup4

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... done
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36077 sha256=f887fda50c151cf4b517e163621e647dc2128b156467337b74150d1b7b3cf614
  Stored in directory: /root/.cache/pip/wheels/7f/2d/b2/606e3df914d4aeeab99c4a4e3e9a61673d2293c2e346db00c8
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4


# Analisis Teknologi yang Digunakan dalam Website

In [8]:
import builtwith

# Ask builtwith which technologies it detects on the target site
# and show the resulting dictionary.
target_site = 'https://sivitas.sakera.id'
res = builtwith.parse(target_site)
print(res)

{'cdn': ['CloudFlare']}


## Crawling Link

In [14]:
import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

def crawl_recursive(start_url, max_depth=2, delay=1):
    """
    Crawl pages breadth-first and record every hyperlink found on them.

    Parameters
    ----------
    start_url : str
        URL of the first page to fetch.
    max_depth : int
        Number of page levels to fetch (1 = only the start page,
        2 = the start page plus the pages it links to, and so on).
    delay : int or float
        Seconds to sleep after each successfully fetched page.

    Returns
    -------
    pandas.DataFrame
        One row per ``<a href=...>`` found, with columns ``id`` (running
        number starting at 1), ``page`` (URL of the page the link was found
        on) and ``link`` (the link resolved to an absolute URL). The three
        columns are present even when no link was found.

    Notes
    -----
    Pages that cannot be fetched are reported with ``print`` and skipped.
    Every link is recorded, but only http/https links are followed, and the
    ``#fragment`` part is ignored when deciding whether a page was already
    visited, so the same document is not downloaded once per anchor.
    """
    columns = ["id", "page", "link"]
    results = []
    link_id = 1

    visited = set()  # URLs that have already been fetched (or attempted)

    # BFS queue of (url, depth); deque gives O(1) pops from the left.
    queue = deque([(start_url, 0)])

    # The context manager closes the pooled connections when the crawl ends.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})

        while queue:
            current_url, depth = queue.popleft()
            if current_url in visited:
                continue
            visited.add(current_url)

            # Fetch the page; the crawl is best-effort, so any failure is
            # reported and the page is skipped instead of aborting the run.
            try:
                r = session.get(current_url, timeout=15)
                r.raise_for_status()
            except Exception as e:
                print(f"Gagal akses {current_url}: {e}")
                continue

            soup = BeautifulSoup(r.content, "html.parser")

            for a in soup.find_all("a", href=True):
                href = a["href"]
                try:
                    absolute_link = urljoin(current_url, href)
                    # Same document regardless of the anchor it points at.
                    crawl_target = urldefrag(absolute_link).url
                    scheme = urlsplit(crawl_target).scheme
                except ValueError as e:
                    # urllib rejects malformed hrefs (e.g. a broken IPv6
                    # host); skip that link rather than lose the whole crawl.
                    print(f"Lewati link tidak valid {href!r} di {current_url}: {e}")
                    continue

                results.append({
                    "id": link_id,
                    "page": current_url,    # page the link was found on
                    "link": absolute_link,  # link as found, made absolute
                })
                link_id += 1

                # Follow the link only while below max_depth, only if it is
                # fetchable over HTTP(S), and only if not already fetched.
                if depth + 1 >= max_depth:
                    continue
                if scheme not in ("http", "https"):
                    continue
                if crawl_target in visited:
                    continue
                queue.append((crawl_target, depth + 1))

            time.sleep(delay)  # pause between requests to avoid overloading the server

    # Passing the columns explicitly keeps the schema stable for an empty crawl.
    return pd.DataFrame(results, columns=columns)

# Example run: with max_depth=2 the start page and the pages it links to are fetched.
start_page = "https://pta.trunojoyo.ac.id/"
df_crawl = crawl_recursive(start_page, max_depth=2)
print(df_crawl.head())

# Save the link table as CSV without the index column; utf-8-sig writes a BOM.
csv_path = "recursive_crawl.csv"
df_crawl.to_csv(csv_path, index=False, encoding="utf-8-sig")
print("Hasil tersimpan ke recursive_crawl.csv")


Gagal akses https://pta.trunojoyo.ac.id/index.html: 404 Client Error: Not Found for url: https://pta.trunojoyo.ac.id/index.html
Gagal akses http://trunojoyo.ac.id: HTTPConnectionPool(host='trunojoyo.ac.id', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bdacb321b20>: Failed to establish a new connection: [Errno 113] No route to host'))
Gagal akses http://e-journal.dikti.go.id: HTTPConnectionPool(host='e-journal.dikti.go.id', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7bdacadfa870>: Failed to resolve 'e-journal.dikti.go.id' ([Errno -2] Name or service not known)"))
   id                          page                                     link
0   1  https://pta.trunojoyo.ac.id/   https://pta.trunojoyo.ac.id/index.html
1   2  https://pta.trunojoyo.ac.id/             https://pta.trunojoyo.ac.id/
2   3  https://pta.trunojoyo.ac.id/         

# Hasil Crawling Link

In [15]:
# Read the saved crawl result back from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="recursive_crawl.csv")
df.head(n=5)

Unnamed: 0,id,page,link
0,1,https://pta.trunojoyo.ac.id/,https://pta.trunojoyo.ac.id/index.html
1,2,https://pta.trunojoyo.ac.id/,https://pta.trunojoyo.ac.id/
2,3,https://pta.trunojoyo.ac.id/,https://pta.trunojoyo.ac.id/
3,4,https://pta.trunojoyo.ac.id/,https://pta.trunojoyo.ac.id/c_search/
4,5,https://pta.trunojoyo.ac.id/,https://pta.trunojoyo.ac.id/c_template/
