# Crawling Website PTA

# Install Library


In [1]:
!pip install builtwith
!pip install requests
!pip install beautifulsoup4

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... [?25l[?25hdone
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36077 sha256=241355d21ced5a01b05f7d6848bbb6809bf854140b2bfca4aaea38e7786d1e32
  Stored in directory: /root/.cache/pip/wheels/7f/2d/b2/606e3df914d4aeeab99c4a4e3e9a61673d2293c2e346db00c8
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4


# Analisis Teknologi yang Digunakan dalam Website

In [2]:
import  builtwith

# Ask builtwith which technologies it detects on the PTA site and print the resulting dict.
chek = builtwith.parse('https://pta.trunojoyo.ac.id')
print(chek)

{'web-servers': ['Nginx'], 'javascript-frameworks': ['jQuery', 'jQuery UI']}


# Web Crawling

In [3]:
from os import name
import requests
from bs4 import BeautifulSoup

def crawling_link(url):
  """Fetch ``url`` and print its headings and hyperlinks.

  Prints one line per ``h1``/``h2``/``h3`` element (tag name and text) and one
  line per ``<a>`` element that carries an ``href`` (target and link text).

  Args:
    url: Address of the page to crawl.

  Returns:
    None. Results are written to stdout. Any ``requests`` failure (connection
    error, timeout, non-2xx status) is reported as a printed message instead
    of being raised.
  """
  try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r = requests.get(url, timeout=15)
    r.raise_for_status()  # Raise an error for bad status codes
    soup = BeautifulSoup(r.text, 'html.parser')

    # Collect all headings. Several tag names must be passed as ONE list:
    # extra positional arguments are treated as attribute/recursive filters.
    headings = soup.find_all(['h1', 'h2', 'h3'])
    for heading in headings:
      # Use the loop variable; the result set itself has no tag name.
      print(f"{heading.name}: {heading.get_text()}")

    # Collect every anchor that actually has an href attribute.
    links = soup.find_all('a', href=True)
    for link in links:
      print(f"URL: {link['href']} | Teks: {link.get_text()}")

  except requests.exceptions.RequestException as e:
    print(f"Terjadi kesalahan saat mengakses {url}: {e}")

# Run the crawler against the PTA home page and show the result
crawling_link('https://pta.trunojoyo.ac.id')

URL: index.html | Teks: 
URL: # | Teks: 14677Journal
URL: https://pta.trunojoyo.ac.id/ | Teks: Beranda
URL: https://pta.trunojoyo.ac.id/c_search/ | Teks: Pencarian
URL: https://pta.trunojoyo.ac.id/c_template/ | Teks: Download
URL: https://library.trunojoyo.ac.id/detil.php?id=23 | Teks: Petunjuk Upload
URL: https://pta.trunojoyo.ac.id/c_contact/ | Teks: Kontak
URL: # | Teks: STRATEGI PENGEMBANGAN MAKANAN DAN MINUMAN KHAS PULAU GILIGENTING GUNA MENDUKUNG PARIWISATA BERKELANJUTAN
URL: https://pta.trunojoyo.ac.id/welcome/detail/170361100003 | Teks: Selengkapnya
URL: # | Teks: PERUMUSAN SANKSI PIDANA BAGI MASYARAKAT SEKITAR HUTAN YANG MELAKUKAN PENCURIAN KAYU MILIK NEGARA DALAM UNDANG-UNDANG NOMOR 18 TAHUN 2013
URL: https://pta.trunojoyo.ac.id/welcome/detail/170111100053 | Teks: Selengkapnya
URL: # | Teks: Peran Teor Motivasi Herzberg Sebagai Mediator Self Efficacy, Lingkungan Kerja Dalam Meningkatkan Prestasi Kerja Pegawai ( Kantor Jasa Penilai Publik Guntur Eki Andri dan Rekan Kota Suraba

# Web Scraping

In [4]:
import pandas as pd
import requests, time, json, os, sys
from bs4 import BeautifulSoup

# JSON checkpoint storing the last prodi id and page number that were scraped.
PROGRESS_FILE = "pta_progress.json"
# CSV file the scraped records are written to (and reloaded from on a later run).
DATA_FILE = "pta.csv"

def save_progress(prodi_id, page):
    """Record ``prodi_id`` and ``page`` as the latest checkpoint in PROGRESS_FILE.

    Any other keys already stored in the checkpoint file are kept; only
    ``last_prodi`` and ``last_page`` are overwritten.
    """
    if os.path.exists(PROGRESS_FILE):
        # Start from the existing checkpoint so unrelated keys survive.
        with open(PROGRESS_FILE, "r") as src:
            state = json.load(src)
    else:
        state = {}

    state["last_prodi"] = prodi_id
    state["last_page"] = page

    with open(PROGRESS_FILE, "w") as dst:
        json.dump(state, dst)

def load_progress():
    """Return the stored checkpoint as ``(last_prodi, last_page)``.

    Falls back to ``(1, 0)`` when PROGRESS_FILE does not exist, and to the
    same per-field defaults when a key is missing from the file.
    """
    if not os.path.exists(PROGRESS_FILE):
        return 1, 0

    with open(PROGRESS_FILE, "r") as fh:
        checkpoint = json.load(fh)

    prodi = checkpoint.get("last_prodi", 1)
    page = checkpoint.get("last_page", 0)
    return prodi, page

def safe_get(session, url, retries=3, delay=2):
    """GET ``url`` through ``session``, retrying on failure.

    Args:
        session: Object exposing ``get(url, timeout=...)`` (a ``requests.Session``
            in this notebook).
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The response of the first attempt whose ``raise_for_status()`` passes,
        or ``None`` when every attempt failed. Failures are printed, not raised.
    """
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=15)
            r.raise_for_status()
            return r
        except Exception as e:
            print(f"\nGagal akses {url} (percobaan {attempt+1}/{retries}): {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay the caller for nothing.
            if attempt + 1 < retries:
                time.sleep(delay)
    return None

def _count_pages(session, prodi_id):
    """Return how many result pages the listing of ``prodi_id`` advertises (1 if unknown)."""
    r = safe_get(session, f"https://pta.trunojoyo.ac.id/c_search/byprod/{prodi_id}/1")
    if r is None:
        return 1
    soup = BeautifulSoup(r.content, "html.parser")

    # ":-soup-contains" is the supported spelling; plain ":contains" is a
    # deprecated alias that emits a warning on every call.
    last_link = soup.select_one('ol.pagination a:-soup-contains("»")')
    if last_link is not None and last_link.has_attr('href'):
        tail = last_link['href'].rstrip('/').split('/')[-1]
        # Only trust the "»" link when it really ends in a page number;
        # otherwise fall through to the numbered links below.
        if tail.isdecimal():
            return int(tail)

    pages = [int(a.text) for a in soup.select('ol.pagination a') if a.text.isdecimal()]
    return max(pages) if pages else 1


def _parse_journal(session, detail_url):
    """Fetch one detail page and return its fields as a dict, or None if the download failed."""
    r_jurnal = safe_get(session, detail_url)
    if r_jurnal is None:
        return None
    soup1 = BeautifulSoup(r_jurnal.content, "html.parser")

    isi = soup1.select_one('div#content_journal')
    ps = isi.select('p[align="justify"]')

    return {
        "penulis": isi.select_one('span:-soup-contains("Penulis")').text.split(' : ')[1],
        "judul": isi.select_one('a.title').text,
        "pembimbing_pertama": isi.select_one('span:-soup-contains("Dosen Pembimbing I")').text.split(' : ')[1],
        # NOTE(review): this separator is ' :' (no trailing space), unlike the
        # fields above; kept as in the original - confirm against the page markup.
        "pembimbing_kedua": isi.select_one('span:-soup-contains("Dosen Pembimbing II")').text.split(' :')[1],
        # First justified paragraph is stored as "abstrak", the second as "abstraction".
        "abstrak": ps[0].get_text(separator=' ', strip=True) if ps else "",
        "abstraction": ps[1].get_text(separator=' ', strip=True) if len(ps) > 1 else "",
    }


def ptaa(prodi_ids=range(1, 41), pages_per_prodi=3):
    """Scrape PTA Trunojoyo, at most ``pages_per_prodi`` listing pages per prodi.

    The run is resumable: records already stored in DATA_FILE are reloaded,
    and scraping continues after the prodi/page stored by ``save_progress``.
    A page's records are added to the result only once the whole page has been
    processed, so the CSV and the checkpoint always describe the same pages.

    Args:
        prodi_ids: Iterable of prodi ids to visit. Ids lower than the
            checkpointed prodi are skipped.
        pages_per_prodi: Upper bound on listing pages scraped for each prodi.

    Returns:
        pandas.DataFrame with the columns penulis, judul, pembimbing_pertama,
        pembimbing_kedua, abstrak and abstraction. The same frame is written
        to DATA_FILE. On Ctrl-C the records of all completed pages are saved
        and returned.
    """
    columns = ["penulis", "judul", "pembimbing_pertama",
               "pembimbing_kedua", "abstrak", "abstraction"]

    # Continue from earlier results when they exist.
    if os.path.exists(DATA_FILE):
        df = pd.read_csv(DATA_FILE)
        # Keep exactly the expected columns: a stray column (e.g. a saved
        # index) would leave lists of unequal length once rows are appended.
        data = df.reindex(columns=columns).to_dict(orient="list")
    else:
        data = {col: [] for col in columns}

    def snapshot():
        """Write the collected records to DATA_FILE and return them as a DataFrame."""
        frame = pd.DataFrame(data)
        frame.to_csv(DATA_FILE, index=False, encoding="utf-8-sig")
        return frame

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    last_prodi, last_page = load_progress()

    try:
        for j in prodi_ids:
            if j < last_prodi:
                continue

            # Page count is looked up lazily, so prodi skipped on resume cost no request.
            total_pages = _count_pages(session, j)
            print(f"\nProdi {j} punya {total_pages} halaman")

            start_page = last_page + 1 if j == last_prodi else 1

            # Cap the number of pages per prodi as requested by the caller.
            max_pages = min(total_pages, pages_per_prodi)

            for i in range(start_page, max_pages + 1):
                try:
                    url = f"https://pta.trunojoyo.ac.id/c_search/byprod/{j}/{i}"
                    print(f"Scraping {url}")
                    r = safe_get(session, url)
                    if r is None:
                        continue
                    soup = BeautifulSoup(r.content, "html.parser")

                    # Buffer this page's rows; they are committed only when the
                    # page is complete, so an interrupted page leaves no partial
                    # rows behind to be duplicated on resume.
                    page_rows = []
                    for jurnal in soup.select('li[data-cat="#luxury"]'):
                        try:
                            row = _parse_journal(session, jurnal.select_one('a.gray.button')['href'])
                        except Exception as e:
                            # One malformed entry must not discard the rest of the page.
                            print(f"\nGagal memproses jurnal di halaman {i} prodi {j}: {e}")
                            continue
                        if row is not None:
                            page_rows.append(row)

                    for row in page_rows:
                        for col in columns:
                            data[col].append(row[col])

                    # Persist checkpoint and CSV after every page.
                    save_progress(j, i)
                    snapshot()

                    time.sleep(1)

                except Exception as e:
                    print(f"\nGagal halaman {i} prodi {j}: {e}")
                    continue

    except KeyboardInterrupt:
        print("\nProses dihentikan user. Progres disimpan.")
        return snapshot()

    print("\nSelesai scraping.")
    return snapshot()

# Hasil Scraping Website PTA

In [5]:
# Run the scraper with its defaults (prodi 1-40, up to 3 pages each); the returned DataFrame is the cell output.
ptaa()




Prodi 1 punya 284 halaman
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/1/1
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/1/2
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/1/3

Prodi 2 punya 114 halaman
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/2/1
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/2/2
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/2/3

Prodi 3 punya 110 halaman
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/3/1
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/3/2
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/3/3

Prodi 4 punya 116 halaman
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/4/1
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/4/2
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/4/3

Prodi 5 punya 96 halaman
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/5/1
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/5/2
Scraping https://pta.trunojoyo.ac.id/c_search/byprod/5/3

Prodi 6 pu

Unnamed: 0,penulis,judul,pembimbing_pertama,pembimbing_kedua,abstrak,abstraction
0,Dyah Ayu Citra Seza,Implementasi Fungsi Legislasi Dewan Perwakilan...,"Yudi Widagdo Harimurti, SH., MH","Safi', SH., MH",ABSTRAK\r\n\r\n Implementasi Fungsi Legi...,ABSTRACT\r\n Implementation of Legislati...
1,Maulina Nurlaily,Pertanggungjawaban Pidana Direksi BUMN (Perser...,"Tolib Effendi, SH., MH.","Dr. Eni Suastuti, SH., Mhum.",Badan Usaha Milik Negara (BUMN) adalah Badan u...,State Owned Enterprises (SOEs) are business en...
2,Moh. Samsul Hidayat,Analisis Terhadap Kekosongan Hukum dalam Penga...,"Tolib Effendi, SH., MH.","Agus Ramdlany, SH., MH.",Kasus narkoba tidak henti-hentinya terdengar d...,"Drug cases endlessly heard on television, radi..."
3,TOMMY ADITYA PARLINDUNGAN MARBUN,PERLINDUNGAN HUKUM BAGI KONSUMEN ATAS PRODUK E...,"DR. DJULAEKA, S.H., M.HUM","DR.USWATUN HASANAH, S.H., M. HUM",Produk elektronik adalah suatu benda bergerak ...,Electronic products is an object moves through...
4,RICA YENA IMADHORA,TELAAH KRITIS TENTANG ALASAN HUKUM YANG DIGUN...,"Dr. DENI SBY, S. H., M. S.","SAIFUL ABDULLAH, S. H., M. H.",,
...,...,...,...,...,...,...
476,"Lisa Sri rahmatullah, S. Sos. I",Dampak Sosial Ekonomi Pariwisata Religi Makam ...,"Dr. Diah Wahyuningsih, S.E., M.Si.","Dr. Eni Sri Rahayuningsih, S.E., M.E.",Penelitian ini bertujuan untuk mengetahui baga...,The purpose of this study is to analyze the so...
477,Indah Ainun Nikmah,Peranan Zakat Produktif Dalam Meningkatkan Eko...,"Dr. Kurniyati Indahsari, M.Si","Dr. Abdur Rahman, S.Ag. MEI",Peranan Zakat Produktif dalam Meningkatkan Eko...,The Role of Productive Zakat in Improving Must...
478,ahmad syaiful umam,KARAKTERISASI DAN KOLEKSI PLASMA NUTFAH UNTUK ...,"Dr. Ir. Gita Pawana, M.Si","Dr. Ir. Hj. SIti Fatimah, M.Si",Madura merupakan salah satu wilayah pemasok ko...,Madura is one of the regions supplying horticu...
479,Siti Holifah,PENGOLAHAN LIMBAH AIR REBUSAN IKAN TERI MENJAD...,"Dr.Apri Arisandi,S.Pi.,M.Si.","Dr.Ir.H.Asfan,MP.",Ikan Teri perlu penanganan serius pasca panen ...,Anchovy needs serious handling after harvest b...
