## Crawling Web 



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, sys, time

## 1. Crawling Web PTA

In [None]:
# Base URL of the PTA "by study program" listing; the crawlers append /<prodi_id>/<page>.
BASE_URL = "https://pta.trunojoyo.ac.id/c_search/byprod"

## Fungsi

In [None]:
def get_max_page(prodi_id, timeout=30):
    """Return the number of listing pages available for one study program.

    Fetches page 1 of the program's listing and reads the page number at the
    end of the "»" (last page) pagination link.

    Args:
        prodi_id: Study-program id used in the listing URL.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the crawl indefinitely.

    Returns:
        The last page number as an int, or 1 when there is no "»" link or its
        href does not end in a number.
    """
    url = f"{BASE_URL}/{prodi_id}/1"
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, "html.parser")

    # Find the ">>" (last page) button. ":-soup-contains" is the current name
    # of the deprecated ":contains" pseudo-class.
    last_page = soup.select_one('ol.pagination a:-soup-contains("»")')
    if last_page is not None and "href" in last_page.attrs:
        # The page number is the last path segment; tolerate a trailing slash.
        tail = last_page["href"].rstrip("/").rsplit("/", 1)[-1]
        if tail.isdecimal():
            return int(tail)

    # Fallback when there is no usable pagination link.
    return 1

In [None]:
# Example usage: number of listing pages for study program 10
print(get_max_page(10))

172


In [None]:
import sys
import time

def print_progress(prodi_id, prodi, current_page, total_page, bar_length=40):
    """Redraw a one-line, heartbeat-styled progress bar on stdout.

    The line starts with a carriage return so successive calls overwrite each
    other; no newline is written.

    Args:
        prodi_id: Study-program id shown in square brackets.
        prodi: Study-program name shown after the id.
        current_page: Page just processed.
        total_page: Total number of pages for this study program.
        bar_length: Number of bar slots (some glyphs are two characters wide,
            so the drawn bar can be wider than this).
    """
    percent_done = (current_page / total_page) * 100
    n_filled = int(bar_length * current_page // total_page)

    # 24-bit ANSI colour escapes
    red, grey, reset = "\033[38;2;255;50;50m", "\033[38;2;160;160;160m", "\033[0m"

    # Heartbeat (ECG) glyphs; offsetting by the page number shifts the trace on every redraw
    ecg = ("__", "/", "\\", "__", "_", "_")
    trace = "".join(ecg[(k + current_page) % len(ecg)] for k in range(n_filled))

    # Remaining, not-yet-reached part of the bar
    track = "-" * (bar_length - n_filled)
    bar = f"{red}{trace}{reset}{grey}{track}{reset}"

    line = f"\r[{prodi_id}] {prodi} | {percent_done:6.2f}% | [{bar}] Page {current_page}/{total_page}"
    sys.stdout.write(line)
    sys.stdout.flush()

### Fungsi Crawling Semua Data PTA




In [None]:
def pta_all():
    """Crawl every thesis entry of every PTA study program into ``pta_all.csv``.

    For study-program ids 1..41 the page count is read with ``get_max_page``,
    every listing page is fetched, and the detail page of each listed entry is
    parsed for title, author, both supervisors and the two abstracts. Progress
    is drawn with ``print_progress`` after each listing page, and a summary
    (row count, elapsed time) is printed at the end.

    A listing page without ``div#begin`` is skipped. An entry without a detail
    link, or whose detail page has no ``div#content_journal``, is skipped with
    a note on stderr instead of aborting the crawl and losing collected rows.

    Returns:
        None. The collected rows are written to ``pta_all.csv``.
    """
    start_time = time.time()

    data = {
        "no": [],
        "penulis": [],
        "judul": [],
        "abstrak_id": [],
        "abstrak_en": [],
        "pembimbing_pertama": [],
        "pembimbing_kedua": [],
        "prodi": []
    }

    no = 1
    total_prodi = 41

    # Look up the page count of every study program before crawling.
    max_pages_dict = {i: get_max_page(i) for i in range(1, total_prodi + 1)}

    for i in range(1, total_prodi + 1):
        max_page = max_pages_dict[i]
        for j in range(1, max_page + 1):
            url = f"{BASE_URL}/{i}/{j}"
            r = requests.get(url)
            soup = BeautifulSoup(r.content, "html.parser")
            jurnals = soup.select('li[data-cat="#luxury"]')

            isii = soup.select_one('div#begin')
            if not isii:
                continue
            prodi_full = isii.select_one('h2').text.strip()
            prodi = prodi_full.replace("Journal Jurusan ", "")

            for jurnal in jurnals:
                tombol = jurnal.select_one('a.gray.button')
                if tombol is None or not tombol.get('href'):
                    print(f"\n⚠ Lewati entri tanpa link detail di {url}", file=sys.stderr)
                    continue
                link_keluar = tombol['href']

                response = requests.get(link_keluar)
                soup1 = BeautifulSoup(response.content, "html.parser")
                isi = soup1.select_one('div#content_journal')
                if isi is None:
                    # Broken or unexpected detail page: skip this entry only.
                    print(f"\n⚠ Lewati {link_keluar}: div#content_journal tidak ditemukan", file=sys.stderr)
                    continue

                judul = isi.select_one('a.title').text.strip()
                # ":-soup-contains" is the current name of the deprecated ":contains".
                penulis = isi.select_one('span:-soup-contains("Penulis")').text.split(' : ')[1]
                pembimbing_pertama = isi.select_one('span:-soup-contains("Dosen Pembimbing I")').text.split(' : ')[1]
                pembimbing_kedua = isi.select_one('span:-soup-contains("Dosen Pembimbing II")').text.split(' :')[1]

                # First justified paragraph is stored as the Indonesian abstract, second as the English one.
                paragraf = isi.select('p[align="justify"]')
                abstrak_id = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
                abstrak_en = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

                data["no"].append(no)
                data["penulis"].append(penulis)
                data["judul"].append(judul)
                data["abstrak_id"].append(abstrak_id)
                data["abstrak_en"].append(abstrak_en)
                data["pembimbing_pertama"].append(pembimbing_pertama)
                data["pembimbing_kedua"].append(pembimbing_kedua)
                data["prodi"].append(prodi)
                no += 1

            # Update the progress bar for this study program.
            print_progress(i, prodi, j, max_page)

        sys.stdout.write("\n")  # move to a new line once a study program is done

    # Save to CSV.
    df = pd.DataFrame(data)
    df.to_csv("pta_all.csv", index=False)

    # Compute the elapsed time.
    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return None

In [None]:
# Run the full crawl; pta_all() writes its rows to "pta_all.csv".
pta_all()

[1] Ilmu Hukum | 100.00% | [[38;2;255;50;50m\______/\______/\______/\______/\______/\______/\____[0m[38;2;160;160;160m[0m] Page 284/284
[2] Teknologi Industri Pertanian | 100.00% | [[38;2;255;50;50m__/\______/\______/\______/\______/\______/\______/\__[0m[38;2;160;160;160m[0m] Page 114/114
[3] Agribisnis | 100.00% | [[38;2;255;50;50m\______/\______/\______/\______/\______/\______/\____[0m[38;2;160;160;160m[0m] Page 110/110
[4] Agroteknologi | 100.00% | [[38;2;255;50;50m\______/\______/\______/\______/\______/\______/\____[0m[38;2;160;160;160m[0m] Page 116/116
[5] Ilmu Kelautan |  21.88% | [[38;2;255;50;50m______/\___[0m[38;2;160;160;160m--------------------------------[0m] Page 21/96

In [None]:
# Load "pta.csv" and display it.
# NOTE(review): pta_all() writes "pta_all.csv" while pta() writes "pta.csv" — confirm which file is intended here.
pta = pd.read_csv("pta.csv")
pta

### Fungsi Crawling Data PTA (Dengan Batas Page)


In [None]:
def print_progress(prodi_id, prodi, current_page, total_pages):
    """Redraw a 20-slot block progress bar for one study program on stdout.

    The line starts with a carriage return so successive calls overwrite each
    other; two newlines are written once ``current_page`` equals
    ``total_pages``.

    Args:
        prodi_id: Study-program id shown in square brackets.
        prodi: Study-program name shown after the id.
        current_page: Page just processed.
        total_pages: Number of pages being crawled for this study program.
    """
    bar_length = 20
    percent = (current_page / total_pages) * 100
    done = int(bar_length * current_page // total_pages)
    bar = '█' * done + '-' * (bar_length - done)

    sys.stdout.write(
        f'\r[{prodi_id}] {prodi} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%'
    )
    sys.stdout.flush()

    if current_page != total_pages:
        return
    # Finished: leave the completed bar on screen and start a fresh line.
    sys.stdout.write('\n\n')

def pta(pages_per_prodi=3, total_prodi=41):
    """Crawl a limited number of listing pages per study program into ``pta.csv``.

    For each study-program id 1..``total_prodi`` the first ``pages_per_prodi``
    listing pages are fetched, and the detail page of each listed entry is
    parsed for id, title, author, both supervisors and the two abstracts.

    A listing page without ``div#begin`` is skipped (and the progress bar is
    not redrawn for it). An entry without a detail link, or whose detail page
    has no ``div#content_journal``, is skipped with a note on stderr instead
    of aborting the crawl.

    Args:
        pages_per_prodi: Listing pages to crawl per study program (default 3).
        total_prodi: Highest study-program id to crawl (default 41).

    Returns:
        pandas.DataFrame with the collected rows; also written to ``pta.csv``.
    """
    start_time = time.time()  # start the timer

    data = {
        "id": [],
        "penulis": [],
        "judul": [],
        "abstrak id": [],
        "abstrak en": [],
        "pembimbing_pertama": [],
        "pembimbing_kedua": [],
        "prodi": [],
    }

    for i in range(1, total_prodi + 1):  # study programs
        total_pages = pages_per_prodi
        prodi_name = None

        for j in range(1, total_pages + 1):  # listing pages
            url = f"https://pta.trunojoyo.ac.id/c_search/byprod/{i}/{j}"
            r = requests.get(url)
            soup = BeautifulSoup(r.content, "html.parser")
            jurnals = soup.select('li[data-cat="#luxury"]')

            isii = soup.select_one('div#begin')
            if not isii:
                continue
            prodi_full = isii.select_one('h2').text.strip()
            prodi = prodi_full.replace("Journal Jurusan ", "")
            if not prodi_name:
                prodi_name = prodi

            for jurnal in jurnals:
                tombol = jurnal.select_one('a.gray.button')
                if tombol is None or not tombol.get('href'):
                    print(f"\n⚠ Lewati entri tanpa link detail di {url}", file=sys.stderr)
                    continue
                link = tombol['href']

                # Take the entry id from the detail link.
                id_match = re.search(r"/detail/(\d+)", link)
                pta_id = id_match.group(1) if id_match else None

                response = requests.get(link)
                soup1 = BeautifulSoup(response.content, "html.parser")
                isi = soup1.select_one('div#content_journal')
                if isi is None:
                    # Broken or unexpected detail page: skip this entry only.
                    print(f"\n⚠ Lewati {link}: div#content_journal tidak ditemukan", file=sys.stderr)
                    continue

                # Title
                judul = isi.select_one('a.title').text

                # Author, first and second supervisor.
                # ":-soup-contains" is the current name of the deprecated ":contains".
                penulis = isi.select_one('span:-soup-contains("Penulis")').text.split(' : ')[1]
                pembimbing_pertama = isi.select_one('span:-soup-contains("Dosen Pembimbing I")').text.split(' : ')[1]
                pembimbing_kedua = isi.select_one('span:-soup-contains("Dosen Pembimbing II")').text.split(' :')[1]

                # Abstracts: first justified paragraph, then the second.
                paragraf = isi.select('p[align="justify"]')
                abstrak = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
                abstract = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

                # Store the row.
                data["id"].append(pta_id)
                data["penulis"].append(penulis)
                data["judul"].append(judul)
                data["pembimbing_pertama"].append(pembimbing_pertama)
                data["pembimbing_kedua"].append(pembimbing_kedua)
                data["abstrak id"].append(abstrak)
                data["abstrak en"].append(abstract)
                data["prodi"].append(prodi)

            # Update the progress bar.
            print_progress(i, prodi_name, j, total_pages)

    df = pd.DataFrame(data)
    df.to_csv("pta.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

In [None]:
# Crawl the first 3 listing pages of each study program; pta() writes "pta.csv" and returns the DataFrame.
pta()

[1] Ilmu Hukum - Page 3/3 [████████████████████] 100.00%

[2] Teknologi Industri Pertanian - Page 3/3 [████████████████████] 100.00%

[3] Agribisnis - Page 3/3 [████████████████████] 100.00%

[4] Agroteknologi - Page 3/3 [████████████████████] 100.00%

[5] Ilmu Kelautan - Page 3/3 [████████████████████] 100.00%

[6] Ekonomi Pembangunan - Page 3/3 [████████████████████] 100.00%

[7] Manajemen - Page 3/3 [████████████████████] 100.00%

[8] Akuntansi - Page 3/3 [████████████████████] 100.00%

[9] Teknik Industri - Page 3/3 [████████████████████] 100.00%

[10] Teknik Informatika - Page 3/3 [████████████████████] 100.00%

[11] Manajemen Informatika - Page 3/3 [████████████████████] 100.00%

[12] Sosiologi - Page 3/3 [████████████████████] 100.00%

[13] Ilmu Komunikasi - Page 3/3 [████████████████████] 100.00%

[14] Psikologi - Page 3/3 [████████████████████] 100.00%

[15] Sastra Inggris - Page 3/3 [████████████████████] 100.00%

[16] Ekonomi Syariah - Page 3/3 [████████████████████] 100.00%

Unnamed: 0,id,penulis,judul,abstrak id,abstrak en,pembimbing_pertama,pembimbing_kedua,prodi
0,080111100012,Dyah Ayu Citra Seza,Implementasi Fungsi Legislasi Dewan Perwakilan...,ABSTRAK\r\n\r\n Implementasi Fungsi Legi...,ABSTRACT\r\n Implementation of Legislati...,"Yudi Widagdo Harimurti, SH., MH","Safi', SH., MH",Ilmu Hukum
1,080111100002,Maulina Nurlaily,Pertanggungjawaban Pidana Direksi BUMN (Perser...,Badan Usaha Milik Negara (BUMN) adalah Badan u...,State Owned Enterprises (SOEs) are business en...,"Tolib Effendi, SH., MH.","Dr. Eni Suastuti, SH., Mhum.",Ilmu Hukum
2,070111100060,Moh. Samsul Hidayat,Analisis Terhadap Kekosongan Hukum dalam Penga...,Kasus narkoba tidak henti-hentinya terdengar d...,"Drug cases endlessly heard on television, radi...","Tolib Effendi, SH., MH.","Agus Ramdlany, SH., MH.",Ilmu Hukum
3,090111100077,TOMMY ADITYA PARLINDUNGAN MARBUN,PERLINDUNGAN HUKUM BAGI KONSUMEN ATAS PRODUK E...,Produk elektronik adalah suatu benda bergerak ...,Electronic products is an object moves through...,"DR. DJULAEKA, S.H., M.HUM","DR.USWATUN HASANAH, S.H., M. HUM",Ilmu Hukum
4,070111200007,RICA YENA IMADHORA,TELAAH KRITIS TENTANG ALASAN HUKUM YANG DIGUN...,,,"Dr. DENI SBY, S. H., M. S.","SAIFUL ABDULLAH, S. H., M. H.",Ilmu Hukum
...,...,...,...,...,...,...,...,...
476,160281100013,"Lisa Sri rahmatullah, S. Sos. I",Dampak Sosial Ekonomi Pariwisata Religi Makam ...,Penelitian ini bertujuan untuk mengetahui baga...,The purpose of this study is to analyze the so...,"Dr. Diah Wahyuningsih, S.E., M.Si.","Dr. Eni Sri Rahayuningsih, S.E., M.E.",Magister Ilmu Ekonomi
477,160281100002,Indah Ainun Nikmah,Peranan Zakat Produktif Dalam Meningkatkan Eko...,Peranan Zakat Produktif dalam Meningkatkan Eko...,The Role of Productive Zakat in Improving Must...,"Dr. Kurniyati Indahsari, M.Si","Dr. Abdur Rahman, S.Ag. MEI",Magister Ilmu Ekonomi
478,170361100010,ahmad syaiful umam,KARAKTERISASI DAN KOLEKSI PLASMA NUTFAH UNTUK ...,Madura merupakan salah satu wilayah pemasok ko...,Madura is one of the regions supplying horticu...,"Dr. Ir. Gita Pawana, M.Si","Dr. Ir. Hj. SIti Fatimah, M.Si",Magister Pengelolaan Sumber Daya Alam
479,170361100001,Siti Holifah,PENGOLAHAN LIMBAH AIR REBUSAN IKAN TERI MENJAD...,Ikan Teri perlu penanganan serius pasca panen ...,Anchovy needs serious handling after harvest b...,"Dr.Apri Arisandi,S.Pi.,M.Si.","Dr.Ir.H.Asfan,MP.",Magister Pengelolaan Sumber Daya Alam


### Fungsi Crawling Data Fakultas Teknik

In [None]:
def pta_prd(prodi_list=None):
    """Crawl every thesis entry of selected study programs into ``pta_prd.csv``.

    For each selected study-program id the page count is read with
    ``get_max_page``, every listing page is fetched, and the detail page of
    each listed entry is parsed for id, title, author, both supervisors and
    the two abstracts. Progress is drawn with ``print_progress`` after each
    listing page.

    A listing page without ``div#begin`` is skipped. An entry without a detail
    link, or whose detail page has no ``div#content_journal``, is skipped with
    a note on stderr instead of aborting the crawl and losing collected rows.

    Args:
        prodi_list: Iterable of study-program ids to crawl. Defaults to
            ``[9, 10, 11, 19, 20, 23, 31, 32, 33]``.

    Returns:
        pandas.DataFrame with the collected rows; also written to
        ``pta_prd.csv``.
    """
    start_time = time.time()

    data = {
        "id": [],
        "penulis": [],
        "judul": [],
        "abstrak_id": [],
        "abstrak_en": [],
        "pembimbing_pertama": [],
        "pembimbing_kedua": [],
        "prodi": []
    }

    # Study programs to process.
    if prodi_list is None:
        prodi_list = [9, 10, 11, 19, 20, 23, 31, 32, 33]
    prodi_list = list(prodi_list)

    # Look up the page count of every selected study program before crawling.
    max_pages_dict = {i: get_max_page(i) for i in prodi_list}

    # Scrape each study program.
    for i in prodi_list:
        max_page = max_pages_dict[i]
        for j in range(1, max_page + 1):
            url = f"{BASE_URL}/{i}/{j}"
            r = requests.get(url)
            soup = BeautifulSoup(r.content, "html.parser")
            jurnals = soup.select('li[data-cat="#luxury"]')

            isii = soup.select_one('div#begin')
            if not isii:
                continue
            prodi_full = isii.select_one('h2').text.strip()
            prodi = prodi_full.replace("Journal Jurusan ", "")

            for jurnal in jurnals:
                tombol = jurnal.select_one('a.gray.button')
                if tombol is None or not tombol.get('href'):
                    print(f"\n⚠ Lewati entri tanpa link detail di {url}", file=sys.stderr)
                    continue
                link_keluar = tombol['href']

                # Take the entry id from the detail link.
                id_match = re.search(r"/detail/(\d+)", link_keluar)
                pta_id = id_match.group(1) if id_match else None

                response = requests.get(link_keluar)
                soup1 = BeautifulSoup(response.content, "html.parser")
                isi = soup1.select_one('div#content_journal')
                if isi is None:
                    # Broken or unexpected detail page: skip this entry only.
                    print(f"\n⚠ Lewati {link_keluar}: div#content_journal tidak ditemukan", file=sys.stderr)
                    continue

                judul = isi.select_one('a.title').text.strip()
                # ":-soup-contains" is the current name of the deprecated ":contains".
                penulis = isi.select_one('span:-soup-contains("Penulis")').text.split(' : ')[1]
                pembimbing_pertama = isi.select_one('span:-soup-contains("Dosen Pembimbing I")').text.split(' : ')[1]
                pembimbing_kedua = isi.select_one('span:-soup-contains("Dosen Pembimbing II")').text.split(' :')[1]

                # First justified paragraph is stored as the Indonesian abstract, second as the English one.
                paragraf = isi.select('p[align="justify"]')
                abstrak_id = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
                abstrak_en = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

                data["id"].append(pta_id)
                data["penulis"].append(penulis)
                data["judul"].append(judul)
                data["abstrak_id"].append(abstrak_id)
                data["abstrak_en"].append(abstrak_en)
                data["pembimbing_pertama"].append(pembimbing_pertama)
                data["pembimbing_kedua"].append(pembimbing_kedua)
                data["prodi"].append(prodi)

            # Update the progress bar for this study program.
            print_progress(i, prodi, j, max_page)

        sys.stdout.write("\n")  # move to a new line once a study program is done

    # Save to CSV.
    df = pd.DataFrame(data)
    df.to_csv("pta_prd.csv", index=False, encoding="utf-8-sig")

    # Compute the elapsed time.
    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

In [None]:
# Run the selected-programs crawl; pta_prd() writes "pta_prd.csv" and returns the DataFrame.
pta_prd()

[9] Teknik Industri | 100.00% | [[38;2;255;50;50m___/\______/\______/\______/\______/\______/\______/\[0m[38;2;160;160;160m[0m] Page 143/143
[10] Teknik Informatika | 100.00% | [[38;2;255;50;50m____/\______/\______/\______/\______/\______/\______/[0m[38;2;160;160;160m[0m] Page 172/172
[11] Manajemen Informatika | 100.00% | [[38;2;255;50;50m\______/\______/\______/\______/\______/\______/\____[0m[38;2;160;160;160m[0m] Page 56/56
[19] Teknik Multimedia Dan Jaringan | 100.00% | [[38;2;255;50;50m______/\______/\______/\______/\______/\______/\______[0m[38;2;160;160;160m[0m] Page 27/27
[20] Mekatronika | 100.00% | [[38;2;255;50;50m____/\______/\______/\______/\______/\______/\______/[0m[38;2;160;160;160m[0m] Page 28/28
[23] Teknik Elektro | 100.00% | [[38;2;255;50;50m____/\______/\______/\______/\______/\______/\______/[0m[38;2;160;160;160m[0m] Page 34/34
[31] Sistem Informasi | 100.00% | [[38;2;255;50;50m/\______/\______/\______/\______/\______/\______/\___[0m[38

Unnamed: 0,id,penulis,judul,abstrak_id,abstrak_en,pembimbing_pertama,pembimbing_kedua,prodi
0,080421100005,Siliwangi Fitra Rachmawanto S.T.,OPTIMASI PEMILIHAN PORTOFOLIO SAHAM PERUSAHAAN...,Portofolio adalah sekumpulan saham yang dimili...,Portofolio is a collection of stock owned by i...,"Heri Awalul Ilhamsah S.T., M.T.","Retno Indriartiningtias S.T., M.T.",Teknik Industri
1,080421100087,AHMAD MAS'UD,PERANCANGAN TATA LETAK FASILITAS LANTAI PRODUK...,PT. ABC merupakan perusahaan yang bergerak dib...,PT. ABC is a company engaged in the manufactur...,"SABARUDIN AKHMAD, S.T., M.T.","SUGENG PURWOKO, S.T., M.T.",Teknik Industri
2,080421100019,Yulianto Fauzanta,PERUMUSAN STRATEGI BISNIS UD. BUDI JAYA BANGKA...,Bangkalan merupakan salah satu kabupaten yang ...,Bangkalan is one of the districts that have th...,"Fitri Agustina, S.T., M.T","Retno Indriartiningtias, S.T., M.T",Teknik Industri
3,080421100055,M Mundir Muhlisin,USULAN PERBAIKAN UTILITAS RESOURCES PADA LANTA...,Simulasi adalah duplikasi atau abstraksi dari ...,Simulation is a duplication or abstraction of ...,Mu'alim ST MT,Sugeng Purwoko ST MT,Teknik Industri
4,080421100046,Muhibbin,Peningkatan Kepuasan Masyarakat Terhadap Pelay...,Kepuasan adalah tingkat perasaan seseorang ter...,Satisfaction is feeling level of someone to se...,Rahmad Hidayat,Retno Indriartiningtias,Teknik Industri
...,...,...,...,...,...,...,...,...
2284,160491200026,Mochamad Izar Bahroni,Kestabilan Autonomous Quadcopter Dengan Kontro...,Unmanned Aerial Vehicle (UAV) memiliki banyak ...,"Unmanned Aerial Vehicle (UAV) has many uses, i...","Sri Wahyuni, S.Kom., M.T","Faikul Umam, S.Kom., M.T",Teknik Mekatronika
2285,160491200023,Moch Zaini,perancangan dan pembuatan sistem rem elektrik ...,Kendaraan merupakan suatu alat yang banyak dig...,Vehicle is a tool that is widely used by human...,"Hairil Budiarto., ST.MT","Faikul Umam., S.Kom.,MT",Teknik Mekatronika
2286,160491200027,Mohammad Syafruddin,Pengatur Kerapatan Kertas Pada Alat Penggulung...,PT PAKERIN adalah salah satu pabrik kertas ter...,PT PAKERIN is one of the largest paper mills i...,"Faikul Umam S.Kom., M.T","Ahmad Sahru Romadhon S.Kom., M.T",Teknik Mekatronika
2287,160491200028,"Khanif Khoirul Umam, A.Md.",IMPLEMENTASI ALGORITMA MAZE MAPPING PENCARIAN ...,Robot shortest path merupakan jenis robot yang...,Shortest path robot is a type of robot that is...,"Sri Wahyuni, S.Kom., M.T.","Hairil Budiarto, S.T.,M.T.",Teknik Mekatronika


## Page & Link Keluar PTA

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import urllib3

# Silence urllib3's InsecureRequestWarning (the requests below are made with verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_all_links(base_url, max_pages=50):
    """Collect the links found on a site's pages, starting from ``base_url``.

    Each fetched page contributes one row per ``<a href>`` (empty and "#"
    hrefs are ignored), with the href resolved against the page URL. Links
    that start with ``base_url`` and have not been seen yet are then followed
    recursively; a newly seen link is fetched only while the number of seen
    URLs is below ``max_pages``. Pages are fetched with TLS verification
    disabled and a 10-second timeout; any error on a page is printed and that
    page is abandoned.

    Args:
        base_url: Start URL; also the prefix that marks a link as internal.
        max_pages: Bound on the number of seen URLs that limits the crawl.

    Returns:
        pandas.DataFrame with columns "No", "Page" and "Link Keluar", indexed
        from 1.
    """
    seen = {base_url}
    rows = []

    def resolved_links(soup, page_url):
        # Yield each usable href on the page, resolved against the page URL.
        for anchor in soup.find_all('a', href=True):
            target = anchor['href'].strip()
            if target and target != "#":
                yield urljoin(page_url, target)

    def crawl(page_url):
        try:
            response = requests.get(page_url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # First pass: record every link on this page.
            for link in resolved_links(soup, page_url):
                rows.append({
                    "Page": page_url,
                    "Link Keluar": link
                })

            # Second pass: follow internal links (same base URL) not seen before.
            for link in resolved_links(soup, page_url):
                if not link.startswith(base_url) or link in seen:
                    continue
                seen.add(link)
                if len(seen) < max_pages:
                    crawl(link)

        except Exception as e:
            print(f"⚠ Gagal akses {page_url}: {e}")

    # Start from the base URL.
    crawl(base_url)

    # Tidy the frame: 1-based index plus a matching "No" column in front.
    df = pd.DataFrame(rows).reset_index(drop=True)
    df.index += 1
    df.insert(0, "No", df.index)
    return df

# Example usage
url = "https://informatika.trunojoyo.ac.id/"
df_links = scrape_all_links(url, max_pages=30)  # max_pages caps the crawl so it cannot run forever

df_links
# df_links.to_csv("semua_link.csv", index=False, encoding="utf-8-sig")

Unnamed: 0,No,Page,Link Keluar
1,1,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/
2,2,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
3,3,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
4,4,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
5,5,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
...,...,...,...
1704,1704,https://informatika.trunojoyo.ac.id/unit-kegia...,https://ukmfteecom.vercel.app/
1705,1705,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...
1706,1706,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...
1707,1707,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...


## 2. Crawling Berita

### Fungsi

In [None]:
def print_progress_wave(kategori, current_page, total_pages, bar_length=30):
    """Redraw a one-line, wave-styled progress bar for one news category.

    The line starts with a carriage return so successive calls overwrite each
    other. While the category is in progress the filled part cycles through
    wave glyphs; once ``current_page`` reaches ``total_pages`` it is drawn
    with plain "~" and two newlines are written.

    Args:
        kategori: Category label shown at the start of the line.
        current_page: Page just processed.
        total_pages: Number of pages being crawled for this category.
        bar_length: Width of the bar in characters.
    """
    percent = (current_page / total_pages) * 100
    n_filled = int(bar_length * current_page // total_pages)

    # 24-bit ANSI colour escapes
    blue, grey, reset = "\033[38;2;0;150;255m", "\033[38;2;160;160;160m", "\033[0m"

    # Wave glyphs used for the animation
    waves = ("~", "≈", "≋", "∿", "〜")

    if current_page < total_pages:
        # Animate only below 100%: the page number shifts the pattern on each redraw.
        crest = "".join(waves[(k + current_page) % len(waves)] for k in range(n_filled))
    else:
        # Finished: settle on a single glyph.
        crest = "~" * n_filled

    # Append the empty track.
    track = '-' * (bar_length - n_filled)
    bar = f"{blue}{crest}{reset}{grey}{track}{reset}"

    sys.stdout.write(f'\r{kategori} - Page {current_page}/{total_pages} [{bar}] {percent:6.2f}%')
    sys.stdout.flush()
    if current_page != total_pages:
        return
    sys.stdout.write('\n\n')

In [None]:
def get_article_content(url, timeout=30):
    """Download an article page and return its body text as a single string.

    Text is taken from every ``<p>`` inside ``div#content-wrapper``.
    Paragraphs that are empty or start with "baca juga" (case-insensitive)
    are dropped; the rest are joined with single spaces.

    Args:
        url: Article URL.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so one stalled article cannot hang the crawl.

    Returns:
        The article text, or an empty string when no paragraph qualifies.
    """
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")

    paragraphs = []
    for div in soup.find_all("div", id="content-wrapper"):
        for p in div.find_all("p"):
            # get_text(strip=True) strips each text fragment before joining,
            # which glues words around inline tags together
            # ("Partai GerindraRahayu ..."). Collapse whitespace on the
            # unstripped text instead so the original word spacing survives.
            text = " ".join(p.get_text().split())
            if text and not text.lower().startswith("baca juga"):
                paragraphs.append(text)
    return " ".join(paragraphs)

### Fungsi Crawling Berita

In [None]:
def berita(categories, pages_per_category=1):
    """Crawl Tempo index pages per category and save the articles to CSV.

    For each category slug, index pages 1..``pages_per_category`` are fetched.
    Every article link on a page is recorded with its id (trailing digits of
    the link), title, body text from ``get_article_content`` and category.
    Progress is drawn with ``print_progress_wave`` after each index page, and
    a summary (row count, elapsed time) is printed at the end.

    Args:
        categories: Iterable of rubric slugs substituted into the index URL.
        pages_per_category: Number of index pages to crawl per category.

    Returns:
        pandas.DataFrame with the collected rows; also written to
        ``tempo_berita.csv``.
    """
    start_time = time.time()  # start the timer

    BASE_URL = "https://www.tempo.co/indeks?page={}&category=rubrik&rubric_slug={}"

    data = {
        "id_berita": [],
        "judul_berita": [],
        "isi_berita": [],
        "kategori_berita": []
    }

    for cat in categories:
        for page in range(1, pages_per_category + 1):
            url = BASE_URL.format(page, cat)
            r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(r.text, "html.parser")

            articles = soup.select("figure figcaption a")
            for a in articles:
                link = "https://www.tempo.co" + a["href"]
                title = a.get_text(strip=True)

                id_match = re.search(r"-(\d+)$", link)
                berita_id = id_match.group(1) if id_match else None

                # Best effort: keep the row with empty content when the article
                # cannot be fetched or parsed. Catch Exception rather than using
                # a bare except so Ctrl-C / SystemExit still stop the crawl.
                try:
                    content = get_article_content(link)
                except Exception:
                    content = ""

                data["id_berita"].append(berita_id)
                data["judul_berita"].append(title)
                data["isi_berita"].append(content)
                data["kategori_berita"].append(cat)

            print_progress_wave(cat, page, pages_per_category)

    df = pd.DataFrame(data)
    df.to_csv("tempo_berita.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

# Tempo rubric slugs to crawl; each one is substituted into the index URL's rubric_slug parameter.
categories = ["politik", "hukum", "ekonomi", "lingkungan", "hiburan", "internasional", "otomotif", "olahraga", "sepakbola"]

In [None]:
# Crawl 5 index pages per category; berita() writes "tempo_berita.csv" and returns the DataFrame.
berita(categories, pages_per_category=5)

politik - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

hukum - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

ekonomi - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

lingkungan - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

hiburan - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

internasional - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

otomotif - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

olahraga - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%

sepakbola - Page 5/5 [[38;2;0;150;255m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[38;2;160;160;160m[0m] 100.00%


✅ Seluruh data berhasil dikumpulkan!
📊 Tota

Unnamed: 0,id_berita,judul_berita,isi_berita,kategori_berita
0,2068581,"Profil Rahayu Saraswati, Keponakan Prabowo yan...",POLITIKUS Partai GerindraRahayu SaraswatiDjojo...,politik
1,2068579,Pernyataan Rahayu Saraswati yang Jadi Alasanny...,POLITIKUS PartaiGerindraRahayu SaraswatiDjojoh...,politik
2,2068569,Purbaya Klarifikasi Unggahan Anaknya di Media ...,MENTERI KeuanganPurbayaYudhi Sadewa menanggapi...,politik
3,2068567,Gerindra Memproses Pengunduran Diri Rahayu Sar...,FRAKSI PartaiGerindradi Dewan Perwakilan Rakya...,politik
4,2068546,Rahayu Saraswati Mundur dari DPR,ANGGOTA Fraksi PartaiGerindraDPRRahayu Saraswa...,politik
...,...,...,...,...
895,2066151,Jadwal Siaran Langsung Timnas U-23 Indonesia d...,TIMNAS U-23 Indonesia akan memulai laga babakK...,sepakbola
896,2066144,Pesan Alexander Isak kepada Newcastle Usai Sag...,PEMAIN sepak bolaAlexander Isakberterima kasih...,sepakbola
897,2066121,Prediksi Indonesia vs Laos di Kualifikasi Pial...,DUEL Timnas U-23Indonesia vs Laosakan tersaji ...,sepakbola
898,2066036,Alasan Patrick Kluivert Puji Thom Haye dan Eli...,PELATIHTimnas IndonesiaPatrick Kluivert menila...,sepakbola


### Page & Link Keluar Berita

In [None]:
def berita_links(categories, pages_per_category=1):
    """Collect Tempo article links per index page and save them to CSV.

    For each category slug, index pages 1..``pages_per_category`` are fetched.
    Every article link on a page is recorded with its id (trailing digits of
    the link), the index page URL it was found on, and the article URL. A
    summary (row count, elapsed time) is printed at the end.

    Args:
        categories: Iterable of rubric slugs substituted into the index URL.
        pages_per_category: Number of index pages to crawl per category.

    Returns:
        pandas.DataFrame with columns "id_berita", "page" and "link_keluar";
        also written to ``tempo_links.csv``.
    """
    start_time = time.time()  # start the timer

    BASE_URL = "https://www.tempo.co/indeks?page={}&category=rubrik&rubric_slug={}"

    data = {
        "id_berita": [],
        "page": [],
        "link_keluar": []
    }

    for cat in categories:
        for page in range(1, pages_per_category + 1):
            url = BASE_URL.format(page, cat)
            r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(r.text, "html.parser")

            articles = soup.select("figure figcaption a")
            for a in articles:
                link = "https://www.tempo.co" + a["href"]
                id_match = re.search(r"-(\d+)$", link)
                berita_id = id_match.group(1) if id_match else None

                data["id_berita"].append(berita_id)
                data["page"].append(url)          # index page URL
                data["link_keluar"].append(link)  # article URL

            # Update the progress bar. print_progress() takes four positional
            # arguments (prodi_id, prodi, current_page, total_pages), so the
            # three-argument category bar is the one that fits this call.
            print_progress_wave(cat, page, pages_per_category)

    df = pd.DataFrame(data)
    df.to_csv("tempo_links.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary
    print("\n✅ Seluruh link berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

# Tempo rubric slugs to crawl; each one is substituted into the index URL's rubric_slug parameter.
categories = ["politik", "hukum", "ekonomi", "lingkungan", "hiburan", "internasional", "otomotif", "olahraga", "sepakbola"]

In [None]:
# Collect article links from 5 index pages per category; berita_links() writes "tempo_links.csv" and returns the DataFrame.
berita_links(categories, pages_per_category=5)

politik - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

hukum - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

ekonomi - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

lingkungan - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

hiburan - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

internasional - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

otomotif - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

olahraga - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%

sepakbola - Page 5/5 [[38;2;0;150;255m~≈≋∿〜~≈≋∿〜~≈≋∿〜~≈≋∿〜[0m[38;2;160;160;160m[0m] 100.00%


✅ Seluruh link berhasil dikumpulkan!
📊 Total entri: 900
⏱️ Waktu eksekusi: 0 jam 1 menit 5 detik


Unnamed: 0,id_berita,page,link_keluar
0,2068581,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/profil-rahayu-sar...
1,2068579,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/pernyataan-rahayu...
2,2068569,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/purbaya-klarifika...
3,2068567,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/gerindra-memprose...
4,2068546,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/rahayu-saraswati-...
...,...,...,...
895,2066151,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/jadwal-siaran-l...
896,2066144,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/pesan-alexander...
897,2066121,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/prediksi-indone...
898,2066036,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/alasan-patrick-...
