# Crawling PTA & Berita

### Library

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, sys, time

## 1. Crawling PTA

In [9]:
# Base URL of the PTA per-programme listing; get_max_page and pta_all append
# "/<prodi_id>/<page>" to it.
BASE_URL = "https://pta.trunojoyo.ac.id/c_search/byprod"

### Fungsi

In [10]:
def get_max_page(prodi_id, timeout=30):
    """Return the number of listing pages available for one study programme.

    Fetches page 1 of ``{BASE_URL}/{prodi_id}`` and reads the page number from
    the "»" (last page) link inside ``ol.pagination``.

    Args:
        prodi_id: Programme identifier used in the listing URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The last page number as an int, or 1 when no usable "»" link exists.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    url = f"{BASE_URL}/{prodi_id}/1"
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, "html.parser")

    # Find the "»" (last page) link by its text. Filtering in Python avoids the
    # deprecated ":contains" CSS pseudo-class while matching the same element.
    last_page = next(
        (a for a in soup.select("ol.pagination a") if "»" in a.get_text()),
        None,
    )
    if last_page is not None and last_page.has_attr("href"):
        # The page number is the trailing numeric segment of the URL; tolerate
        # a trailing slash instead of failing on int("").
        match = re.search(r"(\d+)/?$", last_page["href"])
        if match:
            return int(match.group(1))

    # Fallback when there is no pagination (or no parsable last-page link).
    return 1

In [11]:
# Example usage
print(get_max_page(10))

172




In [12]:
def print_progress(prodi_id, prodi, current_page, total_pages):
    """Redraw a single-line text progress bar for one programme.

    The line starts with a carriage return so it overwrites the previous
    state. Once ``current_page`` equals ``total_pages`` one newline is written
    to finish the line.
    """
    percent = (current_page / total_pages) * 100
    width = 20
    filled = int(width * current_page // total_pages)
    bar = "".join(('█' * filled, '-' * (width - filled)))

    stream = sys.stdout
    stream.write(f'\r[{prodi_id}] {prodi} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    stream.flush()
    if current_page != total_pages:
        return
    stream.write('\n')

### Fungsi Crawling Semua Data PTA

In [27]:
def pta_all(total_prodi=3, timeout=30):
    """Crawl every listing page of programmes 1..``total_prodi`` on PTA.

    For each programme the page count comes from ``get_max_page``; every
    listing page is fetched, each entry's detail page is opened, and id,
    author, title, both abstracts, both supervisors and the programme name are
    collected. The result is written to ``pta_all.csv`` (UTF-8 with BOM).

    Listing or detail pages that cannot be fetched, and detail pages without
    ``div#content_journal``, are skipped and counted instead of aborting the
    crawl. Fields missing from a detail page are stored as ``None``.

    Args:
        total_prodi: Number of programmes to crawl, starting at id 1.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        pandas.DataFrame with one row per crawled entry.
    """
    start_time = time.time()

    def find_field(container, label, separator):
        """Return the text after ``separator`` in the first span containing ``label``."""
        for span in container.select('span'):
            text = span.get_text()
            if label in text:
                parts = text.split(separator)
                return parts[1] if len(parts) > 1 else None
        return None

    data = {
        "id": [],
        "penulis": [],
        "judul": [],
        "abstrak_id": [],
        "abstrak_en": [],
        "pembimbing_pertama": [],
        "pembimbing_kedua": [],
        "prodi": []
    }

    failed = 0  # pages/entries skipped because they could not be fetched or parsed

    # Look up the page count of every programme before crawling starts.
    max_pages_dict = {i: get_max_page(i) for i in range(1, total_prodi + 1)}

    for i in range(1, total_prodi + 1):
        max_page = max_pages_dict[i]
        for j in range(1, max_page + 1):
            url = f"{BASE_URL}/{i}/{j}"
            try:
                r = requests.get(url, timeout=timeout)
            except requests.RequestException:
                failed += 1
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            jurnals = soup.select('li[data-cat="#luxury"]')

            isii = soup.select_one('div#begin')
            heading = isii.select_one('h2') if isii is not None else None
            if heading is None:
                continue
            prodi_full = heading.text.strip()
            prodi = prodi_full.replace("Journal Jurusan ", "")

            for jurnal in jurnals:
                anchor = jurnal.select_one('a.gray.button')
                if anchor is None or not anchor.has_attr('href'):
                    failed += 1
                    continue
                link_keluar = anchor['href']

                # The PTA id is the number following "/detail/" in the link.
                id_match = re.search(r"/detail/(\d+)", link_keluar)
                pta_id = id_match.group(1) if id_match else None

                try:
                    response = requests.get(link_keluar, timeout=timeout)
                except requests.RequestException:
                    failed += 1
                    continue
                soup1 = BeautifulSoup(response.content, "html.parser")
                isi = soup1.select_one('div#content_journal')
                if isi is None:
                    failed += 1
                    continue

                title_tag = isi.select_one('a.title')
                judul = title_tag.text.strip() if title_tag is not None else None
                penulis = find_field(isi, "Penulis", ' : ')
                # "Dosen Pembimbing I" is also a substring of "... II"; the
                # first matching span in document order is taken.
                pembimbing_pertama = find_field(isi, "Dosen Pembimbing I", ' : ')
                # NOTE(review): this separator has no trailing space, unlike
                # the two above; kept as-is to leave the stored values unchanged.
                pembimbing_kedua = find_field(isi, "Dosen Pembimbing II", ' :')

                paragraf = isi.select('p[align="justify"]')
                abstrak_id = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
                abstrak_en = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

                data["id"].append(pta_id)
                data["penulis"].append(penulis)
                data["judul"].append(judul)
                data["abstrak_id"].append(abstrak_id)
                data["abstrak_en"].append(abstrak_en)
                data["pembimbing_pertama"].append(pembimbing_pertama)
                data["pembimbing_kedua"].append(pembimbing_kedua)
                data["prodi"].append(prodi)

            # Update the per-programme progress bar.
            print_progress(i, prodi, j, max_page)

        sys.stdout.write("\n")  # move to a new line once a programme is done

    # Save to CSV.
    df = pd.DataFrame(data)
    df.to_csv("pta_all.csv", index=False, encoding="utf-8-sig")

    # Compute the elapsed time.
    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary.
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")
    if failed:
        print(f"⚠️ Halaman/entri yang dilewati karena gagal diambil: {failed}")

    return df

In [28]:
# Run the full crawl; the result is also written to pta_all.csv.
pta_all()

[1] Ilmu Hukum - Page 284/284 [████████████████████] 100.00%

[2] Teknologi Industri Pertanian - Page 114/114 [████████████████████] 100.00%

[3] Agribisnis - Page 110/110 [████████████████████] 100.00%


✅ Seluruh data berhasil dikumpulkan!
📊 Total entri: 2532
⏱️ Waktu eksekusi: 3 jam 48 menit 34 detik


Unnamed: 0,id,penulis,judul,abstrak_id,abstrak_en,pembimbing_pertama,pembimbing_kedua,prodi
0,080111100012,Dyah Ayu Citra Seza,Implementasi Fungsi Legislasi Dewan Perwakilan...,ABSTRAK\r\n\r\n Implementasi Fungsi Legi...,ABSTRACT\r\n Implementation of Legislati...,"Yudi Widagdo Harimurti, SH., MH","Safi', SH., MH",Ilmu Hukum
1,080111100002,Maulina Nurlaily,Pertanggungjawaban Pidana Direksi BUMN (Perser...,Badan Usaha Milik Negara (BUMN) adalah Badan u...,State Owned Enterprises (SOEs) are business en...,"Tolib Effendi, SH., MH.","Dr. Eni Suastuti, SH., Mhum.",Ilmu Hukum
2,070111100060,Moh. Samsul Hidayat,Analisis Terhadap Kekosongan Hukum dalam Penga...,Kasus narkoba tidak henti-hentinya terdengar d...,"Drug cases endlessly heard on television, radi...","Tolib Effendi, SH., MH.","Agus Ramdlany, SH., MH.",Ilmu Hukum
3,090111100077,TOMMY ADITYA PARLINDUNGAN MARBUN,PERLINDUNGAN HUKUM BAGI KONSUMEN ATAS PRODUK E...,Produk elektronik adalah suatu benda bergerak ...,Electronic products is an object moves through...,"DR. DJULAEKA, S.H., M.HUM","DR.USWATUN HASANAH, S.H., M. HUM",Ilmu Hukum
4,070111200007,RICA YENA IMADHORA,TELAAH KRITIS TENTANG ALASAN HUKUM YANG DIGUN...,,,"Dr. DENI SBY, S. H., M. S.","SAIFUL ABDULLAH, S. H., M. H.",Ilmu Hukum
...,...,...,...,...,...,...,...,...
2527,160321100007,Meilinda Sari,Meilinda Sari\n160321100007\nAgribinis,,,"Novi Diana Badrut Tamami, SP.MP","Dr. Teti Sugiatri,SP.,M.Si",Agribisnis
2528,160321100064,Chilyatun Nafisah Oktavina,Chilyatun Nafisah Oktavina\n160321100064\nAgri...,,,"Dr. Teti Sugiarti, SP., M.Si.","Dr. Elys Fauziyah, SP., MP.",Agribisnis
2529,160321100035,I'ANATUS SHOFIYAH,I'ANATUS SHOFIYAH;160321100035;AGRIBISNIS,I'ANATUS SHOFIYAH;160321100035;AGRIBISNIS,I'ANATUS SHOFIYAH;160321100035;AGRIBISNIS,"Dr. Teti Sugiarti, SP., M.Si.","Andrie K Sunyigono, SP., MP., Ph.D.",Agribisnis
2530,160321100038,Lilis Nur Azizah,Lilis Nur Azizah\n160321100038\nAgribisnis,,,"Dr. Teti Sugiarti, S.P., M.Si","Dr. Mardiyah Hayati, S.P., M.P",Agribisnis


### Fungsi Crawling Data PTA (Dengan Batas Page)

In [13]:
def print_progress(prodi_id, prodi, current_page, total_pages):
    """Redraw the in-place progress line for one programme.

    Writes a carriage-return-prefixed status line with a 20-character bar and
    a percentage, flushes it, and appends two newlines when the last page
    (``current_page == total_pages``) has been reached.
    """
    percent = (current_page / total_pages) * 100
    slots = 20
    done = int(slots * current_page // total_pages)
    remaining = slots - done
    bar = '█' * done
    bar += '-' * remaining

    line = f'\r[{prodi_id}] {prodi} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%'
    sys.stdout.write(line)
    sys.stdout.flush()
    finished = current_page == total_pages
    if finished:
        sys.stdout.write('\n\n')

def pta(total_prodi=41, total_pages=15, timeout=30):
    """Crawl a fixed number of listing pages per programme on PTA.

    For programmes 1..``total_prodi`` the first ``total_pages`` listing pages
    are fetched, each entry's detail page is opened, and id, author, title,
    both abstracts, both supervisors and the programme name are collected.
    The result is written to ``pta.csv`` (UTF-8 with BOM).

    Listing or detail pages that cannot be fetched, and detail pages without
    ``div#content_journal``, are skipped and counted instead of aborting the
    crawl. Fields missing from a detail page are stored as ``None``.

    Args:
        total_prodi: Number of programmes to crawl, starting at id 1.
        total_pages: Number of listing pages to crawl per programme.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        pandas.DataFrame with one row per crawled entry.
    """
    start_time = time.time()  # start the execution timer

    def find_field(container, label, separator):
        """Return the text after ``separator`` in the first span containing ``label``."""
        for span in container.select('span'):
            text = span.get_text()
            if label in text:
                parts = text.split(separator)
                return parts[1] if len(parts) > 1 else None
        return None

    data = {
        "id": [],
        "penulis": [],
        "judul": [],
        "abstrak id": [],
        "abstrak en": [],
        "pembimbing_pertama": [],
        "pembimbing_kedua": [],
        "prodi": [],
    }

    failed = 0  # pages/entries skipped because they could not be fetched or parsed

    for i in range(1, total_prodi + 1):  # programmes
        prodi_name = None

        for j in range(1, total_pages + 1):  # listing pages
            url = f"https://pta.trunojoyo.ac.id/c_search/byprod/{i}/{j}"
            try:
                r = requests.get(url, timeout=timeout)
            except requests.RequestException:
                failed += 1
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            jurnals = soup.select('li[data-cat="#luxury"]')

            isii = soup.select_one('div#begin')
            heading = isii.select_one('h2') if isii is not None else None
            if heading is None:
                continue
            prodi_full = heading.text.strip()
            prodi = prodi_full.replace("Journal Jurusan ", "")
            if not prodi_name:
                # The progress bar shows the first programme name seen.
                prodi_name = prodi

            for jurnal in jurnals:
                anchor = jurnal.select_one('a.gray.button')
                if anchor is None or not anchor.has_attr('href'):
                    failed += 1
                    continue
                link = anchor['href']

                # The PTA id is the number following "/detail/" in the link.
                id_match = re.search(r"/detail/(\d+)", link)
                pta_id = id_match.group(1) if id_match else None

                try:
                    response = requests.get(link, timeout=timeout)
                except requests.RequestException:
                    failed += 1
                    continue
                soup1 = BeautifulSoup(response.content, "html.parser")
                isi = soup1.select_one('div#content_journal')
                if isi is None:
                    failed += 1
                    continue

                # Title
                title_tag = isi.select_one('a.title')
                judul = title_tag.text if title_tag is not None else None

                # Author
                penulis = find_field(isi, "Penulis", ' : ')

                # First supervisor ("Dosen Pembimbing I" is also a substring
                # of "... II"; the first matching span is taken).
                pembimbing_pertama = find_field(isi, "Dosen Pembimbing I", ' : ')

                # Second supervisor.
                # NOTE(review): this separator has no trailing space, unlike
                # the two above; kept as-is to leave the stored values unchanged.
                pembimbing_kedua = find_field(isi, "Dosen Pembimbing II", ' :')

                # Abstracts: first justified paragraph, then the second.
                paragraf = isi.select('p[align="justify"]')
                abstrak = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
                abstract = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

                # Store the row.
                data["id"].append(pta_id)
                data["penulis"].append(penulis)
                data["judul"].append(judul)
                data["pembimbing_pertama"].append(pembimbing_pertama)
                data["pembimbing_kedua"].append(pembimbing_kedua)
                data["abstrak id"].append(abstrak)
                data["abstrak en"].append(abstract)
                data["prodi"].append(prodi)

            # Update the progress bar.
            print_progress(i, prodi_name, j, total_pages)

    df = pd.DataFrame(data)
    df.to_csv("pta.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary.
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")
    if failed:
        print(f"⚠️ Halaman/entri yang dilewati karena gagal diambil: {failed}")

    return df

In [14]:
# Run the page-limited crawl; the result is also written to pta.csv.
pta()

[1] Ilmu Hukum - Page 15/15 [████████████████████] 100.00%

[2] Teknologi Industri Pertanian - Page 15/15 [████████████████████] 100.00%

[3] Agribisnis - Page 15/15 [████████████████████] 100.00%

[4] Agroteknologi - Page 15/15 [████████████████████] 100.00%

[5] Ilmu Kelautan - Page 15/15 [████████████████████] 100.00%

[6] Ekonomi Pembangunan - Page 15/15 [████████████████████] 100.00%

[7] Manajemen - Page 15/15 [████████████████████] 100.00%

[8] Akuntansi - Page 15/15 [████████████████████] 100.00%

[9] Teknik Industri - Page 15/15 [████████████████████] 100.00%

[10] Teknik Informatika - Page 15/15 [████████████████████] 100.00%

[11] Manajemen Informatika - Page 15/15 [████████████████████] 100.00%

[12] Sosiologi - Page 15/15 [████████████████████] 100.00%

[13] Ilmu Komunikasi - Page 15/15 [████████████████████] 100.00%

[14] Psikologi - Page 15/15 [████████████████████] 100.00%

[15] Sastra Inggris - Page 15/15 [████████████████████] 100.00%

[16] Ekonomi Syariah - Page 15/1

Unnamed: 0,id,penulis,judul,abstrak id,abstrak en,pembimbing_pertama,pembimbing_kedua,prodi
0,080111100012,Dyah Ayu Citra Seza,Implementasi Fungsi Legislasi Dewan Perwakilan...,ABSTRAK\r\n\r\n Implementasi Fungsi Legi...,ABSTRACT\r\n Implementation of Legislati...,"Yudi Widagdo Harimurti, SH., MH","Safi', SH., MH",Ilmu Hukum
1,080111100002,Maulina Nurlaily,Pertanggungjawaban Pidana Direksi BUMN (Perser...,Badan Usaha Milik Negara (BUMN) adalah Badan u...,State Owned Enterprises (SOEs) are business en...,"Tolib Effendi, SH., MH.","Dr. Eni Suastuti, SH., Mhum.",Ilmu Hukum
2,070111100060,Moh. Samsul Hidayat,Analisis Terhadap Kekosongan Hukum dalam Penga...,Kasus narkoba tidak henti-hentinya terdengar d...,"Drug cases endlessly heard on television, radi...","Tolib Effendi, SH., MH.","Agus Ramdlany, SH., MH.",Ilmu Hukum
3,090111100077,TOMMY ADITYA PARLINDUNGAN MARBUN,PERLINDUNGAN HUKUM BAGI KONSUMEN ATAS PRODUK E...,Produk elektronik adalah suatu benda bergerak ...,Electronic products is an object moves through...,"DR. DJULAEKA, S.H., M.HUM","DR.USWATUN HASANAH, S.H., M. HUM",Ilmu Hukum
4,070111200007,RICA YENA IMADHORA,TELAAH KRITIS TENTANG ALASAN HUKUM YANG DIGUN...,,,"Dr. DENI SBY, S. H., M. S.","SAIFUL ABDULLAH, S. H., M. H.",Ilmu Hukum
...,...,...,...,...,...,...,...,...
2261,160281100013,"Lisa Sri rahmatullah, S. Sos. I",Dampak Sosial Ekonomi Pariwisata Religi Makam ...,Penelitian ini bertujuan untuk mengetahui baga...,The purpose of this study is to analyze the so...,"Dr. Diah Wahyuningsih, S.E., M.Si.","Dr. Eni Sri Rahayuningsih, S.E., M.E.",Magister Ilmu Ekonomi
2262,160281100002,Indah Ainun Nikmah,Peranan Zakat Produktif Dalam Meningkatkan Eko...,Peranan Zakat Produktif dalam Meningkatkan Eko...,The Role of Productive Zakat in Improving Must...,"Dr. Kurniyati Indahsari, M.Si","Dr. Abdur Rahman, S.Ag. MEI",Magister Ilmu Ekonomi
2263,170361100010,ahmad syaiful umam,KARAKTERISASI DAN KOLEKSI PLASMA NUTFAH UNTUK ...,Madura merupakan salah satu wilayah pemasok ko...,Madura is one of the regions supplying horticu...,"Dr. Ir. Gita Pawana, M.Si","Dr. Ir. Hj. SIti Fatimah, M.Si",Magister Pengelolaan Sumber Daya Alam
2264,170361100001,Siti Holifah,PENGOLAHAN LIMBAH AIR REBUSAN IKAN TERI MENJAD...,Ikan Teri perlu penanganan serius pasca panen ...,Anchovy needs serious handling after harvest b...,"Dr.Apri Arisandi,S.Pi.,M.Si.","Dr.Ir.H.Asfan,MP.",Magister Pengelolaan Sumber Daya Alam


## Page & Link Keluar PTA

In [24]:
def print_progress(prodi_id, prodi, current_page, total_pages):
    """Show crawl progress for one programme on a single, rewritten line.

    Output goes to ``sys.stdout``: a carriage return, the programme label, the
    page counter, a 20-cell bar and the percentage. Two newlines follow once
    ``current_page`` equals ``total_pages``.
    """
    percent = (current_page / total_pages) * 100
    cells = 20
    filled_cells = int(cells * current_page // total_pages)
    bar = "".join(['█' * filled_cells, '-' * (cells - filled_cells)])

    out = sys.stdout
    out.write(f'\r[{prodi_id}] {prodi} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    out.flush()
    if current_page != total_pages:
        return
    out.write('\n\n')

def pta_links(total_prodi=9, total_pages=15, timeout=30):
    """Collect the detail-page links found on PTA listing pages.

    For programmes 1..``total_prodi`` the first ``total_pages`` listing pages
    are fetched; every entry's detail link is stored together with a running
    number and the listing-page URL it came from. The result is written to
    ``pta_links.csv``.

    Listing pages that cannot be fetched and entries without a detail link
    are skipped and counted instead of aborting the crawl.

    Args:
        total_prodi: Number of programmes to crawl, starting at id 1.
        total_pages: Number of listing pages to crawl per programme.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        pandas.DataFrame with columns ``no``, ``page`` and ``link_keluar``.
    """
    start_time = time.time()  # start the execution timer

    data = {
        "no": [],
        "page": [],
        "link_keluar": []
    }

    no = 1  # running row number
    failed = 0  # pages/entries skipped because they could not be fetched or parsed

    for i in range(1, total_prodi + 1):  # programmes
        prodi_name = None

        for j in range(1, total_pages + 1):  # listing pages
            url = f"https://pta.trunojoyo.ac.id/c_search/byprod/{i}/{j}"
            try:
                r = requests.get(url, timeout=timeout)
            except requests.RequestException:
                failed += 1
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            jurnals = soup.select('li[data-cat="#luxury"]')

            isii = soup.select_one('div#begin')
            heading = isii.select_one('h2') if isii is not None else None
            if heading is None:
                continue
            if not prodi_name:
                # The progress bar shows the first programme name seen.
                prodi_name = heading.text.strip().replace("Journal Jurusan ", "")

            for jurnal in jurnals:
                anchor = jurnal.select_one('a.gray.button')
                if anchor is None or not anchor.has_attr('href'):
                    failed += 1
                    continue

                data["no"].append(no)
                data["page"].append(url)                     # listing page URL
                data["link_keluar"].append(anchor['href'])   # detail page URL
                no += 1

            # Update the progress bar.
            print_progress(i, prodi_name, j, total_pages)

    df = pd.DataFrame(data)
    df.to_csv("pta_links.csv", index=False)

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary.
    print("\n✅ Seluruh link berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")
    if failed:
        print(f"⚠️ Halaman/entri yang dilewati karena gagal diambil: {failed}")

    return df

In [25]:
# Collect listing-page / detail-link pairs; also written to pta_links.csv.
pta_links()

[1] Ilmu Hukum - Page 15/15 [████████████████████] 100.00%

[2] Teknologi Industri Pertanian - Page 15/15 [████████████████████] 100.00%

[3] Agribisnis - Page 15/15 [████████████████████] 100.00%

[4] Agroteknologi - Page 15/15 [████████████████████] 100.00%

[5] Ilmu Kelautan - Page 15/15 [████████████████████] 100.00%

[6] Ekonomi Pembangunan - Page 15/15 [████████████████████] 100.00%

[7] Manajemen - Page 15/15 [████████████████████] 100.00%

[8] Akuntansi - Page 15/15 [████████████████████] 100.00%

[9] Teknik Industri - Page 15/15 [████████████████████] 100.00%


✅ Seluruh link berhasil dikumpulkan!
📊 Total entri: 675
⏱️ Waktu eksekusi: 0 jam 10 menit 52 detik


Unnamed: 0,no,page,link_keluar
0,1,https://pta.trunojoyo.ac.id/c_search/byprod/1/1,https://pta.trunojoyo.ac.id/welcome/detail/080...
1,2,https://pta.trunojoyo.ac.id/c_search/byprod/1/1,https://pta.trunojoyo.ac.id/welcome/detail/080...
2,3,https://pta.trunojoyo.ac.id/c_search/byprod/1/1,https://pta.trunojoyo.ac.id/welcome/detail/070...
3,4,https://pta.trunojoyo.ac.id/c_search/byprod/1/1,https://pta.trunojoyo.ac.id/welcome/detail/090...
4,5,https://pta.trunojoyo.ac.id/c_search/byprod/1/1,https://pta.trunojoyo.ac.id/welcome/detail/070...
...,...,...,...
670,671,https://pta.trunojoyo.ac.id/c_search/byprod/9/15,https://pta.trunojoyo.ac.id/welcome/detail/080...
671,672,https://pta.trunojoyo.ac.id/c_search/byprod/9/15,https://pta.trunojoyo.ac.id/welcome/detail/080...
672,673,https://pta.trunojoyo.ac.id/c_search/byprod/9/15,https://pta.trunojoyo.ac.id/welcome/detail/060...
673,674,https://pta.trunojoyo.ac.id/c_search/byprod/9/15,https://pta.trunojoyo.ac.id/welcome/detail/080...


## 2. Crawling Berita

### Fungsi

In [19]:
# Fungsi progress bar manual
def print_progress(kategori, current_page, total_pages):
    """Redraw the in-place progress line for one news category.

    Writes a carriage-return-prefixed line with the category, the page
    counter, a 20-cell bar and the percentage to ``sys.stdout``. Two newlines
    are appended once ``current_page`` equals ``total_pages``.
    """
    percent = (current_page / total_pages) * 100
    width = 20
    filled = int(width * current_page // total_pages)
    bar = "".join(('█' * filled, '-' * (width - filled)))

    stream = sys.stdout
    stream.write(f'\r{kategori} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    stream.flush()
    if current_page != total_pages:
        return
    stream.write('\n\n')

In [None]:
# Fungsi untuk ambil konten artikel
def get_article_content(url, timeout=30):
    """Fetch an article page and return its paragraph text as one string.

    Collects the stripped text of every ``<p>`` inside each
    ``div#content-wrapper``, drops empty paragraphs and those starting with
    "baca juga" (case-insensitive), and joins the rest with single spaces.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The joined paragraph text; an empty string when nothing matches.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")

    # NOTE(review): the recorded run in this notebook shows an empty body for
    # every article, so this selector may not match the site's current markup
    # (or the text may be rendered client-side) - verify against a live page.
    paragraphs = []
    for div in soup.find_all("div", id="content-wrapper"):
        for p in div.find_all("p"):
            text = p.get_text(strip=True)
            # Skip empty paragraphs and "baca juga" cross-link teasers.
            if text and not text.lower().startswith("baca juga"):
                paragraphs.append(text)
    return " ".join(paragraphs)

### Fungsi Crawling Berita

In [20]:
def berita(categories, pages_per_category=1, timeout=30):
    """Crawl Tempo index pages and collect id, title, body and category.

    For each category slug the first ``pages_per_category`` index pages are
    fetched; every article link found under ``figure figcaption a`` is
    followed with ``get_article_content``. The result is written to
    ``tempo_berita.csv`` (UTF-8 with BOM).

    Fetching an article body is best effort: on any error the row is kept
    with an empty body. Index pages that cannot be fetched are skipped and
    counted.

    Args:
        categories: Iterable of rubric slugs used in the index URL.
        pages_per_category: Number of index pages to crawl per category.
        timeout: Seconds to wait for each index-page response.

    Returns:
        pandas.DataFrame with columns ``id_berita``, ``judul_berita``,
        ``isi_berita`` and ``kategori_berita``.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    start_time = time.time()  # start the execution timer

    BASE_URL = "https://www.tempo.co/indeks?page={}&category=rubrik&rubric_slug={}"

    data = {
        "id_berita": [],
        "judul_berita": [],
        "isi_berita": [],
        "kategori_berita": []
    }

    failed_pages = 0  # index pages that could not be fetched

    for cat in categories:
        for page in range(1, pages_per_category + 1):
            url = BASE_URL.format(page, cat)
            try:
                r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
            except requests.RequestException:
                failed_pages += 1
                print_progress(cat, page, pages_per_category)
                continue
            soup = BeautifulSoup(r.text, "html.parser")

            for a in soup.select("figure figcaption a"):
                href = a.get("href")
                if not href:
                    continue
                # urljoin handles both site-relative and absolute hrefs.
                link = urljoin("https://www.tempo.co", href)
                title = a.get_text(strip=True)

                # The article id is the trailing number of the URL slug.
                id_match = re.search(r"-(\d+)$", link)
                berita_id = id_match.group(1) if id_match else None

                try:
                    content = get_article_content(link)
                except Exception:
                    # Best effort: keep the row with an empty body. Catching
                    # Exception rather than using a bare except lets
                    # KeyboardInterrupt still stop the crawl.
                    content = ""

                data["id_berita"].append(berita_id)
                data["judul_berita"].append(title)
                data["isi_berita"].append(content)
                data["kategori_berita"].append(cat)

            print_progress(cat, page, pages_per_category)

    df = pd.DataFrame(data)
    df.to_csv("tempo_berita.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary.
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")
    if failed_pages:
        print(f"⚠️ Halaman indeks yang gagal diambil: {failed_pages}")

    return df

# Rubric slugs to crawl; each one is substituted into the index URL.
categories = ["politik", "hukum", "ekonomi", "hiburan", "internasional", "otomotif", "olahraga"]

In [21]:
# Crawl 5 index pages per category; the result is also written to tempo_berita.csv.
berita(categories, pages_per_category=5)

politik - Page 5/5 [████████████████████] 100.00%

hukum - Page 5/5 [████████████████████] 100.00%

ekonomi - Page 5/5 [████████████████████] 100.00%

hiburan - Page 5/5 [████████████████████] 100.00%

internasional - Page 5/5 [████████████████████] 100.00%

otomotif - Page 5/5 [████████████████████] 100.00%

olahraga - Page 5/5 [████████████████████] 100.00%


✅ Seluruh data berhasil dikumpulkan!
📊 Total entri: 700
⏱️ Waktu eksekusi: 0 jam 0 menit 51 detik


Unnamed: 0,id_berita,judul_berita,isi_berita,kategori_berita
0,2068663,Kronologi Mundurnya Rahayu Saraswati dari DPR,,politik
1,2068660,Kata Wapres Gibran soal Reshuffle Kabinet,,politik
2,2068655,Kata Staf Khusus Gubernur Jakarta soal Tanggul...,,politik
3,2068650,Empat Nama Mencuat dalam Bursa Calon Sekda Kab...,,politik
4,2068644,Top 3 Nasional: Klarifikasi Purbaya hingga Rah...,,politik
...,...,...,...,...
695,2063170,Carlos Alcaraz Berharap Bertemu Sinner di Fina...,,olahraga
696,2063140,Perjalanan Janice Tjen yang Mencetak Sejarah d...,,olahraga
697,2063016,Jadwal Kejuaraan Dunia Bulu Tangkis 2025: 8 Wa...,,olahraga
698,2062822,"Profil Janice Tjen, Petenis Indonesia yang Tem...",,olahraga


## Page & Link Keluar Berita

In [22]:
def berita_links(categories, pages_per_category=1, timeout=30):
    """Collect article links from Tempo index pages.

    For each category slug the first ``pages_per_category`` index pages are
    fetched; every link under ``figure figcaption a`` is stored with its
    article id and the index-page URL it came from. The result is written to
    ``tempo_links.csv`` (UTF-8 with BOM).

    Index pages that cannot be fetched are skipped and counted.

    Args:
        categories: Iterable of rubric slugs used in the index URL.
        pages_per_category: Number of index pages to crawl per category.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        pandas.DataFrame with columns ``id_berita``, ``page`` and
        ``link_keluar``.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    start_time = time.time()  # start the execution timer

    BASE_URL = "https://www.tempo.co/indeks?page={}&category=rubrik&rubric_slug={}"

    data = {
        "id_berita": [],
        "page": [],
        "link_keluar": []
    }

    failed_pages = 0  # index pages that could not be fetched

    for cat in categories:
        for page in range(1, pages_per_category + 1):
            url = BASE_URL.format(page, cat)
            try:
                r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
            except requests.RequestException:
                failed_pages += 1
                print_progress(cat, page, pages_per_category)
                continue
            soup = BeautifulSoup(r.text, "html.parser")

            for a in soup.select("figure figcaption a"):
                href = a.get("href")
                if not href:
                    continue
                # urljoin handles both site-relative and absolute hrefs.
                link = urljoin("https://www.tempo.co", href)

                # The article id is the trailing number of the URL slug.
                id_match = re.search(r"-(\d+)$", link)
                berita_id = id_match.group(1) if id_match else None

                data["id_berita"].append(berita_id)
                data["page"].append(url)          # index page URL
                data["link_keluar"].append(link)  # article URL

            # Update the progress bar.
            print_progress(cat, page, pages_per_category)

    df = pd.DataFrame(data)
    df.to_csv("tempo_links.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # Summary.
    print("\n✅ Seluruh link berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")
    if failed_pages:
        print(f"⚠️ Halaman indeks yang gagal diambil: {failed_pages}")

    return df

# Rubric slugs to crawl; each one is substituted into the index URL.
categories = ["politik", "hukum", "ekonomi", "hiburan", "internasional", "otomotif", "olahraga"]

In [23]:
# Collect links from 5 index pages per category; also written to tempo_links.csv.
berita_links(categories, pages_per_category=5)

politik - Page 5/5 [████████████████████] 100.00%

hukum - Page 5/5 [████████████████████] 100.00%

ekonomi - Page 5/5 [████████████████████] 100.00%

hiburan - Page 5/5 [████████████████████] 100.00%

internasional - Page 5/5 [████████████████████] 100.00%

otomotif - Page 5/5 [████████████████████] 100.00%

olahraga - Page 5/5 [████████████████████] 100.00%


✅ Seluruh link berhasil dikumpulkan!
📊 Total entri: 700
⏱️ Waktu eksekusi: 0 jam 0 menit 40 detik


Unnamed: 0,id_berita,page,link_keluar
0,2068663,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/kronologi-mundurn...
1,2068660,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/kata-wapres-gibra...
2,2068655,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/kata-staf-khusus-...
3,2068650,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/empat-nama-mencua...
4,2068644,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/top-3-nasional-kl...
...,...,...,...
695,2063170,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/olahraga/carlos-alcaraz-b...
696,2063140,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/olahraga/perjalanan-janic...
697,2063016,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/olahraga/jadwal-kejuaraan...
698,2062822,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/olahraga/profil-janice-tj...
