In [1]:
from modules.scrapper import Scrapper
from modules.filter import Filter
from tqdm import tqdm
import pandas as pd
import threading
import time
from typing import Any
from tqdm import tqdm
from threading import Thread


In [2]:
# Mutex held by worker threads while they extend a shared accumulator dict.
lock = threading.Lock()

# Shared instances. `filter` is handed to every worker thread further down;
# note that this name shadows Python's built-in `filter` within the notebook.
scrapper = Scrapper()
filter = Filter()

In [3]:
# Index pages to crawl on liputan6, grouped by category label.
_LIPUTAN6_LINKS = {
    "non-sepakbola": [
        "https://www.liputan6.com/tag/motogp/text",
        "https://www.liputan6.com/tag/badminton/text",
    ],
    "liga-inggris": ["https://www.liputan6.com/tag/liga-inggris/text"],
    "liga-indonesia": ["https://www.liputan6.com/tag/liga-indonesia/text"],
    "liga-spanyol": ["https://www.liputan6.com/tag/liga-spanyol/text"],
    "liga-italia": ["https://www.liputan6.com/tag/liga-italia/text"],
}

# Index pages to crawl on detik, grouped by the same category labels.
_DETIK_LINKS = {
    "non-sepakbola": ["https://sport.detik.com/sport-lain/indeks"],
    "liga-inggris": ["https://sport.detik.com/sepakbola/liga-inggris/indeks"],
    "liga-indonesia": ["https://sport.detik.com/sepakbola/liga-indonesia/indeks"],
    "liga-spanyol": ["https://sport.detik.com/sepakbola/liga-spanyol/indeks"],
    "liga-italia": ["https://sport.detik.com/sepakbola/liga-italia/indeks"],
}

# Mapping: news source -> category label -> list of index-page URLs.
LINKS = {
    "liputan6": _LIPUTAN6_LINKS,
    "detik": _DETIK_LINKS,
}

In [None]:
def scrap_detik_worker(acc: dict[str, Any], url: str, label: str, news: str, filter: Filter = Filter()):
    """Scrape one detik index page and append its articles to ``acc``.

    The index page at ``url`` is fetched and passed to ``filter.detik_index``
    to obtain article links. Every link is then fetched in its own thread and
    passed through ``filter.detik_article``. Successful results are appended
    to ``acc`` in a single batch while holding the module-level ``lock``.

    Args:
        acc: Accumulator with list values under the keys ``'text'``,
            ``'link'``, ``'news'`` and ``'label'``; extended in place.
        url: Index page URL to scrape.
        label: Category label recorded for every article from this page.
        news: News-source name recorded for every article from this page.
        filter: Object providing ``detik_index`` and ``detik_article``. The
            default instance is created once, when this function is defined.

    Returns:
        None. If the index page cannot be scraped, a message is printed and
        ``acc`` is left untouched.
    """
    scrapper = Scrapper()

    try:
        page = scrapper.scrap_url(url)
        links = filter.detik_index(page)
    except Exception as e:
        print(f"Failed to scrape index {url}: {e}")
        return

    max_attempts = 3

    # Each scraped article is stored as ONE (text, link) tuple. Appending to
    # two separate lists from concurrent threads can interleave between the
    # two appends and pair a text with another article's link.
    results = []

    def scrap_link(link):
        """Fetch and parse a single article, retrying on any exception."""
        for attempt in range(1, max_attempts + 1):
            try:
                text = filter.detik_article(scrapper.scrap_url(link))
            except Exception:
                # Pause only when another attempt follows; sleeping after the
                # final failure would just delay the error report.
                if attempt < max_attempts:
                    time.sleep(1)
                continue
            results.append((text, link))
            return
        print(f"Failed to scrape article {link} after {max_attempts} attempts.")

    threads = []
    # The progress bar advances as threads are started, not as they finish.
    for link in tqdm(links, desc=f"Scraping {label}"):
        t = threading.Thread(target=scrap_link, args=(link,))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    num_new_results = len(results)

    if num_new_results > 0:
        # All four columns are extended under the lock so rows contributed by
        # different workers stay aligned across the columns.
        with lock:
            acc['text'].extend(text for text, _ in results)
            acc['link'].extend(link for _, link in results)
            acc['news'].extend([news] * num_new_results)
            acc['label'].extend([label] * num_new_results)

In [None]:
def scrap_liputan6_worker(acc: dict[str, Any], url: str, label: str, news: str, filter: Filter = Filter()):
    """Scrape one liputan6 index page and append its articles to ``acc``.

    The index page at ``url`` is fetched and passed to
    ``filter.liputan6_index`` to obtain article links. Every link is then
    fetched in its own thread and passed through ``filter.liputan6_article``.
    Successful results are appended to ``acc`` in a single batch while holding
    the module-level ``lock``.

    Args:
        acc: Accumulator with list values under the keys ``'text'``,
            ``'link'``, ``'news'`` and ``'label'``; extended in place.
        url: Index page URL to scrape.
        label: Category label recorded for every article from this page.
        news: News-source name recorded for every article from this page.
        filter: Object providing ``liputan6_index`` and ``liputan6_article``.
            The default instance is created once, when this function is
            defined.

    Returns:
        None. If the index page cannot be scraped, a message is printed and
        ``acc`` is left untouched.
    """
    scrapper = Scrapper()

    try:
        page = scrapper.scrap_url(url)
        links = filter.liputan6_index(page)
    except Exception as e:
        print(f"Failed to scrape index {url}: {e}")
        return

    max_attempts = 3

    # Each scraped article is stored as ONE (text, link) tuple. Appending to
    # two separate lists from concurrent threads can interleave between the
    # two appends and pair a text with another article's link.
    results = []

    def scrap_link(link):
        """Fetch and parse a single article, retrying on any exception."""
        for attempt in range(1, max_attempts + 1):
            try:
                text = filter.liputan6_article(scrapper.scrap_url(link))
            except Exception:
                # Pause only when another attempt follows; sleeping after the
                # final failure would just delay the error report.
                if attempt < max_attempts:
                    time.sleep(1)
                continue
            results.append((text, link))
            return
        print(f"Failed to scrape article {link} after {max_attempts} attempts.")

    threads = []
    # The progress bar advances as threads are started, not as they finish.
    for link in tqdm(links, desc=f"Scraping {label}"):
        t = threading.Thread(target=scrap_link, args=(link,))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    num_new_results = len(results)

    if num_new_results > 0:
        # All four columns are extended under the lock so rows contributed by
        # different workers stay aligned across the columns.
        with lock:
            acc['text'].extend(text for text, _ in results)
            acc['link'].extend(link for _, link in results)
            acc['news'].extend([news] * num_new_results)
            acc['label'].extend([label] * num_new_results)

In [None]:
# Accumulator shared by all detik worker threads: one list per output column.
acc_detik = {'text': [], 'link': [], 'news': [], 'label': []}

news = 'detik'

threads: list[Thread] = []

# Start one worker thread per (label, index URL, page number) for pages 1-8.
for label, urls in LINKS[news].items():
    print(label)

    for url in urls:
        for i in range(1, 9):
            t = Thread(
                target=scrap_detik_worker,
                args=(acc_detik, f"{url}?page={i}", label, news, filter),
            )
            t.start()
            threads.append(t)

# Wait for every page worker to finish before the accumulator is read.
for t in threads:
    t.join()


non-sepakbola
liga-inggris
liga-indonesia
liga-spanyol
liga-italia



Scraping non-sepakbola:   0%|          | 0/40 [00:00<?, ?it/s]
[A

Scraping non-sepakbola:   5%|▌         | 2/40 [00:01<00:03, 10.46it/s]
Scraping non-sepakbola:  10%|█         | 4/40 [00:01<00:14,  2.41it/s]


[A[A[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A

[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A
[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















Scraping non-sepakbola:  12%|█▎        | 5/40 [00:01<00:13,  2.65it/s]


[A[A[A





[A[A[A[A[A[A







[A[A[A[A[A[A[A[A




[A[A[A[A[

In [7]:
# Build the detik DataFrame and save it to CSV first, then preview it.
# A notebook cell only renders its final expression, so `df.head()` has to
# come last; placed before `to_csv` its result was silently discarded.
df = pd.DataFrame(acc_detik)
df.to_csv('./detik-data.csv')
df.head()

In [8]:
# Accumulator shared by all liputan6 worker threads: one list per output column.
acc_liputan6 = {'text': [], 'link': [], 'news': [], 'label': []}

news = 'liputan6'

threads: list[Thread] = []

# Start one worker thread per (label, index URL, page number) for pages 1-8.
for label, urls in LINKS[news].items():
    print(label)

    for url in urls:
        for i in range(1, 9):
            t = Thread(
                target=scrap_liputan6_worker,
                args=(acc_liputan6, f"{url}?page={i}", label, news, filter),
            )
            t.start()
            threads.append(t)

# Wait for every page worker to finish before the accumulator is read.
for t in threads:
    t.join()


non-sepakbola
liga-inggris
liga-indonesia
liga-spanyol
liga-italia


Scraping non-sepakbola:  71%|███████▏  | 15/21 [00:00<00:00, 123.72it/s]
Scraping non-sepakbola: 100%|██████████| 21/21 [00:03<00:00,  6.95it/s] 
Scraping non-sepakbola:   0%|          | 0/21 [00:00<?, ?it/s]

[A[A





[A[A[A[A[A[A



[A[A[A[A




[A[A[A[A[A


[A[A[A






[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
[A

Scraping non-sepakbola:   5%|▍         | 1/21 [00:00<00:02,  7.11it/s]











[A[A[A[A[A[A[A[A

Failed to scrape article https://www.liputan6.com/news/read/5409214/satgas-anti-mafia-bola-bongkar-praktik-pengaturan-skor-liga-2-empat-wasit-jadi-tersangka after 3 attempts.


[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A




[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A





[A[A[A[A[A[A


[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A

Scraping non-sepakbola:  52%|█████▏    | 11/21 [01:00<01:01,  6.18s/it]
[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A



[A[A[A[A








[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A

In [9]:
# Build the liputan6 DataFrame and save it to CSV first, then preview it.
# A notebook cell only renders its final expression, so `df.head()` has to
# come last; placed before `to_csv` its result was silently discarded.
df = pd.DataFrame(acc_liputan6)
df.to_csv('./liputan6-data.csv')
df.head()