In [1]:
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError, Playwright

from IPython.display import clear_output
from queue import Queue
import time
import pandas
import os.path
import random

# Global variables
# Search-results URL the scraper opens first. Its page=39 parameter is one more
# than total_extracted_pages below — presumably the two must be kept in sync
# when resuming a run; confirm before editing either by hand.
url = 'https://www.detik.com/search/searchnews?query=jokowi&page=39&result_type=relevansi&fromdatex=20/10/2014&todatex=20/10/2024'
# Count of articles successfully extracted on the result page being scraped.
total_extracted_articles_in_current_page = 0
# Count of result pages already processed; log messages report the page being
# scraped as this value + 1.
total_extracted_pages = 38

async def get_all_article_selectors_in_current_page(page):
    """Collect the article locators of the current search-result page.

    Args:
        page: Playwright page showing a search-result listing.

    Returns:
        A ``Queue`` holding one locator per ``article`` element found inside
        the ``list-content`` container, in document order. If the lookup
        raises, the error is printed and the queue is returned empty.
    """
    pending_articles = Queue()
    listing_xpath = 'xpath=//div[contains(@class, "list-content")]/article[contains(@class, "list-content__item")]'

    try:
        # The awaited call resolves every match first, so a failure here
        # leaves the queue untouched.
        for article_locator in await page.locator(listing_xpath).all():
            pending_articles.put(article_locator)
    except Exception as e:
        print(f'Error occurred while get all article selectors in page {total_extracted_pages + 1}: {e}')

    return pending_articles

async def scroll_to_and_click_article(article_selector):
    """Scroll the given article locator into view (if needed) and click it.

    Args:
        article_selector: object exposing awaitable
            ``scroll_into_view_if_needed()`` and ``click()`` methods; the
            caller in this file passes locators taken from the result listing.
    """
    await article_selector.scroll_into_view_if_needed()
    await article_selector.click()
    
async def extract_article_data(page):
    """Extract the fields of the article currently open in *page*, then go back.

    Args:
        page: Playwright page that has just navigated to an article.

    Returns:
        A dict with the keys ``article_title``, ``article_author``,
        ``article_publication_date`` and ``article_content``. Fields that
        could not be read before an error occurred stay as empty strings;
        the error itself is printed, not raised.

    Raises:
        Whatever ``page.go_back()`` raises (e.g. a Playwright timeout); the
        navigation back is deliberately outside the ``try`` so the caller can
        react to a page that is stuck.

    Side effects:
        Increments the global ``total_extracted_articles_in_current_page``
        when every field was read successfully.
    """
    global total_extracted_articles_in_current_page

    article_data = {
        'article_title': '',
        'article_author': '',
        'article_publication_date': '',
        'article_content': '',
    }

    try:
        await page.wait_for_load_state()

        title_locator = page.locator('xpath=//h1[contains(@class, "detail__title")]').first
        author_locator = page.locator('xpath=//div[@class="detail__author"]|//div[@class="detail"]/div[2]').first
        publication_date_locator = page.locator('xpath=//div[contains(@class, "detail__date")]').first
        paragraph_locators = await page.locator('xpath=//div[contains(@class, "detail__body-text")]/p').all()

        # One line per paragraph, with newlines inside a paragraph removed.
        paragraphs = [
            (await paragraph_locator.inner_text()).replace('\n', '').strip()
            for paragraph_locator in paragraph_locators
        ]

        article_data['article_title'] = (await title_locator.inner_text()).strip()
        article_data['article_author'] = (await author_locator.inner_text()).strip()
        article_data['article_publication_date'] = (await publication_date_locator.inner_text()).strip()
        # Strip so leading/trailing empty paragraphs do not leave blank lines.
        article_data['article_content'] = '\n'.join(paragraphs).strip()

        total_extracted_articles_in_current_page += 1
    except Exception as e:
        print(f'Error occurred while get article data at page {total_extracted_pages + 1} in article number {total_extracted_articles_in_current_page + 1}: {e}')

    # Pause 5-10 seconds, then return to the result listing. asyncio.sleep is
    # used instead of time.sleep so the event loop that Playwright's async API
    # depends on keeps running during the pause.
    await asyncio.sleep(random.randint(5, 10))
    await page.go_back()

    return article_data

def store_extracted_article_data_to_csv(article_data):
    """Append one article record to ``detikcom_unprocessed_news_data.csv``.

    The file is tab-separated and UTF-8 encoded. It is created with a header
    row when it does not exist yet; otherwise the record is appended without
    a header. Errors raised while checking for or writing the file are
    printed, not raised.

    Args:
        article_data: mapping of column name to value for a single row.
    """
    frame = pandas.DataFrame([article_data])

    try:
        file_exists = os.path.isfile('./detikcom_unprocessed_news_data.csv')
        # Existing file: append without repeating the header.
        # New file: default write mode, header included.
        write_options = {'header': False, 'mode': 'a'} if file_exists else {'header': True}
        frame.to_csv(
            'detikcom_unprocessed_news_data.csv',
            sep='\t',
            encoding='utf-8',
            index=False,
            **write_options,
        )
    except Exception as e:
        print(f'Error occured while saved article data to csv at page {total_extracted_pages + 1} in article number {total_extracted_articles_in_current_page + 1}: {e}')

def _write_scraping_summary():
    """Write the current progress counters to detikcom_scraping_summary.json."""
    pandas.DataFrame([{
        'total_extracted_articles_in_current_page': total_extracted_articles_in_current_page,
        'total_extracted_pages': total_extracted_pages,
    }]).to_json('detikcom_scraping_summary.json', orient='records', lines=True)


async def _scrape_search_results(playwright):
    """Run one browser session starting at the global ``url``.

    Returns when ``total_extracted_pages`` exceeds 1000; any exception raised
    while scraping propagates to the caller after the browser is closed.
    """
    global total_extracted_articles_in_current_page, total_extracted_pages

    # Pass headless=False to launch() to watch the browser while debugging.
    browser = await playwright.firefox.launch(slow_mo=100)
    try:
        page = await browser.new_page()
        await page.goto(url)
        await page.wait_for_load_state()
        await asyncio.sleep(random.randint(2, 5))

        article_selector_queues = await get_all_article_selectors_in_current_page(page)

        while total_extracted_pages <= 1000:
            print(f'Scraping article number {total_extracted_articles_in_current_page + 1} at page {total_extracted_pages + 1}')
            await page.wait_for_load_state()

            if not article_selector_queues.empty():
                await scroll_to_and_click_article(article_selector_queues.get())
                await asyncio.sleep(random.randint(2, 5))

                article_data = await extract_article_data(page)
                wait_time_after_done_scraping_article = random.randint(10, 15)
                print(f'Done scraping article and waiting for {wait_time_after_done_scraping_article} seconds')
                await asyncio.sleep(wait_time_after_done_scraping_article)

                store_extracted_article_data_to_csv(article_data)
                await asyncio.sleep(random.randint(2, 5))
            else:
                clear_output(wait=True)
                wait_time_after_done_scraping_all_articles_at_page = random.randint(20, 30)
                print(f'Done scraping all articles at page {total_extracted_pages + 1}\nWaiting for {wait_time_after_done_scraping_all_articles_at_page} seconds')
                await asyncio.sleep(wait_time_after_done_scraping_all_articles_at_page)

                # The last pagination link is used as the "next page" button.
                new_page_button_html_selector = page.locator(
                    'xpath=//div[contains(@class, "pagination")]/a[contains(@class, "pagination__item")]').last
                await new_page_button_html_selector.scroll_into_view_if_needed()
                await new_page_button_html_selector.click()
                await page.wait_for_load_state()

                article_selector_queues = await get_all_article_selectors_in_current_page(page)

                total_extracted_articles_in_current_page = 0
                total_extracted_pages += 1
    finally:
        # Close while the Playwright connection is still open; the caller's
        # ``async with`` tears the connection down right after this returns.
        await browser.close()


async def main():
    """Scrape search-result pages until page 1000, restarting after failures.

    Each attempt opens a fresh Playwright/Firefox session at the global
    ``url``. When an attempt raises, the progress summary is written, the
    scraper waits 20-30 seconds, skips the page that failed, and starts a new
    session. The summary file is always written once more when the function
    exits, including on cancellation.

    Side effects:
        Updates the globals ``total_extracted_articles_in_current_page``,
        ``total_extracted_pages`` and ``url``; writes
        ``detikcom_scraping_summary.json``; the scraping helpers it calls
        append to the article CSV.
    """
    global total_extracted_articles_in_current_page, total_extracted_pages, url

    try:
        # A loop instead of a recursive ``await main()`` keeps the call stack
        # flat no matter how many restarts happen.
        while total_extracted_pages <= 1000:
            try:
                async with async_playwright() as p:
                    await _scrape_search_results(p)
            except Exception as e:
                print(f'Error occur when scraping and please see scraping_summary.json file: {e}')
                _write_scraping_summary()

                total_extracted_articles_in_current_page = 0

                wait_time_before_restart = random.randint(20, 30)
                print(f'Restarting scraping in {wait_time_before_restart} seconds')
                await asyncio.sleep(wait_time_before_restart)

                # Skip the page that failed. The page being scraped is always
                # total_extracted_pages + 1 (as in the log messages), so the
                # restart URL must use that value as well; otherwise the
                # failed page is reloaded while the logs report the next one.
                total_extracted_pages += 1
                url = f'https://www.detik.com/search/searchnews?query=jokowi&page={total_extracted_pages + 1}&result_type=relevansi&fromdatex=20/10/2014&todatex=20/10/2024'
    finally:
        _write_scraping_summary()
        print('Scraping stopped and please see scraping_summary.json file')
        
if __name__ == "__main__":
    # NOTE: top-level ``await`` is only accepted by hosts that run cells inside
    # an event loop (this file looks like an exported Jupyter/IPython cell).
    # A plain ``python file.py`` run would reject it as a syntax error.
    await main()

Done scraping all articles at page 98
Waiting for 23 seconds
Scraping article number 1 at page 99
Done scraping article and waiting for 15 seconds
Scraping article number 2 at page 99
Done scraping article and waiting for 15 seconds
Scraping article number 3 at page 99
Error occur when scraping and please see scraping_summary.json file: Page.go_back: Timeout 30000ms exceeded.
Call log:
  - waiting for navigation until "load"

Restarting scraping in 24 seconds
Scraping article number 1 at page 100
Done scraping article and waiting for 13 seconds
Scraping article number 2 at page 100
Done scraping article and waiting for 12 seconds
Scraping article number 3 at page 100
Done scraping article and waiting for 11 seconds
Scraping article number 4 at page 100
Done scraping article and waiting for 15 seconds
Scraping article number 5 at page 100
Error occur when scraping and please see scraping_summary.json file: Page.go_back: Timeout 30000ms exceeded.
Call log:
  - waiting for navigation unti

CancelledError: 