In [None]:
# Import necessary libraries
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Configuration
search_terms = "krisis iklim"  # Modify this string to use different search terms
site_id = 2  # Adjust site ID if necessary
sort_by = 'time'  # Can be changed to other sorting parameters like relevance, etc.
max_pages_per_run = 100  # Maximum pages to scrape per run
total_pages = 161  # Total number of pages to scrape
start_page = 100  # Starting page

# Create URL and headers
# Percent-encode the search terms so that spaces and reserved characters
# (&, #, +, %, ...) cannot break or alter the query string. safe='' makes sure
# '/' is encoded too. search_terms itself is left untouched for other uses.
encoded_terms = quote(search_terms, safe='')
# The page number is appended to base_url by the scraping loop.
base_url = (
    'https://www.detik.com/search/searchall'
    f'?query={encoded_terms}&siteid={site_id}&sortby={sort_by}&page='
)
headers = {
    "User-Agent": "ResearchScraper (email): This bot is conducting research on climate change coverage in Indonesia for non-profit academic purposes."
}

# Prepare data collection
data = []  # One [title, link, description, date, article_text] row per article
pages_scraped = 0  # Number of result pages that contained at least one <article>
empty_pages_in_a_row = 0  # Consecutive result pages without any <article>
request_timeout = 30  # Seconds; without a timeout requests.get can block forever

# Last page of this run: at most max_pages_per_run pages starting at
# start_page, and never beyond total_pages.
# NOTE(review): assumes result pages are numbered 1..total_pages - confirm.
end_page = min(start_page + max_pages_per_run - 1, total_pages)

# Scraping loop
for i in range(start_page, end_page + 1):
    if empty_pages_in_a_row >= 3:  # Stop if 3 consecutive empty pages are encountered
        print("No more articles found. Stopping.")
        break

    print(f"Scraping page {i}...")
    url = base_url + str(i)

    try:
        # Fetch the search result page
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract articles
        articles = soup.find_all('article')
        if not articles:
            print(f"No articles on page {i}.")
            empty_pages_in_a_row += 1
            continue

        empty_pages_in_a_row = 0  # Reset the counter if the page has articles
        pages_scraped += 1

        # Process each article. Failures are isolated per article so that one
        # malformed entry or one failed download does not discard the rest of
        # the page.
        for article in articles:
            try:
                title = article.find('h2', class_='title').get_text(strip=True)
                link = article.find('a')['href']
                description = article.find('p').get_text(strip=True)
                date = article.find('span', class_='date').get_text(strip=True)

                # Fetch the full article content
                article_response = requests.get(
                    link, headers=headers, timeout=request_timeout
                )
                article_response.raise_for_status()
            except (AttributeError, KeyError, TypeError) as e:
                # find() returned None for an expected element, or the <a> tag
                # has no href attribute.
                print(f"Skipping malformed article on page {i}: {e}")
                continue
            except requests.RequestException as e:
                print(f"Error while fetching article {link} on page {i}: {e}")
                continue

            article_soup = BeautifulSoup(article_response.content, 'html.parser')
            article_div = article_soup.find('div', class_='detail__body-text itp_bodycontent')
            # "N/A" marks articles whose body container was not found.
            article_text = article_div.get_text(strip=True) if article_div else "N/A"

            # Save the extracted data
            data.append([title, link, description, date, article_text])

    except Exception as e:
        # Best-effort boundary: report the failure and move on to the next page
        # so rows collected so far are not lost.
        print(f"Error while scraping page {i}: {e}")

# Build the result table: one row per scraped article
column_names = ['Title', 'Link', 'Description', 'Date', 'Article']
df = pd.DataFrame(data=data, columns=column_names)
print(df)

# Write the table to a CSV file named after the search terms and page range
terms_slug = search_terms.replace(' ', '_')
# End page as derived from the count of pages that yielded articles
last_page = start_page + pages_scraped - 1
filename = f"Data_{terms_slug}_{start_page}_{last_page}.csv"
df.to_csv(filename, index=False)
print(f"Data saved to {filename}")
