In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

# Scraper settings
# Phrase identifying the Kompas tag to scrape; edit it to target another tag.
search_terms = "perubahan iklim"

# Tag listing URL: every space in the phrase becomes a "+".
base_url = 'https://www.kompas.com/tag/' + '+'.join(search_terms.split(' '))

# Request headers sent with every download; the User-Agent identifies the
# bot and gives a contact address.
headers = {
    'User-Agent': (
        'ResearchScraper (taufik.impact@gmail.com): '
        'This bot is conducting research on climate change in Indonesia '
        'for non-profit academic purposes.'
    ),
}

# Paging window for one run.
start_page = 1            # first listing page to fetch
total_pages = 161         # number of listing pages available for the tag
max_pages_per_run = 50    # upper limit on pages fetched by a single run

# Function to scrape the content of an article
def scrape_article_content(url, timeout=30):
    """Download an article page and return the text of its body paragraphs.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        The text of every ``<p>`` inside the ``div.read__content`` element,
        joined with single spaces; the string "Article content not found"
        when that element is absent; ``np.nan`` when downloading or parsing
        raises (the error is printed, not propagated).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Treat HTTP error statuses as failures
        soup = BeautifulSoup(response.content, 'html.parser')
        article_content_div = soup.find('div', class_='read__content')
        if article_content_div is None:
            return "Article content not found"
        article_paragraphs = article_content_div.find_all('p')
        return ' '.join(
            paragraph.get_text(strip=True) for paragraph in article_paragraphs
        )
    except Exception as e:
        # Deliberately broad: one bad article must not abort the whole run,
        # so any failure is reported and recorded as a missing value.
        print(f"Error scraping {url}: {e}")
        return np.nan

# Function to scrape a single page
def scrape_page(page_url, timeout=30):
    """Scrape one listing page together with the articles it links to.

    Args:
        page_url: Address of the listing page to fetch.
        timeout: Seconds to wait for the listing page before giving up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A list with one dict per ``div.article__list`` element, keyed by
        'title', 'link', 'date', 'segment' and 'content'. Missing elements
        are replaced by placeholder strings; an item without a usable link
        gets ``np.nan`` as content; an item whose processing raises is
        recorded as a row of ``np.nan`` values. The list is empty when the
        page has no such elements or when fetching the page fails.
    """
    news_data = []
    try:
        response = requests.get(page_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        news_items = soup.find_all('div', class_='article__list')

        if not news_items:
            print("No news articles found on the page.")
            return news_data  # Returns an empty list if no articles are found

        for item in news_items:
            try:
                title_element = item.find('a', class_='article__link')
                date_element = item.find('div', class_='article__date')
                segment_element = item.find('div', class_='article__subtitle')

                title = title_element.text.strip() if title_element else 'Title not found'
                # .get() avoids a KeyError on an anchor without href, which
                # would otherwise discard the title/date already extracted.
                href = title_element.get('href') if title_element else None
                date = date_element.text.strip() if date_element else 'Date not found'
                segment = segment_element.text.strip() if segment_element else 'N/A'

                if href:
                    link = href
                    article_content = scrape_article_content(link)
                else:
                    # No URL to follow: skip the request that is guaranteed
                    # to fail and record the content as missing directly.
                    link = 'Link not found'
                    article_content = np.nan

                news_data.append({'title': title, 'link': link, 'date': date, 'segment': segment, 'content': article_content})

            except Exception as e:
                # Keep a placeholder row so one broken item does not abort
                # the rest of the page.
                print(f"Error processing news item: {e}")
                news_data.append({'title': np.nan, 'link': np.nan, 'date': np.nan, 'segment': np.nan, 'content': np.nan})

            time.sleep(1)  # Throttle requests to be polite to the server
    except Exception as e:
        print(f"Error scraping page: {e}")

    return news_data

# Function to scrape multiple pages within a range
def scrape_pages_in_range(start_page, max_pages_per_run, last_page=None):
    """Scrape consecutive listing pages and collect their article records.

    Args:
        start_page: Number of the first listing page to fetch.
        max_pages_per_run: Maximum number of pages to fetch in this call.
        last_page: Highest page number that may be fetched. Defaults to the
            module-level ``total_pages`` when omitted.

    Returns:
        A flat list of the records returned by ``scrape_page`` for each page
        visited, in page order. Scraping stops at the first page that yields
        no records.
    """
    if last_page is None:
        last_page = total_pages

    # Exclusive upper bound: limited by both the per-run cap and the last page.
    stop_page = min(start_page + max_pages_per_run, last_page + 1)

    all_news_data = []
    for page in range(start_page, stop_page):
        print(f"Scraping page {page}...")
        page_url = f"{base_url}?page={page}"
        page_data = scrape_page(page_url)
        if not page_data:
            # A single empty page ends the run; later pages are assumed to
            # be empty as well.
            print(f"Stopping early: page {page} returned no articles.")
            break
        all_news_data.extend(page_data)
    return all_news_data

# Execute the scraping
news_data = scrape_pages_in_range(start_page, max_pages_per_run)

# Convert the list to a DataFrame
df_kompas = pd.DataFrame(news_data)

# Last page of the requested range, using the same bounds as
# scrape_pages_in_range. If the run stopped early on an empty page, the
# pages actually scraped end before this number.
end_page = min(start_page + max_pages_per_run, total_pages + 1) - 1

# Save the DataFrame to a CSV file
filename = f"kompas_{search_terms.replace(' ', '_')}_{start_page}_{end_page}.csv"
df_kompas.to_csv(filename, index=False)

print(f"Scraped {len(df_kompas)} articles from pages {start_page} to {end_page}.")
print(df_kompas.head())
