In [8]:
import requests
from bs4 import BeautifulSoup
import csv
import random
import time


# Search-results endpoints. The path segment is the double-URL-encoded
# phrase "crude oil price" (%2522 -> %22 -> quote, %2520 -> %20 -> space).
# The first results page takes no query string; later pages append ?page=N.
BASE_URL = "https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522"
PAGE_URL = BASE_URL + "?page={}"

# Pool of browser User-Agent strings to rotate through
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
    # Chrome on Linux
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
    # Safari on iPhone
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
]


# Function to randomize headers
def get_random_headers():
    """Build request headers with a User-Agent picked at random from USER_AGENTS.

    Returns:
        dict: Header names mapped to values, suitable for the ``headers``
        argument of ``requests.get``.
    """
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        # Accept-Encoding is deliberately left unset so that requests
        # advertises only the encodings it can actually decode. Hard-coding
        # "br" produces undecodable response bodies when no Brotli library
        # is installed in the kernel's environment.
        "Connection": "keep-alive",
    }

In [9]:
# Function to scrape a single page
def scrape_page(url, writer, timeout=30):
    """Fetch one search-results page and write a row per article found.

    Args:
        url: Address of the results page to download.
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``); it
            receives one ``[title, date]`` list per article.
        timeout: Seconds to wait on the server before abandoning the request.

    Returns:
        int: Number of rows written for this page. 0 when the request fails,
        the status code is not 200, or no articles are found.
    """
    print(f"Scraping: {url}")
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=get_random_headers(), timeout=timeout)
    except requests.RequestException as e:
        # One unreachable page should not abort the remaining pages.
        print(f"Failed to fetch {url}. Error: {e}")
        return 0

    if response.status_code != 200:
        print(f"Failed to fetch {url}. Status code: {response.status_code}")
        return 0

    soup = BeautifulSoup(response.text, "html.parser")
    rows_written = 0

    # Each search hit is rendered inside a <div class="article-item">
    for article in soup.find_all("div", class_="article-item"):
        try:
            # Guard the wrapper div as well as the link: find() returns None
            # when the element is absent, and chaining .find() on None would
            # raise instead of falling back to "No title".
            title_div = article.find("div", class_="article-item-title")
            title_tag = title_div.find("a") if title_div else None
            title = title_tag.text.strip() if title_tag else "No title"

            # Extract the date
            date_tag = article.find("time")
            date = date_tag.text.strip() if date_tag else "No date"

            # Write to CSV
            writer.writerow([title, date])
            rows_written += 1
        except Exception as e:
            # Best effort: report the problem and continue with the next article.
            print(f"Error extracting data for an article: {e}")

    return rows_written

# Every results page to visit: the first page has no `page` parameter,
# followed by ?page=1 through ?page=22.
page_urls = [BASE_URL] + [PAGE_URL.format(page) for page in range(1, 23)]

# Open a CSV file to save the results
with open("arabnews_articles.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Headline", "Date"])  # Write the header row

    for index, page_url in enumerate(page_urls):
        if index:
            # Pause between consecutive requests (including the first and
            # second) to avoid overwhelming the server; no pause is needed
            # before the first request or after the last one.
            time.sleep(random.uniform(2, 5))
        scrape_page(page_url, writer)

print("Scraping completed. Data saved to 'arabnews_articles.csv'.")

Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=1
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=2
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=3
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=4
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=5
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=6
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=7
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=8
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=9
Scraping: https://www.arabnews.com/search/site/%2522crude%2520oil%2520price%2522?page=10
Scraping: https://www.arabnews.com/sea