In [65]:
import requests
from bs4 import BeautifulSoup
import csv
import random
import time

# Pool of browser User-Agent strings; one is picked at random for each request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
]

# Listing URL template; "{}" is filled with the page number via str.format.
BASE_URL = "https://www.ft.com/oil?page={}"


def get_random_headers():
    """Build the HTTP request headers for one request.

    Returns:
        dict: "User-Agent" (drawn at random from USER_AGENTS),
        "Accept-Language", "Accept-Encoding" and "Connection".
    """
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        # Only advertise encodings requests can always decode. "br" (Brotli)
        # is decoded only when an optional brotli package is installed; without
        # it response.text would be undecoded bytes and parsing would find nothing.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

In [66]:
# Scrape listing pages 2-59 and write one CSV row (headline, date) per teaser.
MAX_ATTEMPTS = 3  # total tries per page before the page is recorded as failed
RETRYABLE_STATUSES = {429, 500, 502, 503, 504}  # statuses worth another attempt
failed_pages = []  # pages that never returned HTTP 200; reported at the end

# Open a CSV file to store the results
with open("ft_oil_articles.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Headline", "Date"])  # Write header row

    # Loop through pages 2 to 59 (the upper bound of range is exclusive)
    for page in range(2, 60):
        url = BASE_URL.format(page)
        html = None  # stays None if every attempt for this page fails

        for attempt in range(1, MAX_ATTEMPTS + 1):
            headers = get_random_headers()  # new randomized headers per attempt
            print(f"Scraping page {page}: {url} with headers {headers['User-Agent']}")

            try:
                # Send a GET request with randomized headers
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"Error fetching page {page}: {e}")
                retryable = True
            else:
                if response.status_code == 200:
                    html = response.text
                    break
                print(f"Failed to fetch page {page}. Status code: {response.status_code}")
                # Statuses outside this set (e.g. 403/404) are not retried.
                retryable = response.status_code in RETRYABLE_STATUSES

            if not retryable or attempt == MAX_ATTEMPTS:
                break
            time.sleep(5)  # back off before retrying the same page

        if html is None:
            # Record the gap instead of silently dropping the page.
            failed_pages.append(page)
        else:
            soup = BeautifulSoup(html, "html.parser")

            # A CSS selector matches <li> elements carrying both classes in any
            # order; class_="a b" would only match that exact attribute string.
            articles = soup.select("li.o-teaser-collection__item.o-grid-row")
            for article in articles:
                try:
                    # Extract the date
                    date_tag = article.find("time", class_="o-date")
                    date = date_tag.text.strip() if date_tag else "No date"

                    # Extract the headline
                    headline_tag = article.find("a", class_="js-teaser-heading-link")
                    headline = headline_tag.text.strip() if headline_tag else "No headline"

                    # Write to CSV
                    writer.writerow([headline, date])
                except Exception as e:
                    # Best effort: one malformed teaser must not abort the page.
                    print(f"Error extracting data for an article: {e}")

        # Add a random delay between pages to mimic human browsing
        time.sleep(random.uniform(2, 5))

print("Scraping completed. Data saved to 'ft_oil_articles.csv'.")
if failed_pages:
    print(f"Pages that could not be fetched: {failed_pages}")

Scraping page 2: https://www.ft.com/oil?page=2 with headers Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1
Scraping page 3: https://www.ft.com/oil?page=3 with headers Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36
Scraping page 4: https://www.ft.com/oil?page=4 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36
Scraping page 5: https://www.ft.com/oil?page=5 with headers Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36
Scraping page 6: https://www.ft.com/oil?page=6 with headers Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1
Scraping page 7: https://www.ft.com/oil?page=7 with headers Mozilla/5.0 (iPhone; CP

# Scrape part 2

In [74]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

# Listing URL template; "{}" is filled with the page number via str.format.
BASE_URL = "https://www.ft.com/oil?page={}"


def get_random_headers():
    """Return request headers holding a single randomly chosen User-Agent.

    The value is drawn with random.choice from three Chrome 91 strings
    (Windows, macOS and Linux variants).
    """
    agent_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )
    picked_agent = random.choice(agent_pool)
    return {"User-Agent": picked_agent}

In [75]:
# Scrape listing pages 60-131 and write one CSV row (headline, date) per teaser.
MAX_ATTEMPTS = 3  # total tries per page before the page is recorded as failed
RETRYABLE_STATUSES = {429, 500, 502, 503, 504}  # statuses worth another attempt
failed_pages = []  # pages that never returned HTTP 200; reported at the end

# Open a CSV file to store the results
with open("ft_oil_articles_part_2.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Headline", "Date"])  # Write header row

    # Loop through pages 60 to 131 (the upper bound of range is exclusive)
    for page in range(60, 132):
        url = BASE_URL.format(page)
        html = None  # stays None if every attempt for this page fails

        for attempt in range(1, MAX_ATTEMPTS + 1):
            headers = get_random_headers()  # new randomized headers per attempt
            print(f"Scraping page {page}: {url} with headers {headers['User-Agent']}")

            try:
                # Send a GET request with randomized headers
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"Error fetching page {page}: {e}")
                retryable = True
            else:
                if response.status_code == 200:
                    html = response.text
                    break
                print(f"Failed to fetch page {page}. Status code: {response.status_code}")
                # Statuses outside this set (e.g. 403/404) are not retried.
                retryable = response.status_code in RETRYABLE_STATUSES

            if not retryable or attempt == MAX_ATTEMPTS:
                break
            time.sleep(5)  # back off before retrying the same page

        if html is None:
            # Record the gap instead of silently dropping the page.
            failed_pages.append(page)
        else:
            soup = BeautifulSoup(html, "html.parser")

            # A CSS selector matches <li> elements carrying both classes in any
            # order; class_="a b" would only match that exact attribute string.
            articles = soup.select("li.o-teaser-collection__item.o-grid-row")
            for article in articles:
                try:
                    # Extract the date
                    date_tag = article.find("time", class_="o-date")
                    date = date_tag.text.strip() if date_tag else "No date"

                    # Extract the headline
                    headline_tag = article.find("a", class_="js-teaser-heading-link")
                    headline = headline_tag.text.strip() if headline_tag else "No headline"

                    # Write to CSV
                    writer.writerow([headline, date])
                except Exception as e:
                    # Best effort: one malformed teaser must not abort the page.
                    print(f"Error extracting data for an article: {e}")

        # Add a random delay between pages to mimic human browsing
        time.sleep(random.uniform(2, 5))

print("Scraping completed")
if failed_pages:
    print(f"Pages that could not be fetched: {failed_pages}")

Scraping page 60: https://www.ft.com/oil?page=60 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 61: https://www.ft.com/oil?page=61 with headers Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 62: https://www.ft.com/oil?page=62 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 63: https://www.ft.com/oil?page=63 with headers Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 64: https://www.ft.com/oil?page=64 with headers Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 65: https://www.ft.com/oil?page=65 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTM

KeyboardInterrupt: 

In [80]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

# Listing URL template; "{}" is filled with the page number via str.format.
BASE_URL = "https://www.ft.com/oil?page={}"


def get_random_headers():
    """Return request headers holding a single randomly chosen User-Agent.

    The value is drawn with random.choice from three Chrome 91 strings
    (Windows, macOS and Linux variants).
    """
    agent_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )
    picked_agent = random.choice(agent_pool)
    return {"User-Agent": picked_agent}

In [81]:
# Scrape listing pages 119-131 and write one CSV row (headline, date) per teaser.
MAX_ATTEMPTS = 3  # total tries per page before the page is recorded as failed
RETRYABLE_STATUSES = {429, 500, 502, 503, 504}  # statuses worth another attempt
failed_pages = []  # pages that never returned HTTP 200; reported at the end

# Open a CSV file to store the results
with open("ft_oil_articles_part_3.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Headline", "Date"])  # Write header row

    # Loop through pages 119 to 131 (the upper bound of range is exclusive)
    for page in range(119, 132):
        url = BASE_URL.format(page)
        html = None  # stays None if every attempt for this page fails

        for attempt in range(1, MAX_ATTEMPTS + 1):
            headers = get_random_headers()  # new randomized headers per attempt
            print(f"Scraping page {page}: {url} with headers {headers['User-Agent']}")

            try:
                # Send a GET request with randomized headers
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"Error fetching page {page}: {e}")
                retryable = True
            else:
                if response.status_code == 200:
                    html = response.text
                    break
                print(f"Failed to fetch page {page}. Status code: {response.status_code}")
                # Statuses outside this set (e.g. 403/404) are not retried.
                retryable = response.status_code in RETRYABLE_STATUSES

            if not retryable or attempt == MAX_ATTEMPTS:
                break
            time.sleep(5)  # back off before retrying the same page

        if html is None:
            # Record the gap instead of silently dropping the page.
            failed_pages.append(page)
        else:
            soup = BeautifulSoup(html, "html.parser")

            # A CSS selector matches <li> elements carrying both classes in any
            # order; class_="a b" would only match that exact attribute string.
            articles = soup.select("li.o-teaser-collection__item.o-grid-row")
            for article in articles:
                try:
                    # Extract the date
                    date_tag = article.find("time", class_="o-date")
                    date = date_tag.text.strip() if date_tag else "No date"

                    # Extract the headline
                    headline_tag = article.find("a", class_="js-teaser-heading-link")
                    headline = headline_tag.text.strip() if headline_tag else "No headline"

                    # Write to CSV
                    writer.writerow([headline, date])
                except Exception as e:
                    # Best effort: one malformed teaser must not abort the page.
                    print(f"Error extracting data for an article: {e}")

        # Add a random delay between pages to mimic human browsing
        time.sleep(random.uniform(2, 5))

print("Scraping completed")
if failed_pages:
    print(f"Pages that could not be fetched: {failed_pages}")

Scraping page 119: https://www.ft.com/oil?page=119 with headers Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 120: https://www.ft.com/oil?page=120 with headers Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 121: https://www.ft.com/oil?page=121 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 122: https://www.ft.com/oil?page=122 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 123: https://www.ft.com/oil?page=123 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Scraping page 124: https://www.ft.com/oil?page=124 with headers Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, l