In [2]:
# scrape_code.ipynb
import csv
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from requests.exceptions import RequestException
import time

In [49]:
# First page of the forum thread to scrape. main() requests this URL as-is for
# page 1 and appends "/p<N>" to it for every later page.
base_url = "https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans" 

def extract_words_from_url(url, timeout=30):
    """Fetch one forum page and extract author, date and text of each comment.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        A tuple ``(authors, dates, messages)`` of three parallel lists with one
        entry per ``<li class="Item">`` element found on the page. An entry is
        ``None`` when the corresponding element is missing from a comment.
        If the request fails, the error is printed and three empty lists are
        returned, so callers can always unpack three values.
    """
    authors = []
    dates = []
    messages = []

    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            # Retry transient failures (rate limiting / server errors) with
            # exponential backoff.
            retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
            adapter = HTTPAdapter(max_retries=retry)
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            # Send a browser-like User-Agent header with the request.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
            }

            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            html = response.text

    except RequestException as e:
        print(f"Error scraping {url}: {e}")  # Log the error message
        # Same shape as the success path so the caller's 3-way unpack works.
        return authors, dates, messages

    finally:
        # Pause between requests, whether or not this one succeeded.
        time.sleep(1)

    soup = BeautifulSoup(html, 'html.parser')

    for comment in soup.find_all('li', class_='Item'):
        message_div = comment.find('div', class_='Message userContent')
        messages.append(message_div.text.strip() if message_div is not None else None)

        author_link = comment.find('a', class_='Username')
        authors.append(author_link.text.strip() if author_link is not None else None)

        # .get() yields None instead of raising KeyError when a <time> tag has
        # no "datetime" attribute.
        time_tag = comment.find('time')
        dates.append(time_tag.get('datetime') if time_tag is not None else None)

    return authors, dates, messages

# function to write to csv
def write_to_csv(authors, dates, messages):
    """Write the scraped comments to ``comments.csv`` in the working directory.

    The file is overwritten. It starts with a ``user-id,date,message`` header
    followed by one row per (author, date, message) triple; the three inputs
    are paired positionally and pairing stops at the shortest one.
    """
    header = ["user-id", "date", "message"]
    rows = zip(authors, dates, messages)

    with open('comments.csv', mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(rows)

    print("Data successfully written to 'comments.csv'.")
    

def main(max_messages=5000):
    """Scrape the thread page by page and save the comments to CSV.

    Pages are requested in order (``base_url`` for page 1, ``base_url/p<N>``
    afterwards) until at least ``max_messages`` comments have been collected,
    a page yields no comments, or a page fails to download. Whatever has been
    collected by then is written out, so a failure late in the run does not
    discard earlier pages.

    Args:
        max_messages: Number of comments to collect and the maximum number of
            rows written to the CSV file.

    Returns:
        A tuple ``(all_authors, all_dates, all_messages)`` holding everything
        that was scraped. These lists are not truncated to ``max_messages``.
    """
    # Initialize combined lists
    all_authors = []
    all_dates = []
    all_messages = []

    # Start scraping from page 1
    page = 1
    total_messages = 0

    while total_messages < max_messages:
        url = base_url if page == 1 else f"{base_url}/p{page}"
        print(f"Scraping page {page}: {url}")

        # Extract data from the current page
        result = extract_words_from_url(url)

        # A failed request may come back as something other than the usual
        # (authors, dates, messages) triple; unpacking that blindly would raise
        # ValueError and lose every page scraped so far.
        if len(result) != 3:
            print(f"Failed to scrape page {page}. Stopping.")
            break
        authors, dates, messages = result

        # Stop if no new messages are found (end of pages)
        if not messages:
            print("No more messages found. Stopping.")
            break

        # Add the messages to the combined list
        all_authors.extend(authors)
        all_dates.extend(dates)
        all_messages.extend(messages)

        # Update the total number of messages
        total_messages += len(messages)
        print(f"Total messages scraped so far: {total_messages}")

        # Increment page number for the next iteration
        page += 1

    # The last page may overshoot the target, so cap what is written.
    write_to_csv(
        all_authors[:max_messages],
        all_dates[:max_messages],
        all_messages[:max_messages],
    )
    return all_authors, all_dates, all_messages


# Start the scrape only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Scraping page 1: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans
Total messages scraped so far: 50
Scraping page 2: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p2
Total messages scraped so far: 100
Scraping page 3: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p3
Total messages scraped so far: 150
Scraping page 4: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p4
Total messages scraped so far: 200
Scraping page 5: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p5
Total messages scraped so far: 250
Scraping page 6: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p6
Total messages scraped so far: 300
Scraping page 7: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p7
Total messages sc