# Scraping Text Data from Fiat Forums


In [3]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
   ---------------------------------------- 0.0/147.9 kB ? eta -:--:--
   ---------- ---------------------------- 41.0/147.9 kB 991.0 kB/s eta 0:00:01
   ---------------------------------------- 147.9/147.9 kB 2.9 MB/s eta 0:00:00
Downloading soupsieve-2.5-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.12.3 bs4-0.0.2 soupsieve-2.5



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\zekai\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


This cell scrapes posts and comments from the Fiat Forum, starting from a fixed search-results page for "fiat 500 uk". It follows the "next page" link through the search results (at most 50 result pages), opens every thread linked from each results page, and extracts the thread title, the content of the first post, and each comment together with its timestamp. The extracted data is saved to a CSV file, one row per comment.

In [20]:
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Forum search results homepage URL
base_url = "https://www.fiatforum.com"
search_url = "/search/887488/?q=fiat+500+uk&o=relevance"

# Set request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Seconds to wait for the server before giving up. Without a timeout a
# stalled connection would block the cell indefinitely.
request_timeout = 30

# Initial page URL
current_url = urljoin(base_url, search_url)

# Thread URLs that were already scraped. A results page usually links the
# same thread several times, and a thread can show up on several result
# pages; without this set its comments would be written to the CSV repeatedly.
seen_post_urls = set()

# Create a CSV file and save it to the specified path
file_path = 'E:/ARP/forum_data_50_pages.csv'
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Content", "Comment", "Comment Time"])

    # Limit the number of pages to scrape
    max_pages = 50
    current_page = 1

    while current_url and current_page <= max_pages:
        print(f"Processing page URL: {current_url}")
        try:
            response = requests.get(current_url, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Request for search page failed: {error}")
            break
        print(f"Page response status code: {response.status_code}")
        if not response.ok:
            # An error page contains no results; report the real cause
            # instead of the misleading "No post links found.".
            print("Search page request was rejected; stopping.")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all post links
        post_links = [
            link for link in soup.find_all("a", href=True)
            if '/threads/' in link['href'] and 'title' not in link['href']
        ]

        if not post_links:
            print("No post links found.")
            break

        print("Found post links:")
        for link in post_links:
            # urljoin copes with relative, protocol-relative and absolute hrefs.
            full_post_url = urljoin(base_url, link['href'])
            if full_post_url in seen_post_urls:
                continue
            seen_post_urls.add(full_post_url)

            print(f"Processing post URL: {full_post_url}")
            try:
                post_response = requests.get(full_post_url, headers=headers, timeout=request_timeout)
            except requests.RequestException as error:
                # One unreachable thread should not abort the whole crawl.
                print(f"Request for post failed, skipping: {error}")
                continue
            print(f"Post page response status code: {post_response.status_code}")
            if not post_response.ok:
                continue
            post_soup = BeautifulSoup(post_response.text, 'html.parser')

            # Extract post title
            title_tag = post_soup.find("h1", class_="p-title-value")
            title = title_tag.get_text().strip() if title_tag else "No title"
            print(f"Title: {title}")

            # Extract post content (first message body on the page)
            content_tag = post_soup.find("div", class_="bbWrapper")
            content = content_tag.get_text().strip() if content_tag else "No content"
            print(f"Content: {content}")

            # Extract comments and timestamps.
            # Pair each comment with the timestamp found inside the SAME post
            # container. Zipping two page-wide lists drifts out of step as soon
            # as the page holds an extra <time class="u-dt"> (e.g. in the thread
            # header) or an extra bbWrapper (e.g. a signature).
            # NOTE(review): assumes each post is wrapped in <article class="message ...">
            # as in stock XenForo 2 markup - confirm against the live page.
            post_containers = post_soup.find_all("article", class_="message")
            if post_containers:
                comment_pairs = [
                    (
                        container.find("div", class_="bbWrapper"),
                        container.find("time", class_="u-dt"),
                    )
                    for container in post_containers
                ]
            else:
                # Unknown layout: fall back to positional pairing.
                comment_pairs = zip(
                    post_soup.find_all("div", class_="bbWrapper"),
                    post_soup.find_all("time", class_="u-dt"),
                )

            for comment, time_tag in comment_pairs:
                if comment is None:
                    continue
                comment_text = comment.get_text().strip()
                comment_time = time_tag.get('datetime', 'No time') if time_tag else 'No time'
                if comment_text:
                    writer.writerow([title, content, comment_text, comment_time])

        # Find the next page link
        next_page = soup.find("a", class_="pageNav-jump pageNav-jump--next", href=True)
        if next_page:
            current_url = urljoin(base_url, next_page['href'])
            current_page += 1
        else:
            current_url = None

print("Scraping complete.")


Processing page URL: https://www.fiatforum.com/search/887488/?q=fiat+500+uk&o=relevance
Page response status code: 200
Found post links:
Processing post URL: https://www.fiatforum.com/threads/new-fiat-500-confused-from-uk.508139/
Post page response status code: 200
Title: General New Fiat 500! Confused from UK!
Content: I have just leased a new Fiat 500 hybrid. The manual is pants! Going to need help - TIA
Processing post URL: https://www.fiatforum.com/threads/window-switch-wiring-on-uk-fiat-500-s3.500202/
Post page response status code: 200
Title: Technical window switch wiring on UK fiat 500 S3
Content: Hello all,

I'm in the process of adding a UK spec. Biposto shifter and lower carbon dash to a US Spec. North American Fiat 500 Abarth. 

The Biposto kit uses the older S3 switch gear out of a Fiat 500 for the UK. and I'm trying to determine what wire on the switch does what? I think the easiest thing would be to decode an S3 switch, or the window wiring out of a UK based S4. The prio

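This cell applies the same scraping approach to the Fiat 500X Owners Club website. It starts from a fixed search-results page for "fiat uk", follows the "next page" link through the results (at most 12 result pages), and extracts the title, first-post content, comments, and comment timestamps from each linked thread. The extracted data is saved to a CSV file, one row per comment. Note: in the run recorded below, the site answered the search request with HTTP 409, so no posts were collected.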

In [1]:
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Forum search results homepage URL
base_url = "https://www.500xownersclub.co.uk"
search_url = "/search/544/?q=fiat+uk&o=relevance"

# Set request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Seconds to wait for the server before giving up. Without a timeout a
# stalled connection would block the cell indefinitely.
request_timeout = 30

# Initial page URL
current_url = urljoin(base_url, search_url)

# Thread URLs that were already scraped, so a thread listed on several
# result pages is only written to the CSV once.
seen_post_urls = set()

# Create a CSV file and save it to the specified path
file_path = 'E:/ARP/forum_data_12_pages.csv'
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Content", "Comment", "Comment Time"])

    # Limit the number of pages to scrape
    max_pages = 12
    current_page = 1

    while current_url and current_page <= max_pages:
        print(f"Processing page URL: {current_url}")
        try:
            response = requests.get(current_url, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Request for search page failed: {error}")
            break
        print(f"Page response status code: {response.status_code}")
        if not response.ok:
            # An error response (such as the HTTP 409 this site has returned)
            # carries no results; report the real cause instead of the
            # misleading "No post links found.".
            print("Search page request was rejected; stopping.")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all post links
        post_links = soup.find_all("a", href=True, qid="search-results-title")
        if not post_links:
            print("No post links found.")
            break

        print("Found post links:")
        for link in post_links:
            # urljoin copes with relative, protocol-relative and absolute hrefs.
            full_post_url = urljoin(base_url, link['href'])
            if full_post_url in seen_post_urls:
                continue
            seen_post_urls.add(full_post_url)

            print(f"Processing post URL: {full_post_url}")
            try:
                post_response = requests.get(full_post_url, headers=headers, timeout=request_timeout)
            except requests.RequestException as error:
                # One unreachable thread should not abort the whole crawl.
                print(f"Request for post failed, skipping: {error}")
                continue
            print(f"Post page response status code: {post_response.status_code}")
            if not post_response.ok:
                continue
            post_soup = BeautifulSoup(post_response.text, 'html.parser')

            # Extract post title
            title_tag = post_soup.find("h1", class_="p-title-value")
            title = title_tag.get_text().strip() if title_tag else "No title"
            print(f"Title: {title}")

            # Extract post content (first message body on the page)
            content_tag = post_soup.find("div", class_="bbWrapper", itemprop="text")
            content = content_tag.get_text().strip() if content_tag else "No content"
            print(f"Content: {content}")

            # Extract comments and timestamps.
            # Pair each comment with the timestamp found inside the SAME post
            # container. Zipping two page-wide lists drifts out of step as soon
            # as the page holds an extra <time class="u-dt"> outside the posts.
            # NOTE(review): assumes each post is wrapped in <article class="message ...">
            # as in stock XenForo 2 markup - confirm against the live page.
            post_containers = post_soup.find_all("article", class_="message")
            if post_containers:
                comment_pairs = [
                    (
                        container.find("div", class_="bbWrapper", itemprop="text"),
                        container.find("time", class_="u-dt"),
                    )
                    for container in post_containers
                ]
            else:
                # Unknown layout: fall back to positional pairing.
                comment_pairs = zip(
                    post_soup.find_all("div", class_="bbWrapper", itemprop="text"),
                    post_soup.find_all("time", class_="u-dt"),
                )

            for comment, time_tag in comment_pairs:
                if comment is None:
                    continue
                comment_text = comment.get_text().strip()
                comment_time = time_tag.get('datetime', 'No time') if time_tag else 'No time'
                if comment_text:
                    writer.writerow([title, content, comment_text, comment_time])

        # Find the next page link
        next_page = soup.find("a", class_="pageNav-jump pageNav-jump--next", href=True)
        if next_page:
            current_url = urljoin(base_url, next_page['href'])
            current_page += 1
        else:
            current_url = None

print("Scraping complete.")



Processing page URL: https://www.500xownersclub.co.uk/search/544/?q=fiat+uk&o=relevance
Page response status code: 409
No post links found.
Scraping complete.
