In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
 
# Forum root. Category URLs below are built from it, and relative topic
# links found on listing pages are resolved against it.
BASE_URL = "https://discuss.huggingface.co/"

# Output directory (one JSON file per category), created up front so the
# scraper can write into it without further checks.
SAVE_DIR = "huggingface_forum"
os.makedirs(SAVE_DIR, exist_ok=True)

# (label, "<slug>/<numeric id>") pairs, listed in crawl order.
# Note that a label is not always identical to the forum slug
# (see "Research", "diffusers" and "model-card").
_CATEGORY_PATHS = (
    ("Research", "research/7"),
    ("beginners", "beginners/5"),
    ("intermediate", "intermediate/6"),
    ("course", "course/20"),
    ("models", "models/13"),
    ("transformers", "transformers/9"),
    ("datasets", "datasets/10"),
    ("tokenizers", "tokenizers/11"),
    ("accelerate", "accelerate/18"),
    ("autotrain", "autotrain/16"),
    ("hub", "hub/23"),
    ("optimum", "optimum/59"),
    ("gradio", "gradio/26"),
    ("diffusers", "discussion-related-to-httpsgithubcomhuggingfacediffusers/63"),
    ("inference-endpoints", "inference-endpoints/64"),
    ("sagemaker", "sagemaker/17"),
    ("aws-inferentia-trainium", "aws-inferentia-trainium/66"),
    ("azureml", "azureml/68"),
    ("google-cloud", "google-cloud/69"),
    ("spaces", "spaces/24"),
    ("model-card", "model-cards/14"),
    ("languages-at-hugging-face", "languages-at-hugging-face/15"),
    ("flax-jax-projects", "flax-jax-projects/22"),
    ("community-calls", "community-calls/12"),
    ("show-and-tell", "show-and-tell/65"),
    ("site-feedback", "site-feedback/2"),
)

# Label -> absolute URL of the category's topic listing.
categories = {
    label: f"{BASE_URL}c/{path}" for label, path in _CATEGORY_PATHS
}

def _datetime_or_placeholder(time_tag):
    """Return the ``datetime`` attribute of a ``<time>`` tag, or a placeholder."""
    if time_tag is None:
        return "No date available"
    # A <time> tag lacking the attribute is reported the same way as a
    # missing tag, so the date field never mixes None with strings.
    return time_tag.get("datetime", "No date available")


def scrape_topic_initial_post(topic_url, timeout=30):
    """Fetch a topic page and extract its opening post and the replies.

    Only the single page returned for ``topic_url`` is parsed; posts that
    the server does not include in that response are not collected.

    Args:
        topic_url: Absolute URL of the topic page.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``(initial_post, initial_post_date, responses)`` tuple where
        ``initial_post`` is the text of the first ``.topic-body .post``
        element (or ``"Initial post not available"``), ``initial_post_date``
        is the ``datetime`` attribute of the ``<time>`` tag preceding it (or
        ``"No date available"``), and ``responses`` is a list of
        ``{"date": ..., "reply": ...}`` dicts, one per ``.topic-body``
        element after the first.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(topic_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Opening post: its text, and the nearest <time> tag that appears
    # before it in the document.
    first_post = soup.select_one(".topic-body .post")
    if first_post is None:
        initial_post = "Initial post not available"
        initial_post_date = "No date available"
    else:
        initial_post = first_post.get_text(strip=True)
        initial_post_date = _datetime_or_placeholder(
            first_post.find_previous("time")
        )

    # Replies: every post container except the first, which is assumed to
    # hold the opening post.
    responses = []
    for body in soup.select(".topic-body")[1:]:
        content = body.select_one(".post")
        if content is None:
            reply_text = "No reply text available"
        else:
            reply_text = content.get_text(strip=True)
        responses.append({
            "date": _datetime_or_placeholder(body.select_one("time")),
            "reply": reply_text,
        })

    return initial_post, initial_post_date, responses

def scrape_topics(category_name, category_url, timeout=30):
    """Scrape every topic of one forum category and save them as JSON.

    Walks the category's listing pages until a page contains no
    ``.topic-list-item`` element. For each listed topic the title, link,
    reply count and view count are read from the listing, and the topic
    page itself is fetched with ``scrape_topic_initial_post``. The result
    is written to ``<SAVE_DIR>/<category_name>.json``.

    Args:
        category_name: Label of the category; used in progress messages and
            as the output file name.
        category_url: Absolute URL of the category listing (without a
            ``page`` query parameter).
        timeout: Seconds to wait for each listing page, passed to
            ``requests.get``.

    Returns:
        None. The scraped topics are written to disk.

    Raises:
        requests.RequestException: If any listing or topic request fails.
        ValueError: If a reply or view count is not a plain integer.
        KeyError: If a title link has no ``href`` attribute.
    """
    topics = []
    # Discourse numbers listing pages from zero: ``?page=1`` is already the
    # second page, so starting there would skip the first page of topics.
    page = 0

    while True:
        response = requests.get(f"{category_url}?page={page}", timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # An empty listing means we have walked past the last page.
        topic_elements = soup.select(".topic-list-item")
        if not topic_elements:
            break

        for topic in topic_elements:
            title_element = topic.select_one(".title.raw-link.raw-topic-link")
            if title_element is None:
                continue  # Row without a topic link; nothing to scrape

            link = urljoin(BASE_URL, title_element["href"])
            replies = topic.select_one(".replies .posts")
            views = topic.select_one(".views .views")
            num_replies = int(replies.get_text(strip=True)) if replies is not None else 0
            num_views = int(views.get_text(strip=True)) if views is not None else None

            # Fetch the topic page for the post bodies and dates.
            initial_post, initial_post_date, responses = scrape_topic_initial_post(link)

            topics.append({
                "title": title_element.get_text(strip=True),
                "link": link,
                "replies": num_replies,
                "views": num_views,
                "initial_post": initial_post,
                "initial_post_date": initial_post_date,
                # Only keep responses when the listing reports replies.
                "responses": responses if num_replies > 0 else [],
            })

            # Pause to avoid overloading the server
            time.sleep(1)

        # Report a 1-based page number for readability.
        print(f"Page {page + 1} processed for category '{category_name}'.")
        page += 1

    # Save to JSON file
    json_filename = os.path.join(SAVE_DIR, f"{category_name}.json")
    with open(json_filename, "w", encoding="utf-8") as json_file:
        json.dump(topics, json_file, indent=4)

    print(f"Saved {len(topics)} topics with initial post and responses for category '{category_name}' to {json_filename}")

# Entry point of the cell: scrape the categories one after another, in the
# order they are declared; each call writes that category's JSON file.
for category_name, category_url in categories.items():
    scrape_topics(category_name=category_name, category_url=category_url)
