# Jupiter Community Forum Scraper

This notebook contains an asynchronous web scraper that uses Playwright to:
- Fetch topic URLs from categories and tags
- Scrape individual topics for posts, user info, links, images, and tags
- Save results to JSON files



In [None]:
import asyncio
from playwright.async_api import async_playwright
import json
import os

# Constants
# Listing page of the forum's "Help" category; scrolled to collect topic links.
HELP_URL = 'https://community.jupiter.money/c/help/27'
# Index page of the forum's tags; each tag link found here is visited in turn.
TAGS_URL = 'https://community.jupiter.money/tags'



### Fetches all topic URLs by scrolling through the “Help” category page on Jupiter’s Discourse forum.



In [None]:
async def get_topic_urls_from_category(page, scroll_pause=3):
    """Collect every topic URL listed on the "Help" category page.

    Opens ``HELP_URL`` and keeps scrolling to the bottom of the page until
    the document height stops growing, gathering the ``href`` of every
    ``a.title`` link seen along the way.

    Args:
        page: Playwright page instance
        scroll_pause: Seconds to wait after each scroll before reading the
            page again (defaults to 3)
    Returns:
        set: Unique topic URLs from the help category; empty when the page
            cannot be loaded or no topic link appears
    """
    topic_urls = set()

    # A navigation failure is reported and treated like "nothing found" so
    # the caller can still go on with other sources.
    try:
        await page.goto(HELP_URL)
    except Exception as e:
        print(f"Error loading help category: {e}")
        return topic_urls

    # ``except Exception`` (not a bare ``except``) so task cancellation and
    # KeyboardInterrupt are never swallowed here.
    try:
        await page.wait_for_selector("a.title", timeout=10000)
    except Exception:
        print("No topics found in help category")
        return topic_urls

    previous_height = 0
    while True:
        # Scroll down
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await asyncio.sleep(scroll_pause)

        # Extract current topic URLs
        topics = await page.query_selector_all("a.title")
        for t in topics:
            href = await t.get_attribute("href")
            if not href:
                continue
            if href.startswith('http'):
                topic_urls.add(href)
            else:
                # Site-relative link: prefix the forum origin
                topic_urls.add("https://community.jupiter.money" + href)

        # Stop once scrolling no longer makes the page any taller
        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            break
        previous_height = current_height
    return topic_urls

### Fetches all tag page URLs from the Jupiter Community forum.

In [None]:
async def get_tags(page):
    """Return the URL of every tag page listed on the forum's tag index.

    Navigates to ``TAGS_URL``, waits for the tag boxes to appear and reads
    the ``href`` of each one. Site-relative links are prefixed with the
    forum origin.

    Args:
        page: Playwright page instance
    Returns:
        list: Tag page URLs, or an empty list if anything goes wrong
    """
    origin = "https://community.jupiter.money"
    box_selector = ".discourse-tag.box"
    try:
        await page.goto(TAGS_URL)
        await page.wait_for_selector(box_selector, timeout=10000)

        collected = []
        for box in await page.query_selector_all(box_selector):
            link = await box.get_attribute("href")
            if not link:
                continue
            collected.append(link if link.startswith('http') else origin + link)
        return collected
    except Exception as err:
        print(f"Error fetching tags: {err}")
        return []


### Scrolls through each tag page on the Jupiter Community forum and extracts all associated topic URLs.

In [None]:
async def get_topic_urls_from_tags(page, tag_urls, scroll_pause=2):
    """Collect the topic URLs listed on each of the given tag pages.

    Every tag page is opened and scrolled to the bottom repeatedly until the
    document height stops growing; the ``href`` of each ``a.title`` link seen
    is recorded. A tag page that fails to load, or shows no topic link, is
    reported and skipped so the remaining tags are still processed.

    Args:
        page: Playwright page instance
        tag_urls: List of tag page URLs
        scroll_pause: Seconds to wait after each scroll before reading the
            page again (defaults to 2)
    Returns:
        set: Unique topic URLs from all tag pages
    """
    topic_urls = set()
    for tag_url in tag_urls:
        print(f"Scrolling through tag: {tag_url}")

        # One unreachable tag page must not abort the whole crawl.
        try:
            await page.goto(tag_url)
        except Exception as e:
            print(f"Error loading tag {tag_url}: {e}")
            continue

        # ``except Exception`` (not a bare ``except``) so task cancellation
        # and KeyboardInterrupt are never swallowed here.
        try:
            await page.wait_for_selector("a.title", timeout=10000)
        except Exception:
            print(f"No topics found in tag: {tag_url}")
            continue

        previous_height = 0
        while True:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(scroll_pause)

            topics = await page.query_selector_all("a.title")
            for t in topics:
                href = await t.get_attribute("href")
                if not href:
                    continue
                if href.startswith("http"):
                    topic_urls.add(href)
                else:
                    # Site-relative link: prefix the forum origin
                    topic_urls.add("https://community.jupiter.money" + href)

            # Stop once scrolling no longer makes the page any taller
            current_height = await page.evaluate("document.body.scrollHeight")
            if current_height == previous_height:
                break
            previous_height = current_height

    return topic_urls

### Extracts the name and title of the user who authored a forum post from a `.topic-body` element.

In [1]:
async def extract_user_info(topic_body_element):
    """Read the author's display name and title from one post body.

    Looks up ``.first.full-name a`` for the name and ``.user-title`` for the
    title. A field whose element is missing keeps its default. If any lookup
    raises, the error is printed and whatever was gathered so far is
    returned.

    Args:
        topic_body_element: Playwright element handle for the .topic-body
    Returns:
        dict: {'name': str, 'title': str}; defaults are '' and 'User'
    """
    info = {'name': '', 'title': 'User'}
    field_selectors = (
        ('name', '.first.full-name a'),
        ('title', '.user-title'),
    )
    try:
        for field, css in field_selectors:
            node = await topic_body_element.query_selector(css)
            if not node:
                continue
            raw_text = await node.text_content()
            info[field] = raw_text.strip()
    except Exception as err:
        print(f"Error extracting user info: {err}")
    return info

The `scrape_topic` function is an **asynchronous web scraping utility** built with [Playwright](https://playwright.dev/). It scrapes structured data from a discussion topic page on a forum-like website (such as Discourse-based forums).

In [None]:
async def scrape_topic(page, url):
    """Scrape one topic page into a plain dictionary.

    Loads ``url``, then reads the title, every ``.topic-body`` post found on
    the page (author info via ``extract_user_info`` plus the ``.cooked``
    text), the links and images inside post bodies, and the topic's tags.
    The first post is labelled 'question', all later ones 'reply'; posts
    without an author name or without text are skipped.

    Args:
        page: Playwright page instance
        url: Topic URL
    Returns:
        dict: Scraped data with keys title, url, text, images, links,
            replies, posts, tags; ``None`` if the page could not be scraped
            (the error is printed)
    """
    try:
        await page.goto(url, timeout=30000)
        await page.wait_for_selector('.topic-body', timeout=15000)

        # Title extraction: prefer .fancy-title, fall back to the first <h1>.
        # ``except Exception`` keeps cancellation/KeyboardInterrupt visible.
        title = ''
        try:
            title_el = await page.wait_for_selector('.fancy-title', timeout=5000)
            title = await title_el.text_content()
        except Exception:
            alt_el = await page.query_selector('h1')
            if alt_el:
                title = await alt_el.text_content()
        # text_content() may yield None; never let that fail the whole topic
        title = (title or '').strip()

        # Posts extraction
        topic_bodies = await page.query_selector_all('.topic-body')
        posts = []
        for idx, body in enumerate(topic_bodies):
            user_info = await extract_user_info(body)
            post_text = ''
            cooked = await body.query_selector('.cooked')
            if cooked:
                post_text = ((await cooked.text_content()) or '').strip()
            post_type = 'question' if idx == 0 else 'reply'
            if user_info['name'] and post_text:
                posts.append({
                    'user': user_info,
                    'text': post_text,
                    'post_type': post_type,
                    'post_index': idx
                })

        # Links and images: read each attribute once instead of twice
        links = []
        for a in await page.query_selector_all('.topic-body .cooked a'):
            href = await a.get_attribute('href')
            if href:
                links.append(href)

        images = []
        for img in await page.query_selector_all('.topic-body .cooked img'):
            src = await img.get_attribute('src')
            if src:
                images.append(src)

        # Tags
        tags = [await t.text_content()
                for t in await page.query_selector_all('.title-wrapper .discourse-tag.box')]

        # Separate question and replies
        question_text = next((p['text'] for p in posts if p['post_type'] == 'question'), '')
        replies = [{'user': p['user']['name'], 'text': p['text']}
                   for p in posts if p['post_type'] == 'reply']

        return {'title': title, 'url': url, 'text': question_text, 'images': images,
                'links': links, 'replies': replies, 'posts': posts, 'tags': tags}
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

The `main` function is the asynchronous entry point for scraping Jupiter's community forum. It uses **Playwright** to automate browsing and extract FAQ-related data from discussion threads across both the "Help" category and tag-filtered views.

In [None]:
async def main(test_mode=False, max_pages=5):
    """Run the whole scrape and write the results to JSON files.

    Gathers topic URLs from the help category and from every tag page,
    scrapes each topic with ``scrape_topic``, and saves the scraped topics
    and the list of URLs that failed.

    Args:
        test_mode: When true, only the first ``max_pages`` URLs are scraped
            and the output goes to the ``*_test.json`` files
        max_pages: Number of topics to scrape in test mode
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        try:
            page = await browser.new_page()
            # set_viewport_size is a coroutine in the async API; without
            # ``await`` it never runs.
            await page.set_viewport_size({"width": 1920, "height": 1080})
            await page.set_extra_http_headers({"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

            help_urls = await get_topic_urls_from_category(page)
            tag_urls = await get_tags(page)
            tag_topic_urls = await get_topic_urls_from_tags(page, tag_urls)
            # Sorted so the order (and the test-mode slice) is reproducible
            all_urls = sorted(help_urls.union(tag_topic_urls))
            if test_mode:
                all_urls = all_urls[:max_pages]

            results, failed = [], []
            for i, url in enumerate(all_urls):
                print(f"Scraping {i+1}/{len(all_urls)}: {url}")
                res = await scrape_topic(page, url)
                if res:
                    results.append(res)
                else:
                    failed.append(url)
                # Short pause after every 10th topic
                if (i + 1) % 10 == 0:
                    await asyncio.sleep(2)
        finally:
            # Release the browser even if URL collection or scraping raised
            await browser.close()

        # Save outputs
        out_file = 'faq_data_test.json' if test_mode else 'faq_data_raw.json'
        fail_file = 'failed_urls_test.json' if test_mode else 'failed_urls.json'
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        with open(fail_file, 'w', encoding='utf-8') as f:
            json.dump(failed, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(results)} topics to {out_file}")

# Entry point
# True: scrape only the first few topics (main's max_pages) into the *_test.json
# files; False: scrape all pages.
TEST_MODE = False

# Run the scraper.
# asyncio.run() raises RuntimeError when an event loop is already running in
# this thread (the case inside a Jupyter kernel), so there the scraper is
# scheduled on the existing loop instead.
try:
    _running_loop = asyncio.get_running_loop()
except RuntimeError:
    _running_loop = None

if _running_loop is None:
    asyncio.run(main(test_mode=TEST_MODE))
else:
    # Keep a reference so the task is not garbage-collected mid-run; in a
    # notebook, `await _scraper_task` waits for it to finish.
    _scraper_task = _running_loop.create_task(main(test_mode=TEST_MODE))