In [15]:
import asyncio
from playwright.async_api import async_playwright
import os
import getpass
import json
from urllib.parse import urlparse

LINKS_FILE = "filtered_post_links.txt"
OUTPUT_DIR = "knowledge"

async def scrape_post_data(page, url):
    await page.goto(url)
    # Wait for posts to load
    await page.wait_for_selector("article[data-post-id] div.cooked", timeout=30000)

    posts = await page.locator("article[data-post-id]").all()
    thread_data = {
        "url": url,
        "main_post": {},
        "replies": []
    }

    for i, post in enumerate(posts):
        post_id = await post.get_attribute("data-post-id")
        content = await post.locator("div.cooked").inner_text()
        timestamp = await post.locator("span.relative-date").inner_text()

        # Extract all links in the post content
        links = await post.locator("div.cooked a").all_inner_texts()

        post_data = {
            "post_id": post_id, 
            "content": content.strip(),
            "date": timestamp,
            "links": links
        }

        if i == 0:
            thread_data["main_post"] = post_data
        else:
            thread_data["replies"].append(post_data)

    return thread_data

async def main():
    email = input("Enter your IITM email: ")
    password = getpass.getpass("Enter your IITM password (hidden): ")

    # Read all URLs from the links file
    with open(LINKS_FILE, "r") as f:
        urls = [line.strip() for line in f if line.strip()]

    # Create output directory if not exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()

        # Login step using the first URL
        await page.goto(urls[0])
        await page.fill('#login-account-name', email)
        await page.fill('#login-account-password', password)
        await page.click('#login-button')
        await page.wait_for_url(urls[0], timeout=20000)
        print("✅ Logged in.")

        # Loop through all URLs and scrape
        for url in urls:
            print(f"🔄 Scraping: {url}")
            try:
                thread_data = await scrape_post_data(page, url)

                # Extract thread ID from URL for filename
                path = urlparse(url).path.strip("/").split("/")
                thread_id = path[1] if len(path) > 1 else "unknown"
                out_file = os.path.join(OUTPUT_DIR, f"thread_{thread_id}.json")

                with open(out_file, "w", encoding="utf-8") as f:
                    json.dump(thread_data, f, indent=2, ensure_ascii=False)

                print(f"✅ Saved to {out_file}")

            except Exception as e:
                print(f"❌ Failed to scrape {url}: {e}")

        await browser.close()

# Run the async main function (use this if running in a script or Jupyter)
if __name__ == "__main__":
    await main()



Enter your IITM email:  sapta
Enter your IITM password (hidden):  ········


✅ Logged in.
🔄 Scraping: https://discourse.onlinedegree.iitm.ac.in/t/about-the-tools-in-data-science-category/23335/42
✅ Saved to knowledge\thread_about-the-tools-in-data-science-category.json
🔄 Scraping: https://discourse.onlinedegree.iitm.ac.in/t/end-term-mock-tds-jan-25/172333/11
✅ Saved to knowledge\thread_end-term-mock-tds-jan-25.json
🔄 Scraping: https://discourse.onlinedegree.iitm.ac.in/t/pyq-haversine/172546/1
✅ Saved to knowledge\thread_pyq-haversine.json
🔄 Scraping: https://discourse.onlinedegree.iitm.ac.in/t/what-to-do-if-peer-has-not-allowed-access-and-the-deadline-is-over-for-peer-review-in-project-2/172471/4
✅ Saved to knowledge\thread_what-to-do-if-peer-has-not-allowed-access-and-the-deadline-is-over-for-peer-review-in-project-2.json
🔄 Scraping: https://discourse.onlinedegree.iitm.ac.in/t/project-1-not-submitted-issue/172497/2
✅ Saved to knowledge\thread_project-1-not-submitted-issue.json
🔄 Scraping: https://discourse.onlinedegree.iitm.ac.in/t/graded-assignment-6/169283/4