In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

In [32]:
categories = []
links = []
descriptions = []

topics_list = []
topics_links = []
topics_content = []
associated_categories = []

In [6]:
def extract_categories(soup):
    rows = soup.select('tbody > tr')

    for row in rows:
        category_div = row.find('td', class_='category').find('div', itemprop='itemListElement')

        if category_div:
            # Extract category name
            heading_tag = category_div.find('h3').find('span', itemprop='name')
            if heading_tag:
                category_text = heading_tag.text.strip()
                categories.append(category_text)

            # Extract category link
            link_tag = category_div.find('meta', itemprop='url')
            if link_tag:
                category_link = link_tag['content']
                full_category_link = url + category_link
                links.append(full_category_link)

            # Extract category description
            description_tag = category_div.find('div', itemprop='description')
            description_text = description_tag.text.strip() if description_tag else 'No description available'
            descriptions.append(description_text)


In [31]:
# Function to visit each category link and scrape topics
def extract_topics():
    for category_text, full_category_link in zip(categories, links):
        sub_response = requests.get(full_category_link)  # Visit the category page
        sub_soup = BeautifulSoup(sub_response.content, 'html.parser')

        # Extract topics from the category page
        topics = sub_soup.select('td.main-link a.title')
        if topics:
            for topic in topics:
                topic_name = topic.text.strip()
                topic_link = topic['href']  # Extract topic link

                extract_content(topic_link)

                topics_list.append(topic_name)  # Store topic name
                topics_links.append(topic_link)  # Store topic link
                associated_categories.append(category_text)  # Store the associated category
        else:
            # Append 'N/A' when no topics are found
            topics_list.append('N/A')
            topics_links.append('N/A')
            associated_categories.append(category_text)  # Store the category even if no topics found

In [33]:
def extract_content(topic_link):
    response = requests.get(topic_link)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract content inside 'topic-body'
    desc = soup.find('div', class_='topic-body')  # For debugging purposes, you can remove this later
    if desc:
        # Initialize cleaned_content as an empty string
        cleaned_content = ""

        # Iterate through all elements within the topic body
        for element in desc.contents:
            if element.name == 'a':  # If the element is a link
                link_text = element.get_text(strip=True)  # Get the link text
                link_href = element['href']  # Get the link URL
                cleaned_content += f"{link_text} ({link_href}) "  # Add link in the content
            else:
                # If it's not a link, get the text content
                cleaned_content += element.get_text(separator=' ', strip=True) + " "  # Add normal text

        # Clean up the final content by stripping excessive whitespace
        cleaned_content = ' '.join(cleaned_content.split())

        # Append cleaned content to topics_content
        topics_content.append(cleaned_content)
    else:
        topics_content.append('No topic body available')


In [34]:
# Main script
if __name__ == "__main__":
    # Load the initial page for category extraction
    url = 'https://gov.optimism.io/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 1: Extract main categories
    extract_categories(soup)

    # Step 2: Visit each category and extract topics
    extract_topics()

In [None]:
topics_content

In [37]:
main_data = {
    'Main Category': categories,
    'Link': links,
    'Description': descriptions
}

sub_data = {
    'Topics': topics_list,
    'Topics Links': topics_links,
    'Topics description': topics_content,
    'Main Category': associated_categories
}

In [38]:
main_df = pd.DataFrame(main_data)
main_df.to_csv('main_categories.csv', index=False)

sub_df = pd.DataFrame(sub_data)
sub_df.to_csv('topics.csv', index=False)

print("Data has been saved to 'main_categories.csv' and 'topics.csv'.")

Data has been saved to 'main_categories.csv' and 'topics.csv'.


In [39]:
df = pd.read_csv("main_categories.csv")
df.head()

Unnamed: 0,Main Category,Link,Description
0,Get Started üå±,https://gov.optimism.io//c/get-started/67,Welcome to the Optimism Collective governance ...
1,Mission Grants üèπ,https://gov.optimism.io//c/mission-grants/69,How to get a grant from the Governance Fund an...
2,Delegates üèõ,https://gov.optimism.io//c/delegates/41,"Info and discussions on voting, delegation, an..."
3,Retro Funding üî¥,https://gov.optimism.io//c/retrofunding/46,Retroactive Public Goods Funding rounds inform...
4,Citizens üë•,https://gov.optimism.io//c/citizens/79,This category is for all things relating to Ci...


In [40]:
df = pd.read_csv("topics.csv")
df.head()

Unnamed: 0,Topics,Topics Links,Topics description,Main Category
0,How to Stay up to Date,https://gov.optimism.io/t/how-to-stay-up-to-da...,"system June 16, 2023, 11:17am 1 Governance Cal...",Get Started üå±
1,How to Navigate the Forum,https://gov.optimism.io/t/how-to-navigate-the-...,"system June 16, 2023, 10:29am 1 How to Get a G...",Get Started üå±
2,About the Optimism Collective,https://gov.optimism.io/t/about-the-optimism-c...,"system June 16, 2023, 10:08am 1 Welcome to the...",Get Started üå±
3,Working Constitution of the Optimism Collective,https://gov.optimism.io/t/working-constitution...,"system April 26, 2022, 1:10am 1 The Optimism C...",Get Started üå±
4,Governance Season Guides,https://gov.optimism.io/t/governance-season-gu...,"system June 16, 2023, 10:31am 1 Guide to Seaso...",Get Started üå±


In [41]:
# from google.colab import files

# files.download("main_categories.csv")
# files.download("topics.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>