In [2]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# File the scraped sections are written to (as JSON)
output_path = 'scraped_datav1.json'

# Seconds to wait for the server before giving up instead of hanging the cell
request_timeout = 30

# LaTeX delimiters recognised in the page text: display math `$$...$$`,
# inline math `$...$`, and display math `\[...\]`.  `$$` is tried first so it
# is not mistaken for an empty inline `$...$` pair, and every group requires at
# least one character so an empty pair never produces a match.
latex_pattern = r'\$\$([\s\S]+?)\$\$|\$([^$\n]+?)\$|\\\[([\s\S]+?)\\\]'
latex_regex = re.compile(latex_pattern)


def extract_equations(text):
    """Return the LaTeX snippets found in ``text``, delimiters removed.

    Args:
        text: Plain text to scan.

    Returns:
        A list of stripped equation bodies in the order they appear; matches
        that are empty after stripping are dropped.  The pattern is purely
        textual, so any two `$` signs on one line (e.g. prices) also match.
    """
    equations = []
    for groups in latex_regex.findall(text):
        # Only one alternative of the pattern matches; the other groups are ''.
        body = next((group.strip() for group in groups if group.strip()), None)
        if body is not None:
            equations.append(body)
    return equations


def iter_section_nodes(heading):
    """Yield the siblings that follow ``heading`` up to the next ``<h2>``.

    Args:
        heading: The ``<h2>`` element that opens the section.

    Yields:
        Each following sibling (tags and bare text nodes alike), stopping
        before the next sibling whose tag name is ``h2`` so that one section
        never picks up content belonging to a later one.
    """
    for node in heading.next_siblings:
        # Text nodes are str subclasses and have no tag name to inspect.
        if not isinstance(node, str) and node.name == 'h2':
            break
        yield node


def extract_section(heading):
    """Collect the content that belongs to a single ``<h2>`` section.

    Args:
        heading: The ``<h2>`` element that opens the section.

    Returns:
        A dict with the keys ``paragraphs``, ``tables``, ``links``,
        ``equations``, ``ordered_lists`` and ``unordered_lists``.  Paragraphs,
        tables and lists are taken from the direct siblings of the heading;
        links and equations are also searched for inside those siblings.
    """
    section_data = {
        'paragraphs': [],
        'tables': [],
        'links': [],
        'equations': [],
        'ordered_lists': [],
        'unordered_lists': []
    }

    for node in iter_section_nodes(heading):
        if isinstance(node, str):
            # Bare text sitting directly between tags can still carry math.
            section_data['equations'].extend(extract_equations(str(node)))
            continue

        node_text = node.get_text()
        section_data['equations'].extend(extract_equations(node_text))

        if node.name == 'p':
            section_data['paragraphs'].append(node_text.strip())
        elif node.name == 'table':
            section_data['tables'].append(str(node))
        elif node.name == 'ol':
            section_data['ordered_lists'].append(
                [li.get_text().strip() for li in node.find_all('li')]
            )
        elif node.name == 'ul':
            section_data['unordered_lists'].append(
                [li.get_text().strip() for li in node.find_all('li')]
            )

        # Anchors are normally nested inside paragraphs or lists rather than
        # being siblings of the heading, so look inside each element as well.
        anchors = [node] if node.name == 'a' else node.find_all('a')
        for link in anchors:
            section_data['links'].append({
                'href': link.get('href'),
                'text': link.get_text().strip()
            })

    return section_data


# Run the scrape when executed directly (a notebook kernel or `python file.py`
# both use "__main__"), but not when this code is merely imported.
if __name__ == "__main__":
    # Fetch the content from the URL, failing loudly on HTTP errors so an
    # error page is never parsed as if it were the lecture notes.
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    main_content_div = soup.find('div', class_='main-content', id='main-content')

    # Maps each <h2> title to the data extracted for that section.
    # NOTE: a title that occurs twice overwrites the earlier section.
    scraped_data = {}

    if main_content_div is not None:
        for h2 in main_content_div.find_all('h2'):
            scraped_data[h2.get_text().strip()] = extract_section(h2)
    else:
        print("Warning: main content container not found; no sections were scraped.")

    # Serialize scraped data to JSON
    json_data = json.dumps(scraped_data, indent=4)

    # Write JSON data to a file (explicit encoding keeps output platform-independent)
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

    print(f"Data has been scraped and stored in {output_path}.")


Data has been scraped and stored in scraped_data.json.
