In [30]:
import json
from bs4 import BeautifulSoup
import re
import requests

# Scrape the lecture-notes page at `url` and store its headings, paragraphs,
# tables, links and LaTeX snippets as JSON in `output_path`.

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Destination file. The status message at the end is built from this same
# name so the reported path can never disagree with the file actually written.
output_path = 'scraped_datav1.json'

# Fetch the content from the URL.
# - timeout: without it a stalled connection blocks the kernel indefinitely.
# - raise_for_status(): fail loudly on 4xx/5xx instead of parsing an error
#   page as if it were the lecture notes.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Inner HTML of the main content <div>; this is the text searched for LaTeX.
# If the <div> is absent, html_content stays empty and no equations are found.
main_content_div = soup.find('div', class_='main-content', id='main-content')
html_content = main_content_div.decode_contents() if main_content_div else ""

# Element lookups run against the whole document, not just the main <div>.
h2_sections = soup.find_all('h2')
paragraphs = soup.find_all('p')
tables = soup.find_all('table')
links = soup.find_all('a')

# LaTeX delimiters: inline `$...$` (single line) or display `\[...\]`
# (may span several lines). Each match is a 2-tuple with one group filled.
latex_pattern = r'\$(.*?)\$|\\\[([\s\S]*?)\\\]'
equations = re.findall(latex_pattern, html_content)

# Dictionary holding everything that was scraped
scraped_data = {
    'h2_sections': [h2.text.strip() for h2 in h2_sections],
    'paragraphs': [p.text.strip() for p in paragraphs],
    # Tables are kept as their raw HTML string representation
    'tables': [str(table) for table in tables],
    'links': [
        {'href': link.get('href'), 'text': link.text.strip()}
        for link in links
    ],
    'equations': []
}

for equation in equations:
    # Both groups can be empty (e.g. the pattern matches a bare `$$`); the
    # default keeps next() from raising StopIteration on such a match.
    equation_text = next(filter(None, equation), '').strip()
    if equation_text:  # drop matches that contain no actual LaTeX
        scraped_data['equations'].append(equation_text)

# Serialize scraped data to JSON
json_data = json.dumps(scraped_data, indent=4)

# Write JSON data to a file (explicit encoding so the result does not depend
# on the platform's default locale)
with open(output_path, 'w', encoding='utf-8') as json_file:
    json_file.write(json_data)

print(f"Data has been scraped and stored in {output_path}.")


Data has been scraped and stored in scraped_data.json.
