In [30]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page
# as if it were the lecture notes.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# The <div class="main-content" id="main-content"> element; None when absent.
main_content_div = soup.find('div', class_='main-content', id='main-content')

# Inner HTML of that <div>. Stays empty when the div is missing, in which case
# no equations are extracted below.
html_content = main_content_div.decode_contents() if main_content_div else ""

# Initialize a dictionary to store scraped data
scraped_data = {
    'h2_sections': [],
    'paragraphs': [],
    'tables': [],
    'links': [],
    'equations': []
}

# Headings, paragraphs, tables and links are collected from the whole page
# (soup); only the equations below are restricted to the main content <div>.

# Extracting <h2> sections
h2_sections = soup.find_all('h2')
scraped_data['h2_sections'].extend(h2.text.strip() for h2 in h2_sections)

# Extracting <p> sections
paragraphs = soup.find_all('p')
scraped_data['paragraphs'].extend(p.text.strip() for p in paragraphs)

# Extracting <table> sections (if any); each table is stored as raw HTML
tables = soup.find_all('table')
scraped_data['tables'].extend(str(table) for table in tables)

# Extracting <a> tags ('href' is None for anchors without that attribute)
links = soup.find_all('a')
scraped_data['links'].extend(
    {'href': link.get('href'), 'text': link.text.strip()} for link in links
)

# Extracting LaTeX equations: inline $...$ or display \[...\]
latex_pattern = r'\$(.*?)\$|\\\[([\s\S]*?)\\\]'
equations = re.findall(latex_pattern, html_content)
for equation in equations:
    # Each match is an (inline, display) tuple of the two capture groups.
    # An empty "$$" matches with both groups empty, so a default is required
    # here; without it next() raises StopIteration and the cell aborts.
    equation_text = next(filter(None, equation), None)
    if equation_text is None:
        continue
    scraped_data['equations'].append(equation_text.strip())

# Serialize scraped data to JSON
json_data = json.dumps(scraped_data, indent=4)

# Write JSON data to a file
output_path = 'scraped_datav1.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json_file.write(json_data)

# Report the path that was actually written.
print(f"Data has been scraped and stored in {output_path}.")


Data has been scraped and stored in scraped_data.json.


In [4]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# The <div class="main-content" id="main-content"> element; None when absent.
main_content_div = soup.find('div', class_='main-content', id='main-content')

# Maps each <h2> title to the content found between it and the next <h2>.
# Two headings with the same text share a key; the later one wins.
scraped_data = {}

# Inline $...$ or display \[...\] LaTeX; compiled once, outside the loop.
latex_pattern = r'\$(.*?)\$|\\\[([\s\S]*?)\\\]'
latex_regex = re.compile(latex_pattern)

if main_content_div:
    # Extract all <h2> tags
    h2_sections = main_content_div.find_all('h2')

    for h2 in h2_sections:
        section_title = h2.text.strip()
        section_data = {
            'paragraphs': [],
            'tables': [],
            'links': [],
            'equations': [],
            'ordered_lists': [],
            'unordered_lists': []
        }

        # Walk the heading's following siblings and stop at the next <h2>, so
        # content belonging to later sections is not attributed to this one.
        for sibling in h2.next_siblings:
            # Text nodes have no tag name; getattr keeps this safe either way.
            tag_name = getattr(sibling, 'name', None)

            if tag_name == 'h2':
                break

            if tag_name is None:
                # Bare text node sitting directly between elements: the only
                # place this cell looks for LaTeX (text inside <p> etc. is not
                # searched).
                if isinstance(sibling, str):
                    for groups in latex_regex.findall(sibling):
                        # An empty "$$" match leaves both groups empty; the
                        # default avoids a StopIteration from next().
                        equation_text = next(filter(None, groups), None)
                        if equation_text is not None:
                            section_data['equations'].append(equation_text.strip())
            elif tag_name == 'p':
                section_data['paragraphs'].append(sibling.text.strip())
            elif tag_name == 'table':
                # Tables are stored as raw HTML.
                section_data['tables'].append(str(sibling))
            elif tag_name == 'a':
                section_data['links'].append({
                    'href': sibling.get('href'),
                    'text': sibling.text.strip()
                })
            elif tag_name == 'ol':
                section_data['ordered_lists'].append(
                    [li.text.strip() for li in sibling.find_all('li')]
                )
            elif tag_name == 'ul':
                section_data['unordered_lists'].append(
                    [li.text.strip() for li in sibling.find_all('li')]
                )

        # Add current section data to main dictionary under the current <h2> key
        scraped_data[section_title] = section_data

# Serialize scraped data to JSON
json_data = json.dumps(scraped_data, indent=4)

# Write JSON data to a file
with open('scraped_data.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_data)

print("Data has been scraped and stored in scraped_data.json.")


Data has been scraped and stored in scraped_data.json.


In [26]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# The <div class="main-content" id="main-content"> element; None when absent.
main_content_div = soup.find('div', class_='main-content', id='main-content')

# Maps a section title to the content between that heading and the next
# <h2>/<h3>. <h3> keys are "<h3 text> <enclosing h2 text>".
scraped_data = {}
current_h2_title = ""

if main_content_div:
    # Extract all <h2> and <h3> tags
    h2_h3_sections = main_content_div.find_all(['h2', 'h3'])

    for tag in h2_h3_sections:
        if tag.name == 'h2':
            current_h2_title = tag.text.strip()
            section_title = current_h2_title
        else:
            # find_all above only returns <h2>/<h3>, so this is an <h3>.
            # Its key is suffixed with the most recent <h2> title.
            section_title = f"{tag.text.strip()} {current_h2_title}"

        # Initialize section data for the current <h2> or <h3> section
        section_data = {
            'paragraphs': [],
            'tables': [],
            'links': [],
            'equations': [],
            'ordered_lists': [],
            'unordered_lists': []
        }

        # Walk the following sibling tags until the next <h2>/<h3> (or the end
        # of the siblings), filing each one under the matching key.
        current_tag = tag.find_next_sibling()
        while current_tag is not None and current_tag.name not in ('h2', 'h3'):
            tag_name = current_tag.name
            if tag_name == 'p':
                section_data['paragraphs'].append(current_tag.text.strip())
            elif tag_name == 'table':
                # Tables are stored as raw HTML.
                section_data['tables'].append(str(current_tag))
            elif tag_name == 'a':
                section_data['links'].append({
                    'href': current_tag.get('href'),
                    'text': current_tag.text.strip()
                })
            elif tag_name == 'ol':
                # One list of <li> texts per <ol>.
                section_data['ordered_lists'].append(
                    [li.text.strip() for li in current_tag.find_all('li')]
                )
            elif tag_name == 'ul':
                # One list of <li> texts per <ul>.
                section_data['unordered_lists'].append(
                    [li.text.strip() for li in current_tag.find_all('li')]
                )
            elif tag_name in ('div', 'h1', 'h4', 'h5', 'h6'):
                # NOTE(review): the text of <div> blocks and of the remaining
                # heading levels is stored under 'equations' -- presumably
                # because display math is rendered inside <div> elements on
                # this page; confirm against the page markup.
                section_data['equations'].append(current_tag.text.strip())

            # Move to the next sibling
            current_tag = current_tag.find_next_sibling()

        # Record the section once, after the walk. This must sit outside the
        # while loop, otherwise a heading with no content before the next
        # heading would never be stored.
        scraped_data[section_title] = section_data


In [27]:
import json
from bs4 import BeautifulSoup
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from silently
# producing an empty result.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# The <div class="main-content" id="main-content"> element; None when absent.
main_content_div = soup.find('div', class_='main-content', id='main-content')

# Maps the text of each <strong> tag to the paragraphs around it. Repeated
# <strong> texts share a key; the last occurrence wins.
scraped_data_1 = {}

if main_content_div:
    # Extract all <strong> tags
    strong_tags = main_content_div.find_all('strong')

    for strong_tag in strong_tags:
        # Initialize the data for this <strong> tag. Only 'paragraphs' is
        # filled in this cell; the other keys are created empty.
        strong_text = strong_tag.text.strip()
        strong_data = {
            'paragraphs': [],
            'tables': [],
            'links': [],
            'equations': [],
            'ordered_lists': [],
            'unordered_lists': []
        }

        # Find parent <p> tag of the <strong> tag. A <strong> that is not
        # inside a <p> still gets an entry, with no paragraphs.
        parent_p_tag = strong_tag.find_parent('p')
        if parent_p_tag:
            # Stored order is: containing paragraph, then the nearest previous
            # sibling <p>, then the nearest next sibling <p> (not page order).
            strong_data['paragraphs'].append(parent_p_tag.text.strip())

            previous_sibling = parent_p_tag.find_previous_sibling('p')
            next_sibling = parent_p_tag.find_next_sibling('p')

            if previous_sibling:
                strong_data['paragraphs'].append(previous_sibling.text.strip())

            if next_sibling:
                strong_data['paragraphs'].append(next_sibling.text.strip())

        # Add the data to the main dictionary under the <strong> tag text as key
        scraped_data_1[strong_text] = strong_data


In [28]:
# Merge the <strong>-keyed entries (scraped_data_1) into the heading-keyed
# sections (scraped_data). Both map a title to a dict of lists.
for key, incoming in scraped_data_1.items():
    if key in scraped_data:
        # The values are dicts, which have no .extend(); merge field by field.
        existing = scraped_data[key]
        for field, items in incoming.items():
            target = existing.setdefault(field, [])
            for item in items:
                # Skip items already present so re-running this cell does not
                # keep duplicating entries.
                if item not in target:
                    target.append(item)
    else:
        # Copy the lists so scraped_data does not alias scraped_data_1.
        scraped_data[key] = {field: list(items) for field, items in incoming.items()}

# Serialize merged data back into JSON format
merged_json = json.dumps(scraped_data, indent=4)

# Save merged JSON data to a file
output_path = 'scraped_data.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json_file.write(merged_json)

# Report the path that was actually written.
print(f"Merged data has been saved to {output_path}.")

Merged data has been saved to merged_data.json.
