In [32]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://stanford-cs324.github.io/winter2022/lectures/introduction"
# Download the raw page bytes. The context manager closes the HTTP response
# as soon as the body is read, instead of leaving the socket to the garbage
# collector.
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")

# Remove <script> and <style> elements so their contents do not end up in
# the extracted text.
for script in soup(["script", "style"]):
    script.extract()

# Flatten what is left of the document into one string.
text = soup.get_text()

# Strip leading and trailing whitespace from every line.
lines = (line.strip() for line in text.splitlines())
# Split each line on double spaces so that several headlines sharing one
# line become separate phrases.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Keep only the non-empty phrases, one per output line.
text = '\n'.join(chunk for chunk in chunks if chunk)

print(text)

Introduction | CS324
Link
Search
Menu
Expand
Document
CS324
HomeCalendarLecturesIntroductionCapabilitiesHarms IHarms IIDataSecurityLegalityModelingTrainingParallelismScaling lawsSelective architecturesAdaptationEnvironmental impactPaper reviewsPaper discussionsProjects This site uses Just the Docs, a documentation theme for Jekyll.
LecturesIntroduction \[\newcommand{\sV}{\mathcal{V}} \newcommand{\nl}[1]{\textsf{#1}} \newcommand{\generate}[1]{\stackrel{#1}{\rightsquigarrow}}\]Welcome to CS324! This is a new course on understanding and developing large language models.What is a language model?A brief historyWhy does this course exist?Structure of this course
What is a language model?The classic definition of a language model (LM) is a probability distribution over sequences of tokens. Suppose we have a vocabulary \(\sV\) of a set of tokens. A language model \(p\) assigns each sequence of tokens \(x_1, \dots, x_L \in \sV\) a probability (a number between 0 and 1):\[p(x_1, \dots, x_L).\]Th

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def fetch_lecture(url, timeout=30):
    """Download a lecture page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on an unresponsive
            host, which would stall the whole notebook run.

    Returns:
        The response body as a ``str`` (``response.text``).

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as a lecture.
    response.raise_for_status()
    return response.text

def preprocess_lecture(html_content):
    """Parse a lecture page and collect its text, links, tables and images.

    Args:
        html_content: HTML markup of the page, as a string.

    Returns:
        A dict with four keys:
            'text':   all text of the page, fragments stripped and joined
                      with single spaces.
            'links':  the ``href`` value of every ``<a>`` that has one.
            'tables': one ``[titles, rows_data]`` pair per ``<table>``, where
                      ``titles`` holds the text of each ``<th>`` and
                      ``rows_data`` holds the text of each ``<tr>`` (rows
                      containing header cells are included).
            'images': the ``src`` value of every ``<img>`` that has one.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract text content
    text_content = soup.get_text(separator=' ', strip=True)

    # Extract links
    links = [a['href'] for a in soup.find_all('a', href=True)]

    # Extract tables
    tables = []
    for table in soup.find_all('table'):
        # Store the header *text*, not the <th> Tag objects, so the result
        # contains only plain strings and does not keep the parse tree alive.
        titles = [header.text for header in table.find_all("th")]
        rows_data = [row.text for row in table.find_all("tr")]
        tables.append([titles, rows_data])

    # Extract images
    images = [img['src'] for img in soup.find_all('img', src=True)]

    return {
        'text': text_content,
        'links': links,
        'tables': tables,
        'images': images
    }

# Pages to scrape; extend this list to cover further lectures.
lecture_urls = [
    "https://stanford-cs324.github.io/winter2022/lectures/capabilities/",
    # Add more lecture URLs as needed
]

# Fetch and parse the pages one at a time, in list order; the lazy inner map
# hands each downloaded page straight to the parser.
lectures_data = list(map(preprocess_lecture, map(fetch_lecture, lecture_urls)))


In [2]:
# Bare last expression: the notebook renders the list of per-lecture dicts
# built by preprocess_lecture ('text', 'links', 'tables', 'images').
# NOTE(review): each 'text' entry holds a whole page, so this output is very
# long — consider previewing a slice instead.
lectures_data

[{'text': 'Capabilities | CS324 Link Search Menu Expand Document CS324 Home Calendar Lectures Introduction Capabilities Harms I Harms II Data Security Legality Modeling Training Parallelism Scaling laws Selective architectures Adaptation Environmental impact Paper reviews Paper discussions Projects This site uses Just the Docs , a documentation theme for Jekyll. Lectures Capabilities \\[\\newcommand{\\nl}[1]{\\textsf{#1}} \\newcommand{\\generate}[1]{\\stackrel{#1}{\\rightsquigarrow}} \\newcommand{\\perplexity}{\\text{perplexity}}\\] In this lecture, we will explore the capabilities of GPT-3, the canonical large language model. We will closely follow the benchmarks from the GPT-3 paper , which include: standard NLP benchmarks (e.g., question answering), as well as quirky one-off demos (e.g., using a new word in a sentence). In comparison with the state-of-the-art-result for each task, the results are mixed : On some tasks such as language modeling, GPT-3 exceeds the state-of-the-art by 

In [9]:
import requests
from bs4 import BeautifulSoup
import re

def fetch_lecture(url, timeout=30):
    """Download a lecture page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on an unresponsive
            host, which would stall the whole notebook run.

    Returns:
        The response body as a ``str`` (``response.text``).

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as a lecture.
    response.raise_for_status()
    return response.text

def preprocess_lecture(html_content):
    """Break a lecture page down into text, links, tables and images.

    Args:
        html_content: HTML markup of the page, as a string.

    Returns:
        A dict with four keys:
            'text':   all text of the page, fragments stripped and joined
                      with single spaces.
            'links':  the ``href`` value of every ``<a>`` that has one.
            'tables': one entry per ``<table>``; each entry is a list of
                      rows, and each row is a list of the stripped text of
                      its ``<td>``/``<th>`` cells.
            'images': the ``src`` value of every ``<img>`` that has one.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # table -> rows -> cell strings, built in document order.
    parsed_tables = [
        [
            [cell.get_text(strip=True) for cell in table_row.find_all(['td', 'th'])]
            for table_row in table_tag.find_all('tr')
        ]
        for table_tag in document.find_all('table')
    ]

    return {
        'text': document.get_text(separator=' ', strip=True),
        # href=True / src=True skip tags that lack the attribute, so the
        # subscript lookups below cannot fail.
        'links': [anchor['href'] for anchor in document.find_all('a', href=True)],
        'tables': parsed_tables,
        'images': [image['src'] for image in document.find_all('img', src=True)],
    }

# Pages to scrape; extend this list to cover further lectures.
lecture_urls = [
    "https://stanford-cs324.github.io/winter2022/lectures/introduction/",
    "https://stanford-cs324.github.io/winter2022/lectures/capabilities/",
    # Add more lecture URLs as needed
]

# Fetch and parse the pages one at a time, in list order; the lazy inner map
# hands each downloaded page straight to the parser.
lectures_data = list(map(preprocess_lecture, map(fetch_lecture, lecture_urls)))


In [10]:
# Bare last expression: the notebook renders the list of per-lecture dicts
# built by preprocess_lecture ('text', 'links', 'tables', 'images').
# NOTE(review): each 'text' entry holds a whole page, so this output is very
# long — consider previewing a slice instead.
lectures_data

[{'text': 'Introduction | CS324 Link Search Menu Expand Document CS324 Home Calendar Lectures Introduction Capabilities Harms I Harms II Data Security Legality Modeling Training Parallelism Scaling laws Selective architectures Adaptation Environmental impact Paper reviews Paper discussions Projects This site uses Just the Docs , a documentation theme for Jekyll. Lectures Introduction \\[\\newcommand{\\sV}{\\mathcal{V}} \\newcommand{\\nl}[1]{\\textsf{#1}} \\newcommand{\\generate}[1]{\\stackrel{#1}{\\rightsquigarrow}}\\] Welcome to CS324! This is a new course on understanding and developing large language models . What is a language model? A brief history Why does this course exist? Structure of this course What is a language model? The classic definition of a language model (LM) is a probability distribution over sequences of tokens . Suppose we have a vocabulary \\(\\sV\\) of a set of tokens. A language model \\(p\\) assigns each sequence of tokens \\(x_1, \\dots, x_L \\in \\sV\\) a pr