In [8]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the content from the URL
response = requests.get(url)

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

main_content_div = soup.find('div', class_='main-content', id='main-content')

html_content = ""

if main_content_div:
    # Step 3: Extract the inner HTML of the
    html_content = main_content_div.decode_contents()

# Initialize a dictionary to store scraped data
scraped_data = {
    'h2_sections': [],
    'paragraphs': [],
    'tables': [],
    'links': [],
    'equations': []
}

# Extracting  sections
h2_sections = soup.find_all('h2')
for h2 in h2_sections:
    scraped_data['h2_sections'].append(h2.text.strip())

# Extracting  sections
paragraphs = soup.find_all('p')
for p in paragraphs:
    scraped_data['paragraphs'].append(p.text.strip())

# Extracting  sections (if any)
tables = soup.find_all('table')
for table in tables:
    # Convert table to a string representation for storage (if needed)
    scraped_data['tables'].append(str(table))

# Extracting  tags
links = soup.find_all('a')
for link in links:
    scraped_data['links'].append({
        'href': link.get('href'),
        'text': link.text.strip()
    })

# Extracting LaTeX equations
latex_pattern = r'$(.*?)$|\\§'
equations = re.findall(latex_pattern, html_content)
for equation in equations:
    # 'equation' is now a tuple, so we need to select the first non-empty string
    equation_text = next(filter(None, equation)) if isinstance(equation, tuple) else equation
    scraped_data['equations'].append(equation_text.strip())

# Serialize scraped data to JSON
json_data = json.dumps(scraped_data, indent=4)

# Write JSON data to a file
with open('scraped_datav1.json', 'w') as json_file:
    json_file.write(json_data)

print("Data has been scraped and stored in scraped_data.json.")



Data has been scraped and stored in scraped_data.json.


In [9]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the content from the URL
response = requests.get(url)

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

main_content_div = soup.find('div', class_='main-content', id='main-content')

# Initialize a dictionary to store scraped data
scraped_data = {}

if main_content_div:
    # Extract all  tags
    h2_sections = main_content_div.find_all('h2')

    for h2 in h2_sections:
        section_title = h2.text.strip()
        section_data = {
            'paragraphs': [],
            'tables': [],
            'links': [],
            'equations': [],
            'ordered_lists': [],
            'unordered_lists': []
        }

        # Extract paragraphs under current
        paragraphs = h2.find_next_siblings('p')
        for p in paragraphs:
            section_data['paragraphs'].append(p.text.strip())

        # Extract tables under current
        tables = h2.find_next_siblings('table')
        for table in tables:
            section_data['tables'].append(str(table))

        # Extract links under current
        links = h2.find_next_siblings('a')
        for link in links:
            section_data['links'].append({
                'href': link.get('href'),
                'text': link.text.strip()
            })

        # Extract LaTeX equations under current section
        latex_pattern = r'$(.*?)$|\\§'
        equations = h2.find_next_siblings(string=re.compile(latex_pattern))
        for equation in equations:
            if isinstance(equation, str):
                equation_texts = re.findall(latex_pattern, equation)
                for eq in equation_texts:
                    # Ensure that eq contains a non-None value
                    equation_text = next((text for text in eq if text), None)
                    if equation_text:  # Only append if a valid equation text was found
                        section_data['equations'].append(equation_text.strip())

        # Extract ordered lists (ol) under current
        ordered_lists = h2.find_next_siblings('ol')
        for ol in ordered_lists:
            if ol.find_previous_sibling('h2') == h2:
                list_items = ol.find_all('li')
                list_data = [li.text.strip() for li in list_items]
                section_data['ordered_lists'].append(list_data)

        # Extract unordered lists (ul) under current
        unordered_lists = h2.find_next_siblings('ul')
        for ul in unordered_lists:
            if ul.find_previous_sibling('h2') == h2:
                list_items = ul.find_all('li')
                list_data = [li.text.strip() for li in list_items]
                section_data['unordered_lists'].append(list_data)

        # Add current section data to main dictionary under the current  key
        scraped_data[section_title] = section_data

# Serialize scraped data to JSON
json_data = json.dumps(scraped_data, indent=4)

# Write JSON data to a file
with open('scraped_data.json', 'w') as json_file:
    json_file.write(json_data)

print("Data has been scraped and stored in scraped_data.json.")

Data has been scraped and stored in scraped_data.json.


In [10]:
import json
from bs4 import BeautifulSoup
import re
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the content from the URL
response = requests.get(url)

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

main_content_div = soup.find('div', class_='main-content', id='main-content')

# Initialize a dictionary to store scraped data
scraped_data = {}
current_h2_title = ""

if main_content_div:
    # Extract all  and  tags
    h2_h3_sections = main_content_div.find_all(['h2', 'h3'])

    for index, tag in enumerate(h2_h3_sections):
        # Determine the current section title and type (h2 or h3)
        # section_title = tag.text.strip()
        # section_type = tag.name
        if tag.name == 'h2':
            current_h2_title = tag.text.strip()
            section_title = current_h2_title
        elif tag.name == 'h3':
            section_title = f"{tag.text.strip()} {current_h2_title}"

        # Initialize section data for the current  or  section
        section_data = {
            'paragraphs': [],
            'tables': [],
            'links': [],
            'equations': [],
            'ordered_lists': [],
            'unordered_lists': []
        }

        # Find next  or  tag, or end of siblings
        next_tag = h2_h3_sections[index + 1] if index + 1 < len(h2_h3_sections) else None

        # Extract content between current tag and next  or
        current_tag = tag.find_next_sibling()
        while current_tag and (current_tag.name != 'h2' and current_tag.name != 'h3' and current_tag != next_tag):
            if current_tag.name == 'p':
                section_data['paragraphs'].append(current_tag.text.strip())
            elif current_tag.name == 'table':
                section_data['tables'].append(str(current_tag))
            elif current_tag.name == 'a':
                section_data['links'].append({
                    'href': current_tag.get('href'),
                    'text': current_tag.text.strip()
                })
            elif current_tag.name and re.match(r'(div|ul|ol|h[1-6])', current_tag.name):
                # Check if current_tag.name is not None and matches the specified tags
                section_data['equations'].append(current_tag.text.strip())

            # Move to the next sibling
            current_tag = current_tag.find_next_sibling()
            scraped_data[section_title] = section_data

In [11]:
import json
from bs4 import BeautifulSoup
import requests

# URL of the lecture notes
url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# Fetch the content from the URL
response = requests.get(url)

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

main_content_div = soup.find('div', class_='main-content', id='main-content')

# Initialize a dictionary to store scraped data
scraped_data_1 = {}

if main_content_div:
    # Extract all <strong> tags
    strong_tags = main_content_div.find_all('strong')

    for strong_tag in strong_tags:
        # Initialize the data for this <strong> tag
        strong_text = strong_tag.text.strip()
        strong_data = {
            'paragraphs': [],
            'tables': [],
            'links': [],
            'equations': [],
            'ordered_lists': [],
            'unordered_lists': []
        }

        # Find parent <p> tag of the <strong> tag
        parent_p_tag = strong_tag.find_parent('p')
        if parent_p_tag:
            # Include the parent <p> tag's text
            strong_data['paragraphs'].append(parent_p_tag.text.strip())

            # Include previous and next sibling <p> tags
            previous_sibling = parent_p_tag.find_previous_sibling('p')
            next_sibling = parent_p_tag.find_next_sibling('p')

            if previous_sibling:
                strong_data['paragraphs'].append(previous_sibling.text.strip())

            if next_sibling:
                strong_data['paragraphs'].append(next_sibling.text.strip())

        # Add the data to the main dictionary under the <strong> tag text as key
        scraped_data_1[strong_text] = strong_data

# Initialize the main dictionary for merging (empty or preloaded)
scraped_data = {}

# Merge data from scraped_data_1 into scraped_data
for key in scraped_data_1:
    if key in scraped_data:
        for sub_key in scraped_data_1[key]:
            if sub_key in scraped_data[key]:
                # Extend lists for matching keys
                if isinstance(scraped_data[key][sub_key], list):
                    scraped_data[key][sub_key].extend(scraped_data_1[key][sub_key])
            else:
                # Add new keys directly
                scraped_data[key][sub_key] = scraped_data_1[key][sub_key]
    else:
        # Add the entire key if it doesn't exist in scraped_data
        scraped_data[key] = scraped_data_1[key]

# Serialize merged data back into JSON format
merged_json = json.dumps(scraped_data, indent=4)

# Save merged JSON data to a file
with open('scraped_data.json', 'w') as json_file:
    json_file.write(merged_json)

print("Merged data has been saved to scraped_data.json.")


Merged data has been saved to scraped_data.json.


In [12]:
!pip install faiss-gpu transformers sentence-transformers
!pip install markdown2



In [13]:
import json
from markdown2 import markdown
from IPython.display import display, Markdown

# Load your JSON data
with open('scraped_data.json', 'r') as f:
    data = json.load(f)

# # Prepare data for embedding and retrieval
keys = list(data.keys())
texts = []

for content in data.values():
    paragraphs = content.get('paragraphs', [])
    ordered_lists = sum(content.get('ordered_lists', []), [])
    unordered_lists = sum(content.get('unordered_lists', []), [])
    tables = sum(content.get('tables', []), [])
    links = sum(content.get('links', []), [])
    equations = content.get('equations', [])

    # Concatenate all text elements
    text_content = " ".join(paragraphs + ordered_lists + unordered_lists + tables + equations + links)

#     keys.append(content['title'])  # Assuming 'title' is a key in your scraped data for section titles
    texts.append(text_content)

print(f"Loaded {len(keys)} pieces of text from the JSON data.")

Loaded 79 pieces of text from the JSON data.


In [14]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings
def get_embeddings(texts):
    return model.encode(texts)

# Generate embeddings for texts and keys
text_embeddings = get_embeddings(texts)
key_embeddings = get_embeddings(keys)
print(f"Generated embeddings for {len(text_embeddings)} pieces of text and {len(key_embeddings)} keys.")

Generated embeddings for 79 pieces of text and 79 keys.


In [15]:
import faiss
import numpy as np

# Convert embeddings to numpy array
text_embeddings = np.array(text_embeddings)
key_embeddings = np.array(key_embeddings)

# Initialize FAISS index for texts and keys
text_index = faiss.IndexFlatL2(text_embeddings.shape[1])
key_index = faiss.IndexFlatL2(key_embeddings.shape[1])

text_index.add(text_embeddings)
key_index.add(key_embeddings)

# Save indices
faiss.write_index(text_index, 'text_vector_db.index')
faiss.write_index(key_index, 'key_vector_db.index')

In [16]:
# Install required libraries
!pip install faiss-cpu sentence-transformers

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from collections import defaultdict

# Sample data: keys and their corresponding texts
data = {
    "Section1": "This is text for section 1.",
    "Section2": "Details about section 2 are mentioned here.",
    "Section3": "Section 3 discusses advanced topics.",
}
keys = list(data.keys())
texts = list(data.values())

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose other models too

# Generate embeddings for the texts
print("Generating embeddings...")
embeddings = np.array(model.encode(texts, show_progress_bar=True)).astype('float32')

# Build a FAISS index
print("Building the FAISS index...")
dimension = embeddings.shape[1]  # Embedding size
key_index = faiss.IndexFlatL2(dimension)  # Use L2 (Euclidean distance)
key_index.add(embeddings)  # Add embeddings to the index

# Save the index to a file
faiss_index_path = '/content/key_vector_db.index'
faiss.write_index(key_index, faiss_index_path)
print(f"Index saved to {faiss_index_path}")

# Query processing function
def process_query(query, key_index, top_k=2):
    # Generate query embedding
    query_embedding = np.array(model.encode([query])).astype('float32')

    # Search for similar keys
    distances, indices = key_index.search(query_embedding, top_k)

    # Retrieve the most relevant keys and their content
    result_keys = []
    combined_result_content = defaultdict(list)

    for idx in indices[0]:
        if idx == -1:  # Handle cases with no results
            continue
        result_key = keys[idx]
        result_keys.append(result_key)
        result_content = data[result_key]

        # Combine content
        combined_result_content[result_key].append(result_content)

    # Convert defaultdict to a regular dictionary
    combined_result_content = dict(combined_result_content)
    combined_result_keys = ', '.join(result_keys)

    return combined_result_keys, combined_result_content

# Example usage: Query the index
query = "advanced topics"
print("\nProcessing query...")
result_keys, result_content = process_query(query, key_index)
print(f"Relevant sections: {result_keys}")
print("\nContent:", result_content)

# Reload index from file
print("\nTesting index reload...")
reloaded_index = faiss.read_index(faiss_index_path)
result_keys, result_content = process_query(query, reloaded_index)
print(f"Relevant sections (after reload): {result_keys}")
print("\nContent:", result_content)


Generating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Building the FAISS index...
Index saved to /content/key_vector_db.index

Processing query...
Relevant sections: Section3, Section1

Content: {'Section3': ['Section 3 discusses advanced topics.'], 'Section1': ['This is text for section 1.']}

Testing index reload...
Relevant sections (after reload): Section3, Section1

Content: {'Section3': ['Section 3 discusses advanced topics.'], 'Section1': ['This is text for section 1.']}


In [17]:
from transformers import pipeline

# Load a text generation pipeline (e.g., GPT-2)
generator = pipeline('text-generation', model='gpt2')

def generate_structured_response(query, result_keys, result_content):
    """
    Generate a structured response based on the query, relevant keys, and content.

    Args:
        query (str): The user's query.
        result_keys (str): Comma-separated string of the relevant keys.
        result_content (dict): Content associated with the relevant keys.

    Returns:
        str: Generated text response.
    """
    # Create a structured prompt
    prompt = f"**Question:** {query}\n\n"
    prompt += f"**Relevant Sections:** {result_keys}\n\n"

    # Add content to the prompt
    for section, content_list in result_content.items():
        prompt += f"**Section {section}:**\n"
        for item in content_list:
            prompt += f"- {item}\n"
        prompt += "\n"

    # Add a closing statement
    prompt += "Answer is:"

    # Define max_length to ensure prompt is not excessively long
    max_length = min(len(prompt) + 100, 750)

    # Generate response using GPT-2 pipeline
    response = generator(prompt[:750], max_length=max_length, num_return_sequences=1, truncation=True, pad_token_id=50256)
    generated_text = response[0]['generated_text']

    return generated_text

# Example usage
query = "advanced topics"
result_keys = "Section3"
result_content = {"Section3": ["Section 3 discusses advanced topics.", "More details on section 3."]}

response = generate_structured_response(query, result_keys, result_content)
print(response)


**Question:** advanced topics

**Relevant Sections:** Section3

**Section Section3:**
- Section 3 discusses advanced topics.
- More details on section 3.

Answer is:** You're missing information here.

If you had any suggestions, feel free to post them!


In [42]:
def display_good(generated_text):
    # Convert Markdown with LaTeX support to HTML
    html_content = markdown(generated_text, extras=["fenced-code-blocks", "cuddled-lists", "footnotes", "tables", "toc", "smarty-pants", "mathjax","latex"])

    # Display the rendered HTML as Markdown
    display(Markdown(html_content))

# Generate structured response
query = "What is Conditional generation in language model?"
result_key, result_content = process_query(query, key_index)

# Generate structured response
generated_text = generate_structured_response(query, result_key, result_content)

print(generated_text)

**Question:** What is Conditional generation in language model?

**Relevant Sections:** Section1, Section2

**Section Section1:**
- This is text for section 1.

**Section Section2:**
- Details about section 2 are mentioned here.

Answer is: **The key concept of Conditional generation is the use of an abstraction that is a collection of elements (as opposed to a list of elements) that contain only the elements that are of the same type as their other elements: - the list and the empty element (for example "this").

- If you want to call this type of expression, you will have to define a function that calls the method "callback". - Conditional Generation is not an abstraction.


CODE:

- This is the same as in the definition of "concurrent generation".

- Let us assume as you know, such a new version of Conditional Generation can be obtained. - What does the following type of expression mean? - Let us give the following description of the type: - C is this kind of expression? The variabl