In [5]:
import requests
from bs4 import BeautifulSoup
import re
import json
from google.colab import files

# URL of the forum thread to scrape
url = 'https://lucidowners.com/threads/updated-owners-manual.5283/'

# Send an HTTP GET request. requests has no default timeout, so without one a
# stalled connection would block this cell forever.
response = requests.get(url, timeout=30)

# Only parse and save when the server answered with HTTP 200
if response.status_code == 200:
    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract text from every div with class 'bbWrapper'
    content = []
    for section in soup.find_all('div', class_='bbWrapper'):
        # separator=' ' keeps text from adjacent tags apart; with the default
        # empty separator, words on either side of a tag boundary are fused
        # together and the whitespace cleanup below cannot split them again.
        text = section.get_text(separator=' ', strip=True)
        if text:
            # Collapse any run of whitespace (newlines, tabs, repeated spaces)
            # into a single space
            text = re.sub(r'\s+', ' ', text)
            content.append({"text": text})

    # Save content to a JSON file: a list of {"text": ...} objects
    json_filename = 'lucid_owners_manual.json'
    with open(json_filename, 'w', encoding='utf-8') as file:
        json.dump(content, file, ensure_ascii=False, indent=4)

    print("Content saved in JSON format for chunking.")

    # Trigger a browser download of the JSON file (Colab-only helper)
    files.download(json_filename)
else:
    print("Failed to retrieve the page.")


Content saved in JSON format for chunking.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import requests
from bs4 import BeautifulSoup
import json
from google.colab import files
from transformers import pipeline

# Function to perform semantic chunking
_SUMMARIZER_CACHE = {}


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    if "summarizer" not in _SUMMARIZER_CACHE:
        # Building the pipeline loads the model, so do it once and reuse it
        # across calls instead of once per call.
        _SUMMARIZER_CACHE["summarizer"] = pipeline("summarization")
    return _SUMMARIZER_CACHE["summarizer"]


def semantic_chunk(text, chunk_size=50):
    """Split ``text`` into groups of sentences and summarize each group.

    Sentences are delimited by the literal string ``'. '``. Consecutive
    sentences are joined back together ``chunk_size`` at a time, and each
    group is passed through the summarization pipeline.

    Args:
        text: The text to chunk. Empty or whitespace-only input yields ``[]``
            without loading or calling the model.
        chunk_size: Number of sentences per group (default 50).

    Returns:
        A list of summary strings, one per sentence group, in order.
    """
    # Nothing to summarize: avoid sending an empty string to the model.
    if not text or not text.strip():
        return []

    summarizer = _get_summarizer()

    # Split the text into chunks and summarize
    chunks = []
    sentences = text.split('. ')
    for i in range(0, len(sentences), chunk_size):
        chunk = '. '.join(sentences[i:i + chunk_size])
        # truncation=True clips a group that exceeds the model's maximum input
        # length instead of failing further down inside the model.
        summarized_chunk = summarizer(
            chunk,
            max_length=50,
            min_length=25,
            do_sample=False,
            truncation=True,
        )
        chunks.append(summarized_chunk[0]['summary_text'])

    return chunks

# URL of the forum thread to scrape
url = 'https://lucidowners.com/threads/updated-owners-manual.5283/'

# Send an HTTP GET request. requests has no default timeout, so without one a
# stalled connection would block this cell forever.
response = requests.get(url, timeout=30)

# Only parse and save when the server answered with HTTP 200
if response.status_code == 200:
    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract messages and their details
    messages = []
    # A CSS selector matches each class independently. Passing the two-word
    # string to class_= only matches when the class attribute is exactly that
    # string, so posts carrying any additional class would be skipped.
    for message in soup.select('article.message.message--post'):
        message_details = {}

        # Get the message content
        content = message.find('div', class_='bbWrapper')
        if content:
            # separator=' ' keeps text from adjacent tags apart, so a sentence
            # ending at a tag boundary is still followed by a space and the
            # '. ' split in semantic_chunk can find it.
            full_content = content.get_text(separator=' ', strip=True)
            # Perform semantic chunking
            message_details['chunks'] = semantic_chunk(full_content)

        # Get author details; .get() yields None rather than raising KeyError
        # when a post has no data-author attribute.
        author = message.get('data-author')
        message_details['author'] = author

        # Get post timestamp, only if the <time> tag carries a datetime value
        timestamp = message.find('time', class_='u-dt')
        if timestamp and timestamp.has_attr('datetime'):
            message_details['timestamp'] = timestamp['datetime']

        messages.append(message_details)

    # Save messages to a JSON file: a list of per-post objects
    json_filename = 'lucid_owners_messages.json'
    with open(json_filename, 'w', encoding='utf-8') as file:
        json.dump(messages, file, ensure_ascii=False, indent=4)

    print("Messages saved in JSON format.")

    # Trigger a browser download of the JSON file (Colab-only helper)
    files.download(json_filename)
else:
    print("Failed to retrieve the page.")


Messages saved in JSON format.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>