In [1]:
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI

# 1. Read OPENAI_API_KEY from the local .env file and build the shared client
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
# Hand the key over explicitly; the client would otherwise look up the same
# environment variable on its own.
openai = OpenAI(api_key=api_key)

# 2. Scrape and clean text from a website
def scrape_website(url, timeout=10):
    """Download a web page and return its title and visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get`` so a stalled host cannot hang the
            call forever.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the content of the page's
        ``<title>`` tag, or ``"Untitled Page"`` when the tag is missing or
        blank. ``text`` is the text of ``<body>`` (or of the whole document
        when there is no ``<body>``) with ``script``, ``style``, ``img`` and
        ``input`` tags removed, one text fragment per line.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Send a browser-like User-Agent instead of the requests default.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors rather than summarizing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Remove unwanted tags
    for tag in soup(["script", "style", "img", "input"]):
        tag.decompose()

    # ``.string`` is None when <title> is empty or contains nested markup.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    if title is None or not title.strip():
        title = "Untitled Page"

    # Fragments and non-HTML responses may have no <body>; use the whole tree.
    root = soup.body if soup.body is not None else soup
    text = root.get_text(separator="\n", strip=True)

    return title, text

# 3. Create system and user prompts
def summarize_text(title, full_text, *, model="gpt-4o", max_chars=3000):
    """Ask the chat model for a bullet-point summary of a page's text.

    Args:
        title: Page title, quoted in the user prompt.
        full_text: Page text to summarize.
        model: Name of the chat model to call.
        max_chars: Number of leading characters of ``full_text`` included in
            the prompt. ``None`` sends the text untruncated.

    Returns:
        The summary text from the first completion choice, or an empty
        string when the response carries no text content.
    """
    system_msg = "You are a helpful assistant that summarizes long website content in clean bullet points."
    # Only the first ``max_chars`` characters are sent to keep the prompt small.
    user_msg = f"The website is titled '{title}'. Please summarize the content:\n\n{full_text[:max_chars]}"

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
    )
    # ``content`` can be None (e.g. a refusal); always hand callers a string.
    return response.choices[0].message.content or ""

# 4. Fetch the page, summarize it, and show the result
url = "https://en.wikipedia.org/wiki/Web_scraping"  # Swap in any page URL here
title, text = scrape_website(url)
summary = summarize_text(title, text)

# Banner line, then the summary on the following line
print("\n🔹 Summary of the Page:\n", summary, sep="\n")


🔹 Summary of the Page:

- **Definition**:
  - Web scraping, also known as web harvesting or web data extraction, is a technique used to extract data from websites.
  - It involves fetching web pages and extracting useful data, typically copying it to a local database or spreadsheet for analysis.

- **Techniques**:
  - **Human copy-and-paste**: Manual extraction by a user.
  - **Text pattern matching**: Using string matching to locate data.
  - **HTTP programming**: Interacting with websites through HTTP requests.
  - **HTML parsing**: Analyzing the HTML structure to extract data.
  - **DOM parsing**: Manipulating the document object model to get information.
  - **Vertical aggregation**: Aggregating data within a specific domain.
  - **Semantic annotation recognizing**: Leveraging metadata for data scraping.
  - **Computer vision web-page analysis**: Using computer vision to interpret page layouts.
  - **AI-powered document understanding**: Employing AI to comprehend and extract infor