In [1]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions

In [2]:
# Pull OPENAI_API_KEY (and any other settings) from a local .env file into
# the process environment before reading them.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Location of the chromedriver binary. Set CHROMEDRIVER_PATH (in the shell or
# in .env) to override it; the fallback is the original Windows install path.
CHROMEDRIVER_PATH = os.getenv(
    'CHROMEDRIVER_PATH',
    "C:\\Program Files\\chromedriver-win64\\chromedriver.exe",
)
service = webdriver.ChromeService(executable_path=CHROMEDRIVER_PATH)

# Headless mode: no browser window pops up while scraping.
chrome_options = ChromeOptions()
chrome_options.add_argument("--headless")

### 1 - define website object
- stores the essential content of a page: its title and visible text
- uses Selenium with the Chrome webdriver instead of a plain GET request, so pages rendered dynamically by JavaScript can be parsed
    - open `chrome://version/` to find your installed Chrome version
    - download the matching driver version from `https://googlechromelabs.github.io/chrome-for-testing/`
    - `chrome_options` runs Chrome in headless mode

In [3]:
class Website:
    """A web page rendered in Chrome and reduced to its title and visible text.

    Attributes:
        url: The address that was fetched.
        title: Text of the page's <title>, or "No Title Found" when the page
            has no usable title.
        text: Newline-separated text of <body> with script, style, img and
            input elements removed; an empty string when there is no <body>.
    """

    def __init__(self, url):
        """Fetch `url` through Selenium and extract its title and body text.

        Args:
            url: Address of the page to load.
        """
        self.url = url
        soup = BeautifulSoup(self._scrape(), 'html.parser')

        # `.string` is None when <title> is empty or contains nested tags, so
        # fall back to the placeholder in that case as well.
        title = soup.title.string if soup.title else None
        self.title = str(title) if title else "No Title Found"

        body = soup.body
        if body is None:
            # Nothing to extract (e.g. a non-HTML response); avoid the
            # AttributeError that calling into a missing body would raise.
            self.text = ""
            return
        for irrelevant in body(['script', 'style', 'img', 'input']):
            irrelevant.decompose()
        self.text = body.get_text(separator='\n', strip=True)

    def _scrape(self) -> str:
        """Load `self.url` in Chrome and return the rendered page source.

        Uses the notebook-level `service` and `chrome_options` objects. The
        browser session is always shut down, even when loading the page fails.
        """
        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            # No implicit wait is set: it only affects element lookups, and
            # none are performed before reading page_source.
            driver.get(self.url)
            return driver.page_source
        finally:
            # quit() ends the whole session; close() only closes the window.
            driver.quit()

In [4]:
# Scrape a sample page and show the two fields that will feed the prompt.
ed = Website('https://sites.usc.edu/eessc/')
for header, body in (('===TITLE===\n', ed.title), ('===TEXT===\n', ed.text)):
    print(header, body)

===TITLE===
 Energy Efficient Secure Sustainable Computing Group
===TEXT===
 Log In
Search
Skip to content
Show search field
Search
Energy Efficient Secure Sustainable Computing Group
open menu
Home
Research Areas
open dropdown menu
Machine Learning Algorithm & Hardware Co-design
Hardware Security
Superconducting Electronics
Asynchronous VLSI
Interdisciplinary Research
People
Prospective Students
Recent Publications
Welcome to the Lab!
The E
2
S
2
C group, led by Professor Peter A. Beerel, has active research efforts spanning circuits, micro-architecture, and algorithms that target a variety of emerging areas in energy-efficient, secure, and sustainable computing. The group is guided by academic curiosity, integrity, and the spirit of collaboration to solve real-world problems using the wide array of mathematics that make up the foundation of Electrical and Computer Engineering.
The group’s current research projects include topics in machine-learning algorithm hardware co-design, super

### 2 - Calling Summary API
- message format: `[{"role": "system", "content": "system_message"}, {"role": "user", "content": "user_message"}]`

In [5]:
# System message: tells the model what kind of summary to produce.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

def create_user_prompt(website: Website):
    """Build the user message asking for a summary of `website`.

    The message names the page title, states the summarization request, and
    embeds the page text between <website> and </website> markers.
    """
    intro = f"You are looking at a website titled {website.title}"
    request = (
        "\nFor the following website content, "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n"
        "Here is the website: \n<website>"
    )
    return intro + request + website.text + "\n</website>"

In [6]:
# Preview the full user prompt generated for the sample page held in `ed`.
print(create_user_prompt(ed))

You are looking at a website titled Energy Efficient Secure Sustainable Computing Group
For the following website content, please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.
Here is the website: 
<website>Log In
Search
Skip to content
Show search field
Search
Energy Efficient Secure Sustainable Computing Group
open menu
Home
Research Areas
open dropdown menu
Machine Learning Algorithm & Hardware Co-design
Hardware Security
Superconducting Electronics
Asynchronous VLSI
Interdisciplinary Research
People
Prospective Students
Recent Publications
Welcome to the Lab!
The E
2
S
2
C group, led by Professor Peter A. Beerel, has active research efforts spanning circuits, micro-architecture, and algorithms that target a variety of emerging areas in energy-efficient, secure, and sustainable computing. The group is guided by academic curiosity, integrity, and the spirit of collaboration to solve real-world problems using the w

In [7]:
def summarize(url, system_prompt = system_prompt, model = "gpt-4o-mini"):
    """Scrape `url` and return the model's summary of the page.

    Args:
        url: Address of the page to summarize.
        system_prompt: System message sent to the model. Defaults to the
            notebook-level `system_prompt` as it was when this cell ran.
        model: Name of the chat-completions model to call.

    Returns:
        The message content of the first choice in the API response.
    """
    website = Website(url)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": create_user_prompt(website)},
    ]
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

In [8]:
# The system prompt asks the model to answer in markdown, so render the reply
# as rich output instead of printing the raw markup.
response_msg = summarize('https://cnn.com')
display(Markdown(response_msg))

# Summary of CNN Website Content

CNN provides a comprehensive news platform covering a variety of topics including:

- **US and World News**: Updates on current events such as immigration raids in Southern California and discussions on political figures like Trump, who is facing scrutiny for his recent flood response.
- **Political Insights**: Analysis of ongoing political tensions, including controversies surrounding Forex and tariffs, notably Trump’s threats of tariffs on Canada and Brazil.
- **Health**: Articles discussing recent health trends such as mouth taping and legislative movements around menopause care, highlighting various ongoing health issues and recommendations.
- **Entertainment and Culture**: Coverage on upcoming events, celebrity news, and cultural phenomena, including highlights from major sporting events and films, with mentions of celebrity releases and cultural commentary.
- **Market News**: Updates on stock market performances, particularly in relation to tech 