In [49]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions

import ollama

from pydantic import BaseModel
from typing import List, Tuple, Dict

In [12]:
# Load variables from a local .env file, then build the OpenAI client from OPENAI_API_KEY.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key = api_key)

# NOTE(review): hard-coded, Windows-specific chromedriver location — adjust for the local machine.
service = webdriver.ChromeService(executable_path = "C:\\Program Files\\chromedriver-win64\\chromedriver.exe")
# Run Chrome headless so no browser window pops up while scraping.
chrome_options = ChromeOptions()
chrome_options.add_argument("--headless")

# Model name passed to the OpenAI calls below.
MODEL = 'gpt-4o-mini'

### 1 - Define the Website object
- stores the essential page content (title, text, links)
- uses Selenium to load the page and also scrapes its links

In [3]:
class Website:
    """A scraped web page: its title, visible text and absolute links.

    The page is loaded with Selenium Chrome (using the module-level `service`
    and `chrome_options`) and the resulting HTML is parsed with BeautifulSoup.

    Attributes set by ``__init__``:
        url:   the URL that was requested.
        title: text of the <title> tag, or "No Title Found" when it is missing or empty.
        text:  text of <body> with script/style/img/input elements removed
               ("" when the document has no <body>).
        links: href values of all <a> tags that start with "http".
    """

    def __init__(self, url):
        """Scrape `url` and populate `title`, `text` and `links`."""
        self.url = url
        soup = BeautifulSoup(self._scrape(), 'html.parser')

        # `.string` is None when <title> is absent or contains nested tags.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        self.title = title_text if title_text else "No Title Found"

        # Some responses (error pages, non-HTML payloads) have no <body> at all.
        body = soup.body
        if body is None:
            self.text = ""
        else:
            for irrelevant in body(['script', 'style', 'img', 'input']):
                irrelevant.decompose()
            self.text = body.get_text(separator='\n', strip=True)

        # Keep only absolute http(s) links; relative links and anchors are dropped.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href and href.startswith('http')]

    def _scrape(self) -> str:
        """Load `self.url` in a fresh Chrome session and return the page source."""
        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            driver.get(self.url)
            return driver.page_source
        finally:
            # quit() ends the whole WebDriver session (close() only closes the
            # current window) and runs even when loading the page fails.
            driver.quit()

    def get_content(self):
        """Return the page title and text formatted as a prompt section."""
        return f"Webpage Title: \n{self.title}\nWebpage Contents: \n{self.text}\n\n"

In [14]:
# Quick check: scrape one page and print the absolute links that were collected.
ed = Website('https://www.pinecone.io/')
print(ed.links)

['https://docs.pinecone.io', 'https://app.pinecone.io/?sessionType=login', 'https://app.pinecone.io/?sessionType=signup', 'https://app.pinecone.io/', 'https://docs.pinecone.io/docs/get-started/overview', 'https://docs.pinecone.io/models/overview', 'https://docs.pinecone.io/reference/architecture/serverless-architecture', 'https://docs.pinecone.io/guides/data/query-data#filter-by-metadata', 'https://docs.pinecone.io/reference/architecture/serverless-architecture', 'https://docs.pinecone.io/guides/indexes/understanding-indexes#sparse-indexes', 'https://docs.pinecone.io/guides/inference/rerank', 'https://docs.pinecone.io/guides/inference/rerank', 'https://docs.pinecone.io/guides/indexes/implement-multitenancy', 'https://docs.pinecone.io/guides/indexes/implement-multitenancy', 'https://docs.pinecone.io/integrations/overview', 'https://app.pinecone.io/', 'https://x.com/pinecone', 'https://www.linkedin.com/company/pinecone-io', 'https://www.youtube.com/@pinecone-io', 'https://github.com/pine

### 2 - Use gpt-4o-mini to identify relevant links

In [None]:
# One link selected for the brochure; the item type of `Links.links`.
class Link(BaseModel):
    link_type: str  # short label for the page, e.g. "about page" as in the system prompt examples
    url: str  # address of the page; the prompts ask the model for the full https URL

# Top-level structured-output schema: passed as `response_format` in `get_links`.
class Links(BaseModel):
    links: List[Link]  # the selected links

In [63]:
# System prompt for the link-selection call: describes the task and shows two example JSON replies.
link_system_prompt = """You are provided with a list of links found on a webpage. 
You are able to decide which of the links would be most relevant to include in a brochure about the company, 
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON like in the following examples:
<example1>
{
    "links": [
        {"link_type": "about page", "url": "https://full.url/goes/here/about"},
        {"link_type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
</example1>

<example2>
{
    "links": [
        {"link_type": "community", "url": "https://full.url/community"},
        {"link_type": "background", "url": "https://another.full.url/background"},
        {"link_type": "team", "url": "https://another.full.url/members/team"}
    ]
}
</example2>
"""

In [28]:
def link_user_prompt(website: Website):
    """Build the user message that asks the model to pick brochure-worthy links.

    Args:
        website: object exposing `url` (str) and `links` (list of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
    )
    return instructions + "\n".join(website.links)

def get_links(website: Website):
    """Ask the model which of the page's links belong in a company brochure.

    Args:
        website: a scraped `Website`; its `url` and `links` go into the prompt.

    Returns:
        A `Links` instance parsed from the structured response. When the model
        returns no parsed payload (e.g. a refusal), an empty `Links` is
        returned so callers can still iterate over `.links`.
    """
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            # The list of links is the request itself, so it is sent as the user turn.
            {"role": "user", "content": link_user_prompt(website)},
        ],
        response_format=Links,
    )

    parsed = completion.choices[0].message.parsed
    return parsed if parsed is not None else Links(links=[])

In [29]:
# Try the link selection on a real site and print each chosen URL.
web = Website('https://anthropic.com')
res = get_links(web)
print(res)
for link in res.links:
    print(link.url)

links=[Link(link_type='company page', url='https://www.anthropic.com/company'), Link(link_type='about page', url='https://www.anthropic.com/team'), Link(link_type='careers page', url='https://www.anthropic.com/careers'), Link(link_type='news page', url='https://www.anthropic.com/news'), Link(link_type='contact sales page', url='https://www.anthropic.com/contact-sales'), Link(link_type='events page', url='https://www.anthropic.com/events'), Link(link_type='learn page', url='https://www.anthropic.com/learn')]
https://www.anthropic.com/company
https://www.anthropic.com/team
https://www.anthropic.com/careers
https://www.anthropic.com/news
https://www.anthropic.com/contact-sales
https://www.anthropic.com/events
https://www.anthropic.com/learn


### 3 - Gather all page details and generate the brochure

In [35]:
def get_web_details(url):
    """Collect the landing page plus every model-selected sub-page as one text.

    Each page is wrapped in a matching pair of XML-like tags so the model can
    tell the sections apart. The tag name for a sub-page is its `link_type`
    with whitespace collapsed to underscores (e.g. "about page" -> "about_page").

    Args:
        url: address of the company's landing page.

    Returns:
        A single string with the landing page section followed by one section
        per selected link.
    """
    website = Website(url)
    sections = ["Landing page: \n<landing_page>\n", website.get_content(), '\n</landing_page>']

    links = get_links(website)
    print("Found links:", links.links)
    for link in links.links:
        # Same tag for opening and closing; no spaces inside the tag name.
        tag = "_".join(link.link_type.split()) or "page"
        sections.append(f"\n\n{link.link_type}: \n<{tag}>\n")
        sections.append(Website(link.url).get_content())
        sections.append(f"\n</{tag}>\n")
    return "".join(sections)

In [36]:
# Build the combined page text for one company and preview its first characters.
details = get_web_details('https://anthropic.com')
print(details[:100])

Found links: [Link(link_type='about page', url='https://www.anthropic.com/company'), Link(link_type='careers page', url='https://www.anthropic.com/careers'), Link(link_type='team page', url='https://www.anthropic.com/team')]
Landing page: 
<landing_page>
Webpage Title: 
Home \ Anthropic
Webpage Contents: 
Skip to main conte


In [46]:
# System prompt for the brochure-writing call. Built from adjacent literals so
# every sentence boundary keeps its space.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

def get_brochure_user_prompt(company_name, url, max_chars=20_000):
    """Build the user message for the brochure-writing call.

    Args:
        company_name: name of the company, inserted into the prompt.
        url: landing page address passed to `get_web_details`.
        max_chars: maximum length of the returned prompt in characters
            (default 20,000); pass None to disable truncation.

    Returns:
        The instruction text followed by the scraped page contents, cut to
        at most `max_chars` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n\n"
    user_prompt += get_web_details(url)
    # Keep the prompt bounded so long sites do not blow up the request size.
    return user_prompt[:max_chars]

# get_brochure_user_prompt('Anthropic', 'https:/anthropic.com')

In [70]:
def _strip_wrappers(text, wrappers=("```markdown", "```")):
    """Remove each literal in `wrappers` from `text` and trim surrounding whitespace."""
    for wrapper in wrappers:
        text = text.replace(wrapper, "")
    return text.strip()


def create_brochure(company_name, url):
    """Generate a Markdown brochure for a company and its Spanish translation.

    Both texts are displayed in the notebook, separated by a horizontal rule.

    Args:
        company_name: name of the company, inserted into the prompt.
        url: landing page address of the company.

    Returns:
        A tuple `(brochure, brochure_spanish)` of Markdown strings.
    """
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
    )
    # Only code-fence markers are removed; the word "markdown" inside the
    # brochure text itself is left alone.
    result = _strip_wrappers(response.choices[0].message.content or "")

    response2 = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful translator. Translate the text inside the <text> tags to Spanish. Keep the Markdown format and reply with the translation only, without the <text> tags."},
            {"role": "user", "content": f"<text>\n{result}\n</text>"},
        ],
    )
    # The model sometimes echoes the <text> wrapper; drop it if it does.
    result_spanish = _strip_wrappers(
        response2.choices[0].message.content or "",
        wrappers=("```markdown", "```", "<text>", "</text>"),
    )
    display(Markdown(result + '\n\n---\n\n' + result_spanish))
    return result, result_spanish

In [72]:
# Generate and display the English and Spanish brochures for one company.
result_anthropic, result_anthropic_spanish = create_brochure('Anthropic', 'https://anthropic.com')

Found links: [Link(link_type='company', url='https://www.anthropic.com/company'), Link(link_type='careers page', url='https://www.anthropic.com/careers'), Link(link_type='team', url='https://www.anthropic.com/team'), Link(link_type='news', url='https://www.anthropic.com/news'), Link(link_type='learn', url='https://www.anthropic.com/learn'), Link(link_type='customers', url='https://www.anthropic.com/customers'), Link(link_type='research', url='https://www.anthropic.com/research'), Link(link_type='events', url='https://www.anthropic.com/events')]



# Anthropic: Pioneers in Safety-Driven AI

## Overview
Anthropic is an AI safety and research company based in San Francisco, dedicated to building reliable, interpretable, and steerable AI systems. Our mission is to ensure that AI technologies become a robustly positive force for humanity while understanding and mitigating the potential risks involved.

## Our Products
### Meet Claude
Claude is our flagship AI model family, designed to facilitate various applications ranging from customer support to education. With models like Claude Opus 4 and Claude Sonnet 4, we provide cutting-edge tools geared toward enhancing productivity and creativity across numerous sectors.

### Solutions
- **AI Agents**: Automate tasks with intelligent agents.
- **Coding Assistance**: Simplify coding with our AI tools.
- **Educational Resources**: Enhance learning experiences.

## Commitment to Safety
At Anthropic, we treat AI safety as a science. Our daily research emphasizes responsible scaling and transparency, ensuring that our systems are developed with the highest safety standards. We work collaboratively with policy experts, engineers, and the global community to promote safe AI practices.

## Company Culture
Our culture is defined by our commitment to global good, accountability, and collaboration. We value:
- **Bold action** towards positive outcomes for humanity.
- A **high-trust environment** that emphasizes kindness and direct communication.
- A belief in a **"race to the top"** in AI safety, inspiring industry-wide standards.

### Employee Benefits
We offer a comprehensive benefits package to support the well-being of our team:
- **Health & Wellness**: Comprehensive insurance, fertility benefits, generous parental leave.
- **Compensation**: Competitive salaries with equity options, retirement plans, and life insurance.
- **Additional Support**: Flexibility in time off, wellness stipends, and relocation assistance.

## Careers at Anthropic
Join us in shaping the future of AI! We seek passionate individuals from diverse fields. Whether your background is in machine learning, public policy, or business, there's a place for you at Anthropic. Explore open roles on our **[Careers Page](#)**.

## Contact Us
Interested in learning more about how Anthropic is leading the way in AI safety and innovation? Visit our **[website](#)** or reach out to us for further inquiries or partnership opportunities.

---

*Together, let's build a future where AI technology serves humanity’s best interests.*



---

<text>

# Anthropic: Pioneros en IA Impulsada por la Seguridad

## Resumen
Anthropic es una empresa de investigación y seguridad en IA con sede en San Francisco, dedicada a construir sistemas de IA confiables, interpretables y manejables. Nuestra misión es asegurar que las tecnologías de IA se conviertan en una fuerza positivamente robusta para la humanidad, mientras entendemos y mitigamos los riesgos potenciales involucrados.

## Nuestros Productos
### Conoce a Claude
Claude es nuestra familia de modelos de IA más representativa, diseñada para facilitar diversas aplicaciones que van desde el soporte al cliente hasta la educación. Con modelos como Claude Opus 4 y Claude Sonnet 4, proporcionamos herramientas de vanguardia orientadas a mejorar la productividad y la creatividad en numerosos sectores.

### Soluciones
- **Agentes de IA**: Automatiza tareas con agentes inteligentes.
- **Asistencia en programación**: Simplifica la programación con nuestras herramientas de IA.
- **Recursos educativos**: Mejora las experiencias de aprendizaje.

## Compromiso con la Seguridad
En Anthropic, tratamos la seguridad de la IA como una ciencia. Nuestra investigación diaria enfatiza la escalabilidad responsable y la transparencia, asegurando que nuestros sistemas se desarrollen con los más altos estándares de seguridad. Trabajamos de manera colaborativa con expertos en políticas, ingenieros y la comunidad global para promover prácticas de IA segura.

## Cultura de la Empresa
Nuestra cultura está definida por nuestro compromiso con el bien global, la responsabilidad y la colaboración. Valoramos:
- **Acciones audaces** hacia resultados positivos para la humanidad.
- Un **entorno de alta confianza** que enfatiza la amabilidad y la comunicación directa.
- La creencia en una **“carrera hacia la cima”** en seguridad de IA, inspirando estándares en toda la industria.

### Beneficios para Empleados
Ofrecemos un paquete de beneficios integral para apoyar el bienestar de nuestro equipo:
- **Salud y Bienestar**: Seguro comprensivo, beneficios de fertilidad, generoso permiso parental.
- **Compensación**: Salarios competitivos con opciones de acciones, planes de jubilación y seguro de vida.
- **Apoyo Adicional**: Flexibilidad en el tiempo libre, estipendios de bienestar y asistencia para reubicación.

## Carreras en Anthropic
¡Únete a nosotros para dar forma al futuro de la IA! Buscamos personas apasionadas de diversos campos. Ya sea que tu experiencia sea en aprendizaje automático, políticas públicas o negocios, hay un lugar para ti en Anthropic. Explora roles abiertos en nuestra **[Página de Carreras](#)**.

## Contáctanos
¿Interesado en aprender más sobre cómo Anthropic está liderando el camino en seguridad e innovación en IA? Visita nuestro **[sitio web](#)** o contáctanos para más consultas u oportunidades de asociación.

---

*Juntos, construyamos un futuro donde la tecnología de IA sirva a los mejores intereses de la humanidad.*

</text>

### 4 - Enable Streaming
- stream the response so the brochure renders in real time as it is generated

In [76]:
def create_stream_brochure(company_name, url):
    """Stream a Markdown brochure and then its Spanish translation into one display.

    The notebook output is refreshed on every text delta, first while the
    English brochure is generated and then while the translation is appended
    below a horizontal rule.

    Args:
        company_name: name of the company, inserted into the prompt.
        url: landing page address of the company.

    Returns:
        The displayed Markdown: brochure, separator, Spanish translation.
    """
    def clean(text, extra=()):
        # Remove code-fence markers (and any `extra` literals) without touching
        # the word "markdown" where it appears in the actual content.
        for wrapper in ("```markdown", "```", *extra):
            text = text.replace(wrapper, "")
        return text

    def stream_text(stream, prefix="", extra=()):
        # Accumulate the raw deltas and clean a copy for display, so a marker
        # split across two deltas is still recognised once it is complete.
        raw = ""
        for event in stream:
            if event.type == 'response.output_text.delta':
                raw += event.delta
                update_display(Markdown(prefix + clean(raw, extra)), display_id=display_handle.display_id)
        return clean(raw, extra)

    stream = client.responses.create(
        model=MODEL,
        input=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
        stream=True,
    )  # responses api

    display_handle = display(Markdown(""), display_id=True)
    brochure = stream_text(stream)

    stream2 = client.responses.create(
        model=MODEL,
        input=[
            {"role": "system", "content": "You are a helpful translator. Translate the text inside the <text> tags to Spanish. Keep the Markdown format and reply with the translation only, without the <text> tags."},
            {"role": "user", "content": f"<text>\n{brochure}\n</text>"},
        ],
        stream=True,
    )

    # The finished English part is kept as a fixed prefix while the translation streams in.
    prefix = brochure + '\n\n---\n\n\n'
    translation = stream_text(stream2, prefix=prefix, extra=("<text>", "</text>"))
    return prefix + translation

In [78]:
# Stream a brochure for Hugging Face (variable name kept as-is for any later cells that use it).
res_hugginface = create_stream_brochure('Hugging Face', 'https://huggingface.co/')

Found links: [Link(link_type='careers page', url='https://apply.workable.com/huggingface/'), Link(link_type='company page', url='https://www.linkedin.com/company/huggingface/')]


# Hugging Face Brochure

## About Us
Welcome to **Hugging Face**, the AI community dedicated to building the future. Founded in 2016, we specialize in advancing machine learning, natural language processing, and deep learning technologies. With a strong focus on collaboration, innovation, and open-source software, we are creating a platform where the ML community can thrive.

- **Location**: Paris, France
- **Team Size**: 51-200 Employees
- **Industry**: Software Development
- **Website**: [huggingface.co](https://huggingface.co)

## Our Mission
At Hugging Face, we are on a mission to democratize artificial intelligence and make cutting-edge machine learning accessible to everyone. Our collaborative platform hosts over 1 million models and datasets, promoting knowledge sharing and co-development.

## Products and Services
### Key Offerings:
- **Models**: Access to an unparalleled collection of over 1 million models for various applications.
- **Datasets**: A rich repository of 250k+ datasets for any ML task.
- **Spaces**: A platform for sharing and running ML applications seamlessly.
- **Enterprise Solutions**: Tailored compute and data offerings for teams, with security and dedicated support.

## Company Culture
Hugging Face thrives on a culture of inclusivity, collaboration, and continuous learning. Our team members are passionate about AI and actively participate in open discussions, creative problem-solving, and community projects. We believe in empowering individuals to contribute to shared goals while fostering an environment of respect and innovation.

## Our Community
Join over **50,000 organizations**, including industry giants like Google, Amazon, and Microsoft, who are already utilizing our innovative technologies. We pride ourselves on being a hub for machine learning enthusiasts, researchers, and practitioners.

## Careers at Hugging Face
We are always looking for talented and passionate individuals to join our team. Explore our current job openings in various fields, including:
- Machine Learning Engineers
- Software Developers
- Data Scientists
- Research Engineers

To apply or learn more about our open positions, visit our [Careers Page](https://huggingface.co/careers).

## Join Us
Become part of a vibrant community spearheading the future of artificial intelligence. Whether you're a researcher, developer, or an aspiring ML enthusiast, Hugging Face offers numerous ways to contribute, learn, and grow.

**Together, let’s build the future of AI!**

---

<text>
# Folleto de Hugging Face

## Sobre Nosotros
Bienvenido a **Hugging Face**, la comunidad de IA dedicada a construir el futuro. Fundada en 2016, nos especializamos en avanzar en tecnologías de aprendizaje automático, procesamiento de lenguaje natural y aprendizaje profundo. Con un fuerte enfoque en la colaboración, la innovación y el software de código abierto, estamos creando una plataforma donde la comunidad de ML puede prosperar.

- **Ubicación**: París, Francia
- **Tamaño del Equipo**: 51-200 Empleados
- **Industria**: Desarrollo de Software
- **Sitio Web**: [huggingface.co](https://huggingface.co)

## Nuestra Misión
En Hugging Face, estamos en una misión para democratizar la inteligencia artificial y hacer que el aprendizajede automático de vanguardia sea accesible para todos. Nuestra plataforma colaborativa alberga más de 1 millón de modelos y conjuntos de datos, promoviendo el intercambio de conocimientos y la co-desarrollo.

## Productos y Servicios
### Ofertas Clave:
- **Modelos**: Acceso a una colección sin igual de más de 1 millón de modelos para diversas aplicaciones.
- **Conjuntos de Datos**: Un rico repositorio de más de 250k conjuntos de datos para cualquier tarea de ML.
- **Espacios**: Una plataforma para compartir y ejecutar aplicaciones de ML sin problemas.
- **Soluciones Empresariales**: Ofertas de computación y datos personalizadas para equipos, con seguridad y soporte dedicado.

## Cultura Empresarial
Hugging Face prospera en una cultura de inclusividad, colaboración y aprendizaje continuo. Nuestros miembros del equipo son apasionados por la IA y participan activamente en discusiones abiertas, resolución creativa de problemas y proyectos comunitarios. Creemos en empoderar a los individuos para contribuir a objetivos compartidos mientras fomentamos un ambiente de respeto e innovación.

## Nuestra Comunidad
Únete a más de **50,000 organizaciones**, incluidas gigantes de la industria como Google, Amazon y Microsoft, que ya están utilizando nuestras tecnologías innovadoras. Nos enorgullece ser un centro para entusiastas, investigadores y practicantes del aprendizaje automático.

## Carreras en Hugging Face
Siempre estamos buscando individuos talentosos y apasionados para unirse a nuestro equipo. Explora nuestras vacantes actuales en diversos campos, incluyendo:
- Ingenieros de Aprendizaje Automático
- Desarrolladores de Software
- Científicos de Datos
- Ingenieros de Investigación

Para postularte o aprender más sobre nuestras posiciones abiertas, visita nuestra [Página de Carreras](https://huggingface.co/careers).

## Únete a Nosotros
Conviértete en parte de una comunidad vibrante que lidera el futuro de la inteligencia artificial. Ya seas un investigador, desarrollador o un entusiasta aspirante del ML, Hugging Face ofrece numerosas formas de contribuir, aprender y crecer.

**¡Juntos, construyamos el futuro de la IA!**
</text>