In [50]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from urllib.parse import urljoin, quote
import re
import google.generativeai as genai

In [51]:
load_dotenv()

api_key = os.getenv("GOOGLE_API_KEY")

In [52]:
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

In [53]:
class Website:
    url: str
    title: str
    body: str
    links: List[str]
    text: str

    def __init__(self, url: str):
        self.url = url
        response = requests.get(url)
        self.body = response.content
        soup = BeautifulSoup(response.content, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup(["script", "style","img","input"]):
                irrelevant.decompose()
            self.text=soup.body.get_text(separator="\n",strip=True)
        else:
            self.text = ""
        links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
        self.links = [urljoin(url, quote(link, safe=':/')) for link in links]
    
    def get_contents(self):
        return f"Webpage Title: \n{self.title}\nWebpage Content: \n{self.text}\n\n"

In [54]:
shivansh=Website("https://flowcv.me/shivansh-uppal")
print(shivansh.links)

['https://www.linkedin.com/in/shivanshuppal/', 'https://github.com/Shivansh-Uppal', 'https://flowcv.me/cdn-cgi/l/email-protection%23d5a6bdbca3b4bba6bda2b4ac95b2b8b4bcb9fbb6bab8', 'https://flowcv.me/shivansh-uppal/Shivansh%20Uppal-Resume-q7fmb8hadm.pdf']


Use Gemini to read links on a webpage and respond in structured JSON

In [55]:
link_system_prompt= "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant ti include in a brouchure about the company. \
such as links to an about page or a company page or Careers/Jobs pages. \n"

link_system_prompt+= "You should respond in JSON as in this example:"

link_system_prompt+= """
{
    "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
} 
"""

In [56]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant ti include in a brouchure about the company. such as links to an about page or a company page or Careers/Jobs pages. 
You should respond in JSON as in this example:
{
    "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
} 



In [57]:
def get_links_user_prompt(website):
    user_prompt=f"Here is the list of links on the website of {website.url} -"
    user_prompt+= "please decide which of these are relvant web links for a broucher about the company, respond with the full https URL: \
Don not include Terms of service, Privacy, email links. \n"
    user_prompt+= "Links (some might be relevant links):\n"
    user_prompt+= "\n".join(website.links)
    return user_prompt

In [58]:
print(get_links_user_prompt(shivansh))

Here is the list of links on the website of https://flowcv.me/shivansh-uppal -please decide which of these are relvant web links for a broucher about the company, respond with the full https URL: Don not include Terms of service, Privacy, email links. 
Links (some might be relevant links):
https://www.linkedin.com/in/shivanshuppal/
https://github.com/Shivansh-Uppal
https://flowcv.me/cdn-cgi/l/email-protection%23d5a6bdbca3b4bba6bda2b4ac95b2b8b4bcb9fbb6bab8
https://flowcv.me/shivansh-uppal/Shivansh%20Uppal-Resume-q7fmb8hadm.pdf


In [59]:
print(f"{link_system_prompt}\n\n{get_links_user_prompt(shivansh)}")

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant ti include in a brouchure about the company. such as links to an about page or a company page or Careers/Jobs pages. 
You should respond in JSON as in this example:
{
    "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
} 


Here is the list of links on the website of https://flowcv.me/shivansh-uppal -please decide which of these are relvant web links for a broucher about the company, respond with the full https URL: Don not include Terms of service, Privacy, email links. 
Links (some might be relevant links):
https://www.linkedin.com/in/shivanshuppal/
https://github.com/Shivansh-Uppal
https://flowcv.me/cdn-cgi/l/email-protection%23d5a6bdbca3b4bba6bda2b4ac95b2b8b4bcb9fbb6bab8
https://flowcv.me/shivansh-uppal/Shivansh%20Uppal-Resume-q7fmb8hadm.pdf


In [78]:
def get_links(url):
    website=Website(url)
    completion=model.generate_content(
        contents=f"{link_system_prompt}\n\n{get_links_user_prompt(website)}"
    )
    result=completion.text
    json_match = re.search(r'\{.*\}', result, re.DOTALL)
    if json_match:
        json_string = json_match.group(0)  # Extract only JSON part
        parsed_json = json.loads(json_string)
        return parsed_json
    else:
        print("No valid JSON found!")

In [79]:
anthropic=Website("https://www.anthropic.com/")
print(anthropic.links)

['https://www.anthropic.com/', 'https://www.anthropic.com/claude', 'https://www.anthropic.com/claude', 'https://www.anthropic.com/team', 'https://www.anthropic.com/enterprise', 'https://www.anthropic.com/api', 'https://www.anthropic.com/pricing', 'https://www.anthropic.com/research', 'https://www.anthropic.com/company', 'https://www.anthropic.com/careers', 'https://www.anthropic.com/news', 'https://claude.ai/', 'https://www.anthropic.com/research%23entry:8%401:url', 'https://www.anthropic.com/claude', 'https://claude.ai/', 'https://www.anthropic.com/api', 'https://www.anthropic.com/news/3-5-models-and-computer-use', 'https://www.anthropic.com/claude/sonnet', 'https://www.anthropic.com/claude/haiku', 'https://www.anthropic.com/news/claude-for-enterprise', 'https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback', 'https://www.anthropic.com/news/core-views-on-ai-safety', 'https://www.anthropic.com/jobs', 'https://www.anthropic.com/', 'https://www.anthropic.com/

In [80]:
get_links("https://anthropic.com")

{'links': [{'type': 'about page', 'url': 'https://anthropic.com/company'},
  {'type': 'careers page', 'url': 'https://anthropic.com/careers'},
  {'type': 'research page', 'url': 'https://anthropic.com/research'},
  {'type': 'product page', 'url': 'https://anthropic.com/claude'},
  {'type': 'enterprise page', 'url': 'https://anthropic.com/enterprise'},
  {'type': 'news page', 'url': 'https://anthropic.com/news'}]}