In [1]:
# imports
import os
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown,display
from openai import OpenAI

In [2]:
# Load settings from a local .env file; override=True asks python-dotenv to let
# .env values take precedence over variables already present in the environment.
load_dotenv(override=True)

# Kept as a module-level name; it is not passed to the client below.
api_key = os.environ.get('OPENAI_API_KEY')

# The client is built without explicit arguments
# (presumably it picks the key up from the environment -- confirm).
openai = OpenAI()

In [3]:
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
# }

# class Website_scraper:
#     def __init__(self, url):
#         self.url = url
#         response = requests.get(url, headers=headers)
#         soup = BeautifulSoup(response.content, "html.parser")
#         self.title = soup.title.string if soup.title else "No title found"
        
#         # Fix 1: Use decompose() method (not decompose without parentheses)
#         for irrelevant in soup.body(["script", "style", "img", "input"]):
#             irrelevant.decompose()
        
#         # Get text after removing irrelevant elements
#         self.text = soup.body.get_text(separator="\n", strip=True)


import re

# Request headers sent with every page fetch: a desktop-browser User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website_scraper:
    """Download one web page and expose its title, visible text and e-mails.

    Attributes set by the constructor:
        url:    the address that was requested.
        soup:   the parsed ``BeautifulSoup`` document; ``script``, ``style``,
                ``img`` and ``input`` elements are removed from it during
                construction.
        title:  text of the ``<title>`` element, or ``"No title found"``.
        emails: de-duplicated addresses found on the page, in discovery order.
        text:   newline-separated page text with "[email protected]"-style
                placeholders replaced (see ``replace_obfuscated_emails``).
    """

    # Plain e-mail address; used to validate anchor labels and to scan text.
    EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # Substrings that mark sample/documentation addresses to be discarded.
    FALSE_POSITIVE_MARKERS = ("example.com", "domain.com", "user@example")

    # "[email protected]" placeholder. The separator may be a regular space,
    # a non-breaking space (U+00A0, i.e. a decoded "&#160;"), "@", or the
    # literal "&#160;" entity text.
    OBFUSCATED_EMAIL_RE = re.compile(r'\[email(?:[ \xa0@]|&#160;)protected\]')

    def __init__(self, url, timeout=30):
        """Fetch ``url`` and populate every attribute described on the class.

        Args:
            url: address of the page to scrape.
            timeout: seconds to wait for the server before ``requests`` raises
                a timeout error.
        """
        self.url = url
        # Without a timeout, requests can wait indefinitely on a stalled server.
        # NOTE(review): the HTTP status is not checked, so 4xx/5xx error pages
        # are parsed like any other page.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.soup = BeautifulSoup(response.content, "html.parser")

        # ``.string`` is None when <title> is empty or wraps nested markup, so
        # fall back to the default text in that case as well.
        title_tag = self.soup.title
        has_title = title_tag is not None and title_tag.string
        self.title = title_tag.string if has_title else "No title found"

        # Extract emails before removing elements
        self.emails = self.extract_emails()

        # Remove irrelevant elements
        for irrelevant in self.soup.find_all(["script", "style", "img", "input"]):
            irrelevant.decompose()

        # Get the cleaned text, then substitute obfuscation placeholders
        cleaned_text = self.soup.get_text(separator="\n", strip=True)
        self.text = self.replace_obfuscated_emails(cleaned_text)

    def extract_emails(self):
        """Collect e-mail addresses from ``self.soup``.

        Three sources are scanned in this order: ``mailto:`` links, anchor
        labels that consist solely of an address, and every text fragment of
        the page.

        Returns:
            list[str]: unique addresses in first-seen order, so that
            ``emails[0]`` is the same on every run for the same page.
        """
        candidates = []
        anchors = self.soup.find_all("a")

        # Method 1: mailto links (most reliable). The scheme is matched
        # case-insensitively and only the leading prefix is removed.
        for anchor in anchors:
            href = anchor.get("href") or ""
            if href.lower().startswith("mailto:"):
                address = href[len("mailto:"):].split("?")[0].strip()
                if address:  # a bare "mailto:" carries no address
                    candidates.append(address)

        # Method 2: anchor labels that are exactly one e-mail address.
        for anchor in anchors:
            label = anchor.string.strip() if anchor.string else ""
            if label and re.fullmatch(self.EMAIL_PATTERN, label):
                candidates.append(label)

        # Method 3: e-mail pattern anywhere in the page text.
        for fragment in self.soup.stripped_strings:
            candidates.extend(re.findall(self.EMAIL_PATTERN, fragment))

        # Filter out common false positives
        kept = [
            candidate for candidate in candidates
            if not any(marker in candidate for marker in self.FALSE_POSITIVE_MARKERS)
        ]

        # dict.fromkeys removes duplicates while keeping insertion order.
        return list(dict.fromkeys(kept))

    def replace_obfuscated_emails(self, text):
        """Replace "[email protected]"-style placeholders in ``text``.

        Every placeholder is replaced with the first entry of ``self.emails``
        when one exists; otherwise with ``team@<host>``, where ``<host>`` is
        taken from ``self.url`` with any leading ``www.`` removed.

        Args:
            text: page text that may contain obfuscation placeholders.

        Returns:
            str: ``text`` with all placeholders substituted.
        """
        if self.emails:
            replacement = self.emails[0]
        else:
            # NOTE(review): this fallback is a guess, not an address found on
            # the page -- confirm it is acceptable to present it as a contact.
            # ``hostname`` drops port, credentials and query; fall back to
            # manual splitting for URLs given without a scheme.
            host = urlparse(self.url).hostname or self.url.split("//")[-1].split("/")[0]
            if host.startswith("www."):
                host = host[4:]
            replacement = f"team@{host}"

        # A callable replacement is inserted literally, so backslashes in the
        # address are never interpreted as regex group references.
        return self.OBFUSCATED_EMAIL_RE.sub(lambda _match: replacement, text)


In [4]:
# System message for the model. Written as adjacent string literals so that
# every sentence keeps its separating space (backslash line continuations
# join the lines with no whitespace in between).
system_prompt = (
    "You are an assistant that analyzes the contents of a webpage and finds "
    "its contact information. "
    "The text of the webpage is scraped and given to you as the prompt. "
    "It is important that you don't give any wrong information: if you can't "
    "find the information in the text, say that you didn't find it. "
    "Also provide a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [5]:
# function 01: 

def user_prompt_for(website_scraper):
    """Build the user message for one scraped page.

    The message names the page title, states the task, and then appends the
    page text, so the "contents ... as follows" lead-in sits directly above
    the contents it introduces.

    Args:
        website_scraper: object exposing ``title`` and ``text`` attributes.

    Returns:
        str: the instructions followed by the scraped page text.
    """
    instructions = (
        f"You are looking at a website titled {website_scraper.title}.\n"
        "Please provide the important contact details of this website in markdown. "
        "If it includes methods for contacting, then summarize these too.\n"
        "The contents of the website are as follows:\n\n"
    )
    return instructions + website_scraper.text

In [6]:
# function 02: 
def prompt_payload(website_scraper):
    """Assemble the chat messages for one scraped page.

    Returns:
        list[dict]: a system message carrying ``system_prompt`` followed by a
        user message built by ``user_prompt_for(website_scraper)``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website_scraper)}
    return [system_message, user_message]


In [7]:
# function 03:
def gpt_response(url, model="gpt-4o-mini"):
    """Scrape ``url`` and ask the chat model for its contact details.

    Args:
        url: address of the page to scrape.
        model: name of the chat model to query; defaults to the model this
            notebook used originally.

    Returns:
        The content of the first choice in the completion response.
    """
    website_scraper = Website_scraper(url)
    response = openai.chat.completions.create(
        model=model,
        messages=prompt_payload(website_scraper),
    )
    return response.choices[0].message.content


In [8]:
# function 04: 
def display_gpt_response(url):
    """Render the model's answer for ``url`` as Markdown in the notebook."""
    display(Markdown(gpt_response(url)))


In [9]:
# Run the full pipeline for one site: scrape the page, query the model,
# and render the returned Markdown below this cell.
display_gpt_response("https://and-element.com/")


# Contact Information for &Element

**Address:**  
S9, Innovation Centre  
Boundary Road  
Colchester, Essex, CO4 3ZQ

**Email:**  
[info@andElement.com](mailto:info@andElement.com)

**Phone:**  
01206 259355

**Contact Methods Summary:**  
- You can *book a free consultation* for a review of your digital presence to understand issues and build a strategy for your online success.
- You can also *call* the phone number provided for direct inquiries.

If you need any additional information, please let me know!

In [10]:
# Scrape the page directly (no model call) to inspect the extracted title and text.
ed = Website_scraper("https://and-element.com/")
print(ed.title, ed.text, sep="\n")

Web Development and Digital Marketing Agency • &Element
Web Development and Digital Marketing Agency • &Element
&Element
About
Services
Projects
Insights
Contact Us
We are an
Innovative
Web Development Agency in Essex
We are &Element, an award-winning creative agency specialising in web development, artificial intelligence and app development services alongside SEO, brand strategy and UX reviews.
Discover &Element
We’ve helped hundreds of businesses digitally optimise their company and increase profits.
9+
9+ years of experience
Our team is hyper-focused on helping businesses optimise their internal cloud platforms and growth their digital presence.
230% average increase in ROI across our suite of services.
235+ projects completed since we were founded in 2015.
45m people reached across the world through work.
We are a partner you can rely on covering every aspect of your digital strategy.
0
1
Web Development
Specialising in React.js websites and dashboards, we build bespoke responsive