In [76]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse



In [66]:
# defining headers 
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

In [87]:
class Website:
    def __init__(self, url):
        self.base_url = url
        self.visited = set()
        self.headers = headers
        self.all_texts = []   # to store all page texts
        self.crawl(url)
        self.text = "\n\n".join(page_text for _, page_text in self.all_texts)  # After crawling, create a combined text
        
    def crawl(self, url): #crawl all the links within the site
        if url in self.visited:
            return
        print(f"Crawling: {url} (Visited {len(self.visited)} pages so far)")

        
        self.visited.add(url)
        try:
            response = requests.get(url, headers=self.headers, timeout=5)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Save the page title and cleaned text
            title = soup.title.string if soup.title else "No title found"
            
            # Set self.title once, from the homepage
            if not hasattr(self, 'title'):
                self.title = title
                
            for irrevelent in soup.body(["script", "style", "img", "input"]):
                irrevelent.decompose()
            text = soup.body.get_text(separator="\n", strip=True)
            self.all_texts.append((title, text))
            
            # Find all links
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                full_url = urljoin(url, href)  # join relative links to base
                
                # Stay within the same domain
                if self.is_internal(full_url):
                    self.crawl(full_url)
        
        except Exception as e:
            print(f"Failed to crawl {url}: {e}")
    
    def is_internal(self, url):
        # Make sure the link is from the same domain
        base_domain = urlparse(self.base_url).netloc
        link_domain = urlparse(url).netloc
        return base_domain == link_domain
   

In [88]:
site = Website("https://roshanchaudhary001.com.np/")


Crawling: https://roshanchaudhary001.com.np/ (Visited 0 pages so far)
Crawling: https://roshanchaudhary001.com.np/About (Visited 1 pages so far)
Crawling: https://roshanchaudhary001.com.np/Projects (Visited 2 pages so far)
Crawling: https://roshanchaudhary001.com.np/Blogs (Visited 3 pages so far)
Crawling: https://roshanchaudhary001.com.np/roshan_new_cv.pdf (Visited 4 pages so far)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to crawl https://roshanchaudhary001.com.np/roshan_new_cv.pdf: 'NoneType' object is not callable


In [89]:
#system prompt for model
system_prompt = "You are an assistant that analyzes the contents of a website \
and provides a short summary, ignoring text that might be navigation related. \
Respond in markdown."

In [90]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    user_prompt = f"You are looking at a website titled {website.title}"
    user_prompt += "\nThe contents of this website is as follows; \
please provide a short summary of this website in markdown. \
If it includes news or announcements, then summarize these too.\n\n"
    user_prompt += website.text
    return user_prompt

In [91]:
messages = [
    {
        "role":"user",
        "content": user_prompt_for(site)
    }
]
print(messages)

AttributeError: 'Website' object has no attribute 'text'

In [92]:
#now importing ollama locally
import ollama


In [None]:
MODEL = "llama3.2" #i have this model in my device

In [75]:
response = ollama.chat(model=MODEL, messages=messages)
print(response.message.content)

**No Title Found Website Summary**

### Overview

The website "No title found" appears to be a cybersecurity awareness and education platform focused on helping individuals and organizations protect themselves from cyber threats.

### Features

* **Tools**: The website offers various tools, including:
	+ Find My IP: to check the user's public IP address
	+ Cyber Leak Checker: to check if email or personal information has been compromised in known data breaches
	+ Is My Website Compromised?: to perform basic security checks on a website
* **Reporting**: Users can report suspected cybercrime through a chatbot for further investigation and action.
* **Training**: The platform provides training sessions on cybersecurity best practices, including phishing email simulators and incident response plans.

### News and Announcements

* A breach alert system has been launched to notify users of potential security threats.
* Cyber Alert Nepal has been featured in various media outlets for its work