In [76]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse



In [66]:
# defining headers 
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

In [93]:
class Website:
    def __init__(self, url):
        self.base_url = url
        self.visited = set()
        self.headers = headers
        self.all_texts = []   # to store all page texts
        self.crawl(url)
        self.text = "\n\n".join(page_text for _, page_text in self.all_texts)  # After crawling, create a combined text
        
    def crawl(self, url): #crawl all the links within the site
        if url in self.visited:
            return
        print(f"Crawling: {url} (Visited {len(self.visited)} pages so far)")

        
        self.visited.add(url)
        try:
            response = requests.get(url, headers=self.headers, timeout=5)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Save the page title and cleaned text
            title = soup.title.string if soup.title else "No title found"
            
            # Set self.title once, from the homepage
            if not hasattr(self, 'title'):
                self.title = title
                
            for irrevelent in soup.body(["script", "style", "img", "input"]):
                irrevelent.decompose()
            text = soup.body.get_text(separator="\n", strip=True)
            self.all_texts.append((title, text))
            
            # Find all links
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                full_url = urljoin(url, href)  # join relative links to base
                
                # Stay within the same domain
                if self.is_internal(full_url):
                    self.crawl(full_url)
        
        except Exception as e:
            print(f"Failed to crawl {url}: {e}")
    
    def is_internal(self, url):
        # Make sure the link is from the same domain
        base_domain = urlparse(self.base_url).netloc
        link_domain = urlparse(url).netloc
        return base_domain == link_domain
   

In [94]:
site = Website("https://roshanchaudhary001.com.np/")


Crawling: https://roshanchaudhary001.com.np/ (Visited 0 pages so far)
Crawling: https://roshanchaudhary001.com.np/About (Visited 1 pages so far)
Crawling: https://roshanchaudhary001.com.np/Projects (Visited 2 pages so far)
Crawling: https://roshanchaudhary001.com.np/Blogs (Visited 3 pages so far)
Crawling: https://roshanchaudhary001.com.np/roshan_new_cv.pdf (Visited 4 pages so far)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to crawl https://roshanchaudhary001.com.np/roshan_new_cv.pdf: 'NoneType' object is not callable


In [95]:
#system prompt for model
system_prompt = "You are an assistant that analyzes the contents of a website \
and provides a short summary, ignoring text that might be navigation related. \
Respond in markdown."

In [96]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    user_prompt = f"You are looking at a website titled {website.title}"
    user_prompt += "\nThe contents of this website is as follows; \
please provide a short summary of this website in markdown. \
If it includes news or announcements, then summarize these too.\n\n"
    user_prompt += website.text
    return user_prompt

In [97]:
messages = [
    {
        "role":"user",
        "content": user_prompt_for(site)
    }
]
print(messages)

[{'role': 'user', 'content': 'You are looking at a website titled Roshan Chaudhary\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\n\nAbout\nProjects\nBlogs\nDownload CV\nHi, my name is\nRoshan Chaudhary.\nAspiring Software Developer.\nI\'m a software engineer specializing in building (and occasionally designing) exceptional digital experiences. Currently, I\'m focused on building accessible, human-centered products.\nSee More About Me\n\nAbout\nProjects\nBlogs\nDownload CV\nHello! My name is\nRoshan Chaudhary\nand I am currently learning software development and how things work around in the world of computer. I am proficient in frontend as well as backend with 1.5 yrs of experience. My primary proficiency is in Javascript. You can reach me at\nroshanchau001@gmail.com\nHere are some technologies that I use recently:\nJavascript\nTypescript\nReact\nNextJS\nNode.js\nE

In [98]:
#now importing ollama locally
import ollama


In [99]:
MODEL = "llama3.2" #i have this model in my device

In [100]:
response = ollama.chat(model=MODEL, messages=messages)
print(response.message.content)

**Summary of Roshan Chaudhary's Website**

### About

* Roshan Chaudhary is an aspiring software developer with expertise in building accessible and human-centered digital experiences.

### Projects

* Currently focused on building accessible products
* No specific projects listed

### Blogs

* Published a blog post titled "Scaling Node.js Application Using Clustering and PM2" on April 19, 2025
* The blog post explains how to scale a Node.js app using the Cluster module and PM2 process manager for better performance and reliability.

### News/Announcements

* No recent news or announcements are available on this website.
