In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [3]:
# Load variables from the .env file before anything reads the environment
load_dotenv(override=True)

# Model used for every chat completion in this notebook
MODEL = 'gpt-4o-mini'

# api_key is read here for inspection; the client below is constructed without arguments
api_key = os.environ.get('OPENAI_API_KEY')
openai = OpenAI()

In [17]:
# Request headers shared by every page fetch in this notebook

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A scraped web page: its title, its visible body text and the links it contains.

    Attributes set by the constructor:
      url   - the address that was requested
      body  - the raw response bytes
      title - text of the <title> tag, or "No title found" when there is none
      text  - body text with script/style/img/input elements removed ("" if no <body>)
      links - every non-empty href found on <a> tags, exactly as written in the page
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and parse it.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before giving up.

        Raises:
            requests.exceptions.RequestException: if the request fails or times out.
        """
        self.url = url
        # Without a timeout, requests can block indefinitely on an unresponsive server
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or wraps nested tags, so guard
        # against it instead of storing None as the title
        raw_title = soup.title.string if soup.title else None
        self.title = (raw_title or "").strip() or "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and body text formatted as a single prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [None]:
# Scrape the HEFAS landing page
hefas = Website("https://www.deanza.edu/hefas/")

# Keep only site-relative links under /hefas and turn them into absolute URLs
hefas.links = [
    f"https://www.deanza.edu{path}"
    for path in hefas.links
    if path.startswith('/hefas')
]

# Last expression of the cell: show the scraped title and text
hefas.get_contents()

In [10]:
# Top-level folder for the generated knowledge base; created if it does not already exist
output_dir = "HEFAS_Knowledge"
os.makedirs(output_dir, exist_ok=True)

In [27]:
# System prompt sent with every request: tells the model how to structure the
# scraped text as Markdown and shows one example of the expected layout.
system_prompt = """
You are provided with information about an institution. Your task is to structure this information into a well-organized Markdown document that clearly outlines the institution’s key details.

## **Instructions for Structuring the Markdown Document:**
1. **Use a clear and hierarchical structure**  
   - Start with a `#` title header for the general contents.
   - Use `##` for main sections and `###` for sub-sections.
   - Ensure that content is logically grouped and easy to read.

2. **Use Markdown formatting properly**  
   - Format links as `[Title](URL)`.
   - Use bullet points `-` or numbered lists `1.` where appropriate.

3. **Define the Core Sections**  
   - **Introduction/About**: Brief summary of the institution's mission and purpose.
   - **Programs & Services**: Academic programs, student support, and special initiatives.
   - **Key Resources**: Financial aid, scholarships, student services.
   - **Get Involved**: Volunteering, events, and community participation.
   - **Contact Information**: Office locations, emails, phone numbers, and social media.
   - **Additional Information**: Policies, campus maps, FAQs.

4. **Ensure Logical Flow**  
   - Prioritize the most important sections first.
   - Maintain consistency in formatting.
   - Remove redundant or unnecessary information.

## **Example Markdown Output:**

```markdown
# Institution Name

## About Us
[Brief description of the institution]
[Mission Statement]

## Programs & Services
- **Academic Programs**
- **Student Support Services**
- **Special Initiatives**
  
[Learn more](https://example.com/programs)

## Key Resources
- **Scholarships & Financial Aid**
- **Library & Study Resources**
- **Counseling & Career Support**

[View all resources](https://example.com/resources)

## Get Involved
- **Internships & Volunteering**
- **Student Organizations**
- **Annual Events & Workshops**

[Join our programs](https://example.com/get-involved)

## Contact Us
📍 **Location:** [Campus Address]  
📞 **Phone:** [Phone Number]  
📧 **Email:** [Email Address]  
🌐 **Website:** [Official Website]  
📷 **Social Media:** [Instagram | Twitter | Facebook]
```

"""

In [25]:
def get_links_user_prompt(contents):
    """
    Build the user message that asks the model to convert page text to Markdown.

    Args:
        contents: the scraped page text to append after the request sentence.

    Returns:
        The fixed request sentence followed by `contents`.
    """
    request = (
        "Here is the contents of the HEFAS website - "
        "please help me to generate the markdown format text of this website\n"
    )
    return request + contents

In [None]:
def get_document(website):
    """
    Ask the chat model to turn one scraped page into a structured Markdown document.

    Args:
        website: an object exposing get_contents(); its output is sent to the model.

    Returns:
        The text content of the first choice in the model's response.
    """
    # Chat messages carry their text under "content"; the previous "message"
    # key is not a field the Chat Completions API accepts.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_links_user_prompt(website.get_contents())},
    ]

    response = openai.chat.completions.create(
        model=MODEL,
        messages=messages,
    )

    # Hand the generated text back so callers can display or save it
    return response.choices[0].message.content

In [None]:
def generate_knowledge_base(links):
    """
    Scrape each link, convert it to Markdown with get_document, and save one file per page.

    Files are written to HEFAS_Knowledge/HEFAS as "<page title>.md". Pages that
    share a title map to the same file name, so a later page overwrites an earlier one.

    Args:
        links: iterable of absolute URLs to fetch.
    """
    folder_path = "HEFAS_Knowledge/HEFAS"
    os.makedirs(folder_path, exist_ok=True)

    for link in links:
        website = Website(link)
        document = get_document(website)
        if not document:
            # Nothing came back for this page, so there is nothing to save
            continue

        # Titles can contain characters such as "/" or ":" that are not valid
        # in file names; replace anything outside a conservative safe set
        title = website.title or "untitled"
        safe_title = "".join(
            ch if ch.isalnum() or ch in " -_" else "_" for ch in title
        ).strip() or "untitled"
        file_path = os.path.join(folder_path, safe_title + ".md")

        # Open the computed path (not the literal string "file_path") and
        # actually write the generated text
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(document)