Import neccessary libraries

In [84]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.document_loaders import WebBaseLoader

In [85]:
# Define the smart start urls


smart_start_urls = [
    'https://smartstart.org.za/index.html',
    'https://smartstart.org.za/news.html',
    'https://smartstart.org.za/funders.html',
    'https://smartstart.org.za/partners.html',
    'https://smartstart.org.za/care-givers.html',
    'https://smartstart.org.za/contact.html'  
]

In [86]:
# Define function to fetch load and clean html content from the website

In [87]:


def fetch_clean_text(url: str) -> str:
    """
    Fetches HTML content from the URL and cleans it by:
    - Removing script, style, nav, and footer tags
    - Extracting visible text
    Returns the cleaned text as a string.
    """
    response = requests.get(url, headers={"User-Agent": "SmartStartRAGBot/1.0 (https://smartstart.org.za)"})
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch {url}, status code: {response.status_code}")
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove unwanted tags
    for tag in ['script', 'style', 'nav', 'footer']:
        for match in soup.find_all(tag):
            match.decompose()
    
    # Get visible text
    cleaned_text = soup.get_text(separator=' ', strip=True)
    
    return cleaned_text


Fetch and clean each url data and store in langchain Document object

In [88]:
documents = []
for url in smart_start_urls:
    print(f'Fetching and cleaning: {url}')
    try:
        text = fetch_clean_text(url)
        doc = Document(page_content =text, metadata ={'source':url})

        documents.append(doc)
    except Exception as e:
        print(f'Error processing:,{url}:{e}')
print(f'\n✅ Loaded and cleaned {len(documents)} documents.')

Fetching and cleaning: https://smartstart.org.za/index.html
Fetching and cleaning: https://smartstart.org.za/news.html
Fetching and cleaning: https://smartstart.org.za/funders.html
Fetching and cleaning: https://smartstart.org.za/partners.html
Fetching and cleaning: https://smartstart.org.za/care-givers.html
Fetching and cleaning: https://smartstart.org.za/contact.html

✅ Loaded and cleaned 6 documents.


In [89]:
# View the first document
print(documents[0].page_content[:1000])  # Print first 1000 characters (to avoid overload)
print("\nMetadata:", documents[0].metadata)


SmartStart Welcome to SmartStart Our latest Child Outcomes Evaluation has been published. View now Close The Brightest Futures Have a SmartStart Giving the most underserved children access to quality early learning. Learn more Brightening Futures, For Every Child We have a Bold Vision for Our Children At SmartStart, we believe every child deserves the opportunity to learn, grow, and shine â no matter their background. Quality early learning lays the foundation for a brighter future, and weâre committed to making it accessible to all, especially in our most underserved communities. Thatâs why weâre on a mission to reach one million children aged 3 to 5 every year by 2030 , giving them the best start in life and helping to build a better future for South Africa. Let's make a difference Join the Early Learning Movement Weâd love to give every child the opportunity to succeed, but we canât do it alone. Whether youâre passionate about teaching, eager to support, or looking for

Load Together AI API keys 

In [90]:
from dotenv import load_dotenv
import os

load_dotenv()  # This loads variables from .env into environment
api_key = os.getenv("TOGETHER_API_KEY")

print("✅ API key loaded:", bool(api_key))


✅ API key loaded: True


In [91]:
from langchain.embeddings.base import Embeddings

In [92]:

import requests
from langchain.embeddings.base import Embeddings
import os
import requests
from typing import List

from langchain.embeddings import HuggingFaceEmbeddings


In [93]:
#Build embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [94]:
from langchain.vectorstores import FAISS

In [96]:
# Create FAISS vectorstore from your documents
vectorstore = FAISS.from_documents(documents, embedding_model)


In [97]:

# Save it locally for reuse
vectorstore.save_local("smartstart_faiss_index")

print("✅ Together.ai FAISS index created and saved.")

✅ Together.ai FAISS index created and saved.


In [117]:
#Load save FAISS iNDEX

vectorstore = FAISS.load_local('smartstart_faiss_index',
                              embeddings =embeddings,
                               allow_dangerous_deserialization=True
)
print('✅vectortsore successfuly loaded')


✅vectortsore successfuly loaded


In [118]:
#Create a retriever
retriever = vectorstore.as_retriever(search_kwargs = {'k':3})


In [124]:
# LLM set up -llm wrapper

from langchain.llms import Together

llm = Together(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    temperature=0.3,
    max_tokens=150
)


In [122]:
# Create a retrieveal chain q&a

from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)


In [126]:
# Ask a question
query = "Whos the ceo of smartstart?"
result = qa_chain({"query": query})
print(result["result"])


 Grace Matlhape is the Chief Executive Officer of SmartStart.
