In [1]:
# %pip installs into the environment of the running kernel; "! pip" uses whichever pip is first on PATH.
%pip install requests beautifulsoup4




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# %pip installs into the environment of the running kernel; "! pip" uses whichever pip is first on PATH.
%pip install langchain langchain-community langchain-huggingface




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# %pip installs into the environment of the running kernel; "! pip" uses whichever pip is first on PATH.
%pip install faiss-cpu sentence-transformers




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# %pip installs into the environment of the running kernel; "! pip" uses whichever pip is first on PATH.
%pip install groq




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# %pip installs into the environment of the running kernel; "! pip" uses whichever pip is first on PATH.
%pip install langchain





[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import requests
from bs4 import BeautifulSoup

In [7]:
# %pip installs into the environment of the running kernel; "! pip" uses whichever pip is first on PATH.
%pip install python-dotenv




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# from langchain.schema import Document
from langchain_core.documents import Document
# from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from groq import Groq
import os 
from dotenv import load_dotenv
load_dotenv()

# ================================
# CONFIG
# ================================
# Page whose text is scraped and indexed.
URL = "https://www.excelr.com"

# Groq chat model used to generate answers.
MODEL_NAME = "llama-3.1-8b-instant"
# API key is read from the environment (load_dotenv() has already run above);
# it is None when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# ================================
# 1. SCRAPE WEBSITE
# ================================
def scrape_icmr_tenders(url, timeout=10, min_length=30):
    """Download a web page and return its text snippets as one string.

    Despite the name, nothing here is specific to ICMR tenders: the function
    fetches whatever ``url`` points at (this notebook passes the ExcelR site).

    Text is collected from ``<p>``, ``<li>``, ``<a>`` and ``<td>`` elements.
    A snippet is kept only if it is longer than ``min_length`` characters and
    has not been collected already. The duplicate check matters because nested
    matching tags (for example ``<li><a>...</a></li>``) would otherwise yield
    the same text once per tag.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP response.
        min_length: A snippet must be strictly longer than this to be kept.

    Returns:
        The kept snippets in document order, joined with single newlines.
        An empty string if no snippet qualifies.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    texts = []
    for tag in soup.find_all(["p", "li", "a", "td"]):
        # A space separator keeps words from adjacent child elements apart;
        # without it "<li>Data <b>Science</b>Course</li>" glues words together.
        text = tag.get_text(" ", strip=True)
        if len(text) > min_length and text not in seen:
            seen.add(text)
            texts.append(text)

    return "\n".join(texts)


raw_text = scrape_icmr_tenders(URL)

# Fail here with a clear message when the page yields no usable text (for
# example a JavaScript-rendered page); otherwise the empty document would only
# surface later as an opaque error while the index is being built.
if not raw_text.strip():
    raise ValueError(f"No text could be extracted from {URL}; nothing to index.")

# Wrap the scraped text in a single Document that records where it came from.
documents = [
    Document(
        page_content=raw_text,
        metadata={"source": URL}
    )
]

# ================================
# 2. CHUNKING (CharacterTextSplitter)
# ================================
# The scraped snippets are joined with single newlines, but CharacterTextSplitter
# splits on "\n\n" unless told otherwise. With the default separator no split
# point is ever found and the whole page ends up as one oversized chunk, so the
# separator is set explicitly to match how raw_text is built.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200
)

chunks = text_splitter.split_documents(documents)

# ================================
# 3. EMBEDDINGS (Open Source)
# ================================
# Embedding model used to vectorise the chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# ================================
# 4. VECTOR STORE (FAISS)
# ================================
# Embed the chunks and build the FAISS index from them.
vectorstore = FAISS.from_documents(chunks, embeddings)

# Retriever configured to return 5 chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

# ================================
# 5. GROQ CHAT COMPLETION
# ================================
# Groq API client shared by every completion request in this notebook.
client = Groq(api_key=GROQ_API_KEY)


def groq_chat_completion(context, question):
    """Ask the Groq chat model to answer ``question`` from ``context``.

    The prompt tells the model to reply "Information not found on the website"
    when the context does not contain the answer.

    Args:
        context: Text the model should base its answer on.
        question: The user's question.

    Returns:
        The message content of the first completion choice.
    """
    user_message = f"""
Use the following context to answer the question.
If the answer is not available, say "Information not found on the website".

Context:
{context}

Question:
{question}
"""

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_message},
    ]

    # temperature=0 and max_tokens=800 are passed through to the API unchanged.
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0,
        max_tokens=800,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


# ================================
# 6. RAG QUERY FUNCTION
# ================================
def rag_query(question):
    """Answer ``question`` from the indexed page content.

    Retrieves the matching chunks for the question, joins their text with
    blank lines into one context string, and passes that context together
    with the question to ``groq_chat_completion``.

    Args:
        question: The user's question.

    Returns:
        Whatever ``groq_chat_completion`` returns for the built context.
    """
    retrieved = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in retrieved)
    return groq_chat_completion(context, question)


# ================================
# 7. TEST
# ================================
# Smoke test of the whole pipeline. The guard is true when this cell runs in
# the notebook (the answer printed below comes from this block).
if __name__ == "__main__":
    query = "What are the courses provided ?"
    # Retrieve matching chunks and have the Groq model answer from them.
    answer = rag_query(query)
    print("\nAnswer:\n", answer)



Answer:
 The courses provided by ExcelR include:

1. Data Science Course Training
2. Certified Scrum Master (CSM)
3. PMP Certification Course Training
4. PMI-ACP Certification Course Training
5. Internet of Things (IoT) Certification Training
6. Python
7. Digital Marketing Certification Course Training
8. RPA
9. Machine Learning Certification Course Training
10. Blockchain Certification Course Training

Additionally, ExcelR offers courses in various emerging technologies, including:

1. Artificial Intelligence
2. Machine Learning
3. AR / VR
4. IR 4.0
5. IoT
6. Block Chain
7. Cyber Security
8. Financial Analytics
9. Retail / Supply Chain Analytics
10. Social Media and Web Analytics
11. Forecasting Analytics
12. Text Mining and NLP
13. Business Intelligence
14. Digital Marketing
15. RPA
16. AWS
17. Cloud Computing
18. Microsoft Azure
19. Google Cloud Platform

ExcelR also offers courses in Quality Management, including:

1. Lean Six Sigma Green Belt
2. Lean Six Sigma Black Belt
3. ISO
4

In [None]:
# ExcelR website (https://www.excelr.com)
#      ↓
# Web Scraping (BeautifulSoup)
#      ↓
# CharacterTextSplitter
#      ↓
# HuggingFace Embeddings
#      ↓
# FAISS Vector DB
#      ↓
# Groq (llama-3.1-8b-instant)

In [None]:
# Show the source metadata and a short preview instead of dumping the whole scraped page.
documents[0].metadata, documents[0].page_content[:500]