In [23]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# later os.getenv(...) lookups (e.g. GROQ_API_KEY) can find them.
load_dotenv()

True

In [24]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

In [31]:
def initialize_llm():
    """Build the Groq chat model used to generate answers.

    The API key is looked up in the ``GROQ_API_KEY`` environment variable at
    call time; ``os.getenv`` yields ``None`` when the variable is not set.

    Returns:
        A ``ChatGroq`` client configured for ``llama-3.3-70b-versatile`` with
        a sampling temperature of 0.8.
    """
    api_key = os.getenv('GROQ_API_KEY')
    return ChatGroq(
        groq_api_key=api_key,
        model='llama-3.3-70b-versatile',
        temperature=0.8,
    )


def create_vector_db(
    data_dir='./data',
    persist_directory='.chroma_db',
    glob='*.pdf',
    chunk_size=1000,
    chunk_overlap=200,
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
):
    """Index the PDFs in a directory into a persistent Chroma vector store.

    Every argument is optional; the defaults reproduce the notebook's original
    hard-coded configuration, so ``create_vector_db()`` behaves as before.

    Args:
        data_dir: Directory scanned for source documents.
        persist_directory: Directory where Chroma stores the index on disk.
        glob: Filename pattern selecting which files in ``data_dir`` to load.
        chunk_size: Maximum number of characters per text chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.
        embedding_model: Hugging Face model name used to embed the chunks.

    Returns:
        The populated ``Chroma`` vector store.
    """
    loader = DirectoryLoader(data_dir, glob=glob, loader_cls=PyPDFLoader)
    loaded_docs = loader.load()

    # Split into overlapping chunks so each embedded passage stays small while
    # text near a chunk boundary still appears whole in at least one chunk.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(loaded_docs)

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vector_db = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=persist_directory,
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # deprecate persist(); the explicit call is kept for older installs --
    # confirm against the installed version.
    vector_db.persist()

    print("ChromaDB created and saved.")
    return vector_db

In [None]:
from langchain.schema import Document

# Largest score a retrieved chunk may have and still count as relevant.
# NOTE(review): the filter below keeps chunks whose score is <= this value,
# i.e. the score is treated as a distance (smaller = closer match). That
# matches how Chroma's similarity_search_with_score is documented, but the
# right cut-off depends on the collection's distance metric -- confirm.
SIMILARITY_THRESHOLD = 0.85


def get_relevant_docs(query, vector_db, k=3, threshold=None):
    """Retrieve the top-``k`` chunks for ``query`` and keep the close matches.

    Each candidate's score is printed for inspection, then only the documents
    whose score is less than or equal to the threshold are returned.

    Args:
        query: Text to search for.
        vector_db: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns ``(document, score)`` pairs.
        k: Number of candidates to request from the vector store.
        threshold: Maximum accepted score. ``None`` (the default) means use the
            module-level ``SIMILARITY_THRESHOLD``.

    Returns:
        A list of the accepted documents, in the order the store returned
        them; empty when no candidate passes the threshold.
    """
    if threshold is None:
        threshold = SIMILARITY_THRESHOLD

    docs_and_scores = vector_db.similarity_search_with_score(query, k=k)

    # Report every candidate so the threshold can be tuned by eye.
    print("\n🔍 Similarity Scores:")
    for doc, score in docs_and_scores:
        print(f"Score: {score:.4f} | Content Preview: {doc.page_content[:10]}...")
    print()

    return [doc for doc, score in docs_and_scores if score <= threshold]

In [33]:
def setup_qa_chain(vector_db, llm):
    """Create a question-answering callable backed by the vector store.

    The returned function retrieves chunks through ``get_relevant_docs``
    (which applies the relevance threshold), places exactly those chunks in
    the prompt's ``{context}`` slot, and asks ``llm`` for an answer. When no
    chunk passes the threshold, the LLM is not called and a fixed fallback
    message is returned.

    Args:
        vector_db: Vector store handed to ``get_relevant_docs``.
        llm: Model exposing ``invoke(prompt_text)``. The result's ``content``
            attribute is used when present; otherwise the result itself is
            returned.

    Returns:
        A function ``answer(query)`` that returns the answer text.
    """
    prompt_template = """
You are a cybersecurity expert chatbot. Answer ONLY using the provided context from the database.
If the answer is not present in context, say: "This information is not in our database."

Context:
{context}

User: {question}
Chatbot:"""

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=['context', 'question']
    )

    no_answer_message = "This information is not in our database."

    def answer(query):
        """Answer ``query`` from the filtered context, or return the fallback."""
        docs = get_relevant_docs(query, vector_db)

        # Nothing relevant enough was retrieved: skip the LLM call entirely.
        if not docs:
            return no_answer_message

        # Feed the LLM the documents that passed the threshold. Retrieving
        # again through vector_db.as_retriever() would ignore the filter and
        # query the store a second time.
        context = "\n\n".join(doc.page_content for doc in docs)
        response = llm.invoke(prompt.format(context=context, question=query))

        # Chat models return a message object with the text in .content;
        # plain text-completion models return the string directly.
        return getattr(response, 'content', response)

    return answer