In [2]:
import getpass
import os

import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.docstore.document import Document
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain_community.document_loaders import FireCrawlLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_groq import ChatGroq

In [4]:
# Credentials must never be stored in the notebook itself. The Google key is
# taken from the environment; when it is absent the user is prompted for it
# without echoing the value into the cell output.
# SECURITY: a real key used to be hardcoded on this line. Treat it as leaked
# and revoke/rotate it in the Google Cloud console.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Directory in which the FAISS index is saved by `process_file`.
DB_DIR = "faiss"

In [5]:
def process_file(file_path):
    """Build and persist a FAISS index from a UTF-8 text file.

    The file is split on '\n' and consumed two lines at a time. Within each
    pair, lines are stripped and blank ones dropped; whatever remains is
    joined with a newline and becomes one `Document`. Pairs that end up empty
    produce no document.

    Args:
        file_path: Path of the text file to index.

    Returns:
        The FAISS vector store built from the documents. It is also written
        to `DB_DIR` via `save_local` before being returned.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().split('\n')

    # Walk the file in strides of two lines; each stride yields at most one chunk.
    line_pairs = (raw_lines[start:start + 2] for start in range(0, len(raw_lines), 2))
    chunks = (
        '\n'.join(text for text in map(str.strip, pair) if text)
        for pair in line_pairs
    )
    documents = [Document(page_content=chunk) for chunk in chunks if chunk]

    # Embed the chunks with Google's embedding model and index them in FAISS.
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    index = FAISS.from_documents(documents=documents, embedding=embedder)

    # Persist the index to disk so it can be reused.
    index.save_local(DB_DIR)
    return index

In [6]:
# Source text to index. The default is the original author's local path; set
# the CIS_DATA_FILE environment variable to point at the file on any other
# machine instead of editing this cell.
DATA_FILE = os.environ.get("CIS_DATA_FILE", r"T:\Programmingg\Maktek\Chatbot\data2.txt")

# Build the FAISS index from the file (this also saves it via `process_file`).
vector_db = process_file(DATA_FILE)

In [7]:
# System prompt for the CIS assistant: sets the assistant's role and six
# numbered response guidelines. The text contains no `{placeholder}` fields,
# so it needs no input variables when turned into a prompt template.
system_template = """You are a knowledgeable assistant tasked with providing detailed information and guidance about the Construction Industry Scheme (CIS) as it was implemented post-April 2007. The data you will be working with is structured into different sections and content areas, each dealing with specific aspects of the scheme, such as contractor registration, subcontractor verification, deductions, compliance, and relevant legislation.

When responding to user queries, you should use the following guidelines:

1- Content Structure: The information is organized into numbered sections, each covering a different topic within the CIS. Begin by identifying the relevant section or content area based on the user's query.

2- Contextual Understanding: Use the provided content to deliver accurate, concise, and contextually appropriate answers. If a user asks about a specific part of the scheme (e.g., subcontractor verification or monthly returns), focus on the relevant section and summarize key points.

3- User Guidance: If the user's question relates to where to find certain information within the CIS documentation, guide them to the appropriate section (e.g., 'For details on subcontractor registration, refer to CISR40000').

4- Handling Feedback: If the user is unsatisfied with the response or provides feedback, acknowledge their input and attempt to refine your answer by pulling from different relevant sections.

5- Cautions: Do not attempt to provide legal advice or make assumptions beyond the information provided. If the answer is not within your scope, inform the user that the query is outside your remit.

6- Clarity and Precision: Keep your responses clear and precise. If a term or concept is complex, provide a brief explanation or definition to aid understanding.
"""

In [13]:
# Two-part chat prompt: the fixed CIS system instructions followed by the
# user's question, which is filled in through the `{question}` variable.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]

prompt = ChatPromptTemplate.from_messages(messages)

In [14]:
def process_query(vectordb, query, chat_history):
    """Answer `query` with a conversational retrieval chain over `vectordb`.

    Args:
        vectordb: Vector store exposing `as_retriever`, e.g. the FAISS index
            returned by `process_file`.
        query: The user's question.
        chat_history: Iterable of dicts with 'question' and 'answer' keys
            describing earlier turns. `None` or an empty iterable means the
            conversation has no history yet.

    Returns:
        The mapping produced by the chain. Because the chain is created with
        `return_source_documents=True`, it carries the retrieved source
        documents alongside the generated answer.

    Raises:
        RuntimeError: If the GROQ_API_KEY environment variable is not set.
    """
    # The Groq key is read from the environment rather than stored in the
    # notebook. SECURITY: a real key used to be hardcoded here; treat it as
    # leaked and revoke/rotate it.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set; export it before calling process_query."
        )

    # Retrieve the 5 most similar chunks for each question.
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})

    # Deterministic (temperature 0) Llama 3.1 70B model served by Groq.
    # NOTE(review): Groq retires model ids over time - confirm this one is
    # still available before relying on it.
    llm = ChatGroq(
        api_key=groq_api_key,
        temperature=0,
        model="llama-3.1-70b-versatile",
    )

    # Answering prompt: the CIS system instructions plus a `{context}` slot.
    # The "stuff" chain injects the retrieved documents through `{context}`,
    # so the slot is required for the prompt to be accepted by the chain.
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(
                system_template + "\n----------------\n{context}"
            ),
            HumanMessagePromptTemplate.from_template("{question}"),
        ]
    )

    # Without `combine_docs_chain_kwargs` the chain falls back to its default
    # prompt and the CIS instructions would never reach the model.
    chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff",
        verbose=True,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )

    # The chain expects history as (question, answer) tuples.
    formatted_chat_history = [
        (turn['question'], turn['answer']) for turn in (chat_history or [])
    ]

    # `invoke` replaces the deprecated direct call `chain({...})`.
    return chain.invoke({"question": query, "chat_history": formatted_chat_history})