In [1]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

from typing import Union, List

from Database.Database_Weaviate import Database_Weaviate
from LLM.LLM_GGUF import LLM_GGUF

from langchain_weaviate.vectorstores import WeaviateVectorStore

In [10]:
class RAG_Bot:
    """
    Retrieval-augmented chatbot over a set of Weaviate collections.

    A query is routed to every configured collection whose name occurs in the
    query text. Documents retrieved from each of those collections are joined
    into one context string, handed to the LLM, and the answer is printed.
    """

    # Upper bound on generated tokens passed to the LLM for every answer.
    MAX_NEW_TOKENS = 250
    # Retrieval modes accepted by `query` (compared case-insensitively).
    SEARCH_TYPES = ('Vector', 'Hybrid')

    def __init__(self, collection_names=['Uk', 'Wales', 'NothernIreland', 'Scotland'], text_splitter='SpaCy', embedding_model="SentenceTransformers"):
        """
        Initializes the RAG_Bot object.
        
        Args:
            collection_names (list, optional): A list of collection names. Defaults to ['Uk', 'Wales', 'NothernIreland', 'Scotland'].
            text_splitter (str, optional): Forwarded unchanged to Database_Weaviate. Defaults to 'SpaCy'.
            embedding_model (str, optional): Forwarded unchanged to Database_Weaviate. Defaults to "SentenceTransformers".
        """
        # Keep a private copy: query routing must use exactly the names this bot
        # was configured with, and the shared default list must never be aliased.
        self.collection_names = list(collection_names)
        self.vector_db = Database_Weaviate(collection_names=list(self.collection_names), text_splitter=text_splitter, embedding_model=embedding_model)
        self.llm = LLM_GGUF()

    def _bind_vector_store(self, collection_name):
        """
        Builds a WeaviateVectorStore for `collection_name`, stores it on
        `self.vector_db.vector_store` and returns it.

        The store is created on demand because the target collection is only
        known at call time.
        """
        store = WeaviateVectorStore(
            client=self.vector_db.client,
            index_name=collection_name,
            text_key="text",
            embedding=self.vector_db.embeddings,
        )
        self.vector_db.vector_store = store
        return store

    def add_text(self, collection_name, text, metadata=None):
        """
        Adds text data to a specified collection in the Weaviate database.
        
        Args:
            collection_name (str): The name of the collection in the database.
            text (str): The text data to be added.
            metadata (dict): Additional metadata associated with the text.
        """
        # Point the database wrapper at the target collection before inserting.
        self._bind_vector_store(collection_name)
        self.vector_db.add_text_to_db(
            collection_name=collection_name,
            text=text,
            metadata=metadata
        )

    def __collection_routing(self, query) -> Union[List[str], None]:
        """
        This function will route the query to the correct collection(s).

        Matching is a case-insensitive substring test of every configured
        collection name against the query, so a short name such as 'Uk' also
        matches inside longer words (e.g. 'Ukraine').
        
        Args:
            query (str): The query to be routed.

        Returns:
            Union[List[str], None]: The configured collection names found in the
            query (in configuration order, original casing), or None if no
            collection matches or the query is empty.
        """
        if not query:
            return None
        lowered_query = query.lower()
        # Return the names exactly as configured: they are used verbatim to look
        # the collection up, so their casing must be preserved.
        mentioned = [name for name in self.collection_names if name.lower() in lowered_query]
        return mentioned or None

    def query(self, query:str, k:int=1, search_type='Hybrid'):
        """
        Performs a RAG query on every collection mentioned in the query and
        prints the LLM response for each of them.

        Args:
            query (str): The query to search for similar documents.
            k (int, optional): The number of documents to return. Defaults to 1.
            search_type (str, optional): 'Hybrid' or 'Vector' (case-insensitive). Defaults to 'Hybrid'.

        Returns:
            None: Results are printed, not returned.

        Raises:
            ValueError: If a collection is matched and `search_type` is not supported.
        """
        Collection_to_query_from = self.__collection_routing(query)
        print(f'Collection_to_query_from: {Collection_to_query_from}')

        if Collection_to_query_from is None:
            print('There was no collection mentioned in the query. Kindly mention a collection name/s for the query to be executed.')
            return

        self.__query_all(query=query, k=k, collection_names=Collection_to_query_from, search_type=search_type)

    def _normalise_search_type(self, search_type):
        """Returns the canonical spelling of `search_type` or raises ValueError."""
        mode = str(search_type).strip().capitalize()
        if mode not in self.SEARCH_TYPES:
            raise ValueError(f"Unsupported search_type {search_type!r}; expected one of {self.SEARCH_TYPES}.")
        return mode

    def _retrieve_vector(self, collection_name, query, k):
        """Returns (texts, metadatas) of the top-k documents from the vector-store retriever."""
        retriever = self._bind_vector_store(collection_name).as_retriever(search_kwargs={"k": k})
        # `invoke` is the Runnable entry point that supersedes the deprecated
        # `get_relevant_documents`.
        docs = retriever.invoke(query)
        return [doc.page_content for doc in docs], [doc.metadata for doc in docs]

    def _retrieve_hybrid(self, collection_name, query, k):
        """Returns (texts, metadatas) of the top-k objects from a Weaviate hybrid search."""
        collection = self.vector_db.client.collections.get(collection_name)
        result = collection.query.hybrid(query=query,
                                         vector=self.vector_db.embeddings.embed_query(query),
                                         limit=k)
        texts, metadatas = [], []
        for obj in result.objects:
            texts.append(obj.properties['text'])
            # Every property other than the text itself is treated as metadata.
            metadatas.append({key: value for key, value in obj.properties.items() if key != 'text'})
        return texts, metadatas

    @staticmethod
    def _format_docs(texts, metadatas):
        """Prints a short preview of every retrieved document and returns them joined as one context string."""
        print('The retrieved documents are:')
        for idx, (text, meta) in enumerate(zip(texts, metadatas)):
            print(f'{idx} - Content: {text[:50]}... - MetaData: {meta}')
        return "\n\n".join(texts)

    def _generate(self, context, query):
        """Asks the LLM to answer `query` given `context`; both are passed as plain strings."""
        return self.llm.chat(context=context,
                             query=query,
                             max_new_tokens=self.MAX_NEW_TOKENS)

    def __query_one(self, collection_name, query, k=1):
        """
        Performs a vector-search RAG query on a single collection.
        
        Args:
            collection_name (str): The name of the collection in the database.
            query (str): The query to search for similar documents.
            k (int, optional): The number of documents to return. Defaults to 1.
        
        Returns:
            None
        
        Prints a preview and the metadata of the top k documents, then the LLM response.
        Nothing is queried when the collection is empty.
        """
        is_empty = self.is_collection_empty(collection_name)
        print(f'The Collection: {collection_name} is empty: {is_empty}')
        if is_empty:
            return

        texts, metadatas = self._retrieve_vector(collection_name, query, k)
        context = self._format_docs(texts, metadatas)
        response = self._generate(context, query)
        print('-')
        print(response)

    def __query_all(self, query, k=1, collection_names:Union[List[str], None]=None, search_type='Hybrid'):
        """
        Performs a RAG query on each of the given collections, one after another.
        
        Args:
            query (str): The query to search for similar documents.
            k (int, optional): The number of documents to return per collection. Defaults to 1.
            collection_names (List[str], optional): Collections to query. Defaults to
                the collections this bot was configured with.
            search_type (str, optional): 'Hybrid' or 'Vector' (case-insensitive). Defaults to 'Hybrid'.
        
        Returns:
            None
        
        Raises:
            ValueError: If `search_type` is not supported.
        
        Prints a preview and the metadata of the top k documents and the LLM
        response for every collection that returned at least one document.
        """
        mode = self._normalise_search_type(search_type)
        if collection_names is None:
            collection_names = self.collection_names

        for collection_name in collection_names:
            if mode == 'Vector':
                texts, metadatas = self._retrieve_vector(collection_name, query, k)
            else:
                texts, metadatas = self._retrieve_hybrid(collection_name, query, k)

            # An empty (or non-matching) collection yields no context; asking the
            # LLM anyway would only produce an ungrounded answer.
            if not texts:
                print(f'The Collection: {collection_name} returned no documents for this query; skipping it.')
                continue

            context = self._format_docs(texts, metadatas)
            response = self._generate(context, query)
            print(f'\n\nThe response is from the collection: {collection_name}')
            print(response)
            print('-')

    def is_collection_empty(self, collection_name: str) -> bool:
        """
        Tells whether the collection holds no objects.

        Args:
            collection_name (str): The name of the collection in the database.

        Returns:
            bool: True if the collection's iterator yields nothing, False otherwise.
        """
        collection = self.vector_db.client.collections.get(collection_name)
        # Pull at most one object instead of materialising the whole collection.
        return next(iter(collection.iterator()), None) is None

    def get_list_of_all_docs(self, collection_name:Union[str, List[str]]=None) -> None:
        """
        Function to print all documents in the specified collection(s).
        
        Args:
            collection_name (Union[str, List[str]], optional): The name of the collection in the database,
                or a list of names. Defaults to None, in which case nothing is printed.
        
        Returns:
            None
        """
        if isinstance(collection_name, (list, tuple)):
            for name in collection_name:
                self.get_list_of_all_docs(name)
            return

        if not isinstance(collection_name, str):
            return

        collection = self.vector_db.client.collections.get(collection_name)
        # Single pass: the header and trailing blank lines are only emitted once a
        # first object shows up, so empty collections stay silent.
        found_any = False
        for item in collection.iterator():
            if not found_any:
                print(f'The collection {collection_name} has the following documents:')
                found_any = True
            for key, value in item.properties.items():
                print(f'{key}:  {value}')
        if found_any:
            print('\n\n')

In [11]:
# Names of the collections handed to the bot; they are forwarded to the database wrapper.
collection_names = ['Uk', 'Wales', 'NothernIreland', 'Scotland']
bot = RAG_Bot(
    collection_names=collection_names,
    text_splitter='SpaCy',
    embedding_model="SentenceTransformers",
)

Cluster_URL: <redacted>, Cluster_API: <redacted>
Creating 4 Weaviate Clusters - Option 3
Country.capitalize(): Uk
The collection: Uk already exists in the Weaviate Cluster
Country.capitalize(): Wales
The collection: Wales already exists in the Weaviate Cluster
Country.capitalize(): Nothernireland
The collection: NothernIreland already exists in the Weaviate Cluster
Country.capitalize(): Scotland
The collection: Scotland already exists in the Weaviate Cluster


In [None]:
# bot.vector_db.delete_all_collections()

In [None]:
# bot.add_text(collection_name='Wales', text='Wally', metadata={'name': 'saul'}) #change the text and meta data here with your text
# bot.add_text(collection_name='Wales', text='UK has rivers, not wales.', metadata={'name': 'saul'}) #change the text and meta data here with your text
# bot.add_text(collection_name='Wales', text='Wales has rivers, oceans, mountains.', metadata={'name': 'saul'}) #change the text and meta data here with your text

In [None]:
# bot.get_list_of_all_docs(collection_name='Wales')
# bot.get_list_of_all_docs(collection_name='Scotland')
# bot.get_list_of_all_docs(collection_name='NothernIreland')
# bot.get_list_of_all_docs(collection_name='Uk')

# Single Collection Queries

In [12]:
# Hybrid search; search_type may also be 'Vector' (with a capital V).
# Change the query text below to ask your own question.
bot.query(
    query='Define labor laws and limitations within Scotland?',
    k=2,
    search_type='Hybrid',
)

Collection_to_query_from: ['Scotland']
The Collection: Scotland is empty(0)/Not Empty(1): False
The retrieved documents are:
0 - Content: 22
Extent, commencement and short title
(1)
This A... - MetaData: {'year': '2023', 'legislation': 'UK Public General Acts', 'legislationType': 'May contain legislation that applies to Scotland', 'title': 'Employment Relations (Flexible Working) Act 2023.txt', 'country': 'Scotland'}
1 - Content: (3) These Regulations extend to England and Wales ... - MetaData: {'year': '2022', 'legislation': 'UK Draft Statutory Instruments', 'legislationType': 'May contain legislation that applies to Scotland', 'title': 'The Exclusivity Terms for Zero Hours Workers (Unenforceability and Redress) Regulations 2022\nSuperseded by 2022 No. 1145.txt', 'country': 'Scotland'}


The response is from the collection: Scotland
The user has provided a legal document related to Employment Relations (Flexible Working) Act 2023. The document outlines the extent, commencement, and sh

In [13]:
# Hybrid search; search_type may also be 'Vector' (with a capital V).
# Change the query text below to ask your own question.
bot.query(
    query='Define labor laws and limitations within Wales?',
    k=2,
    search_type='Hybrid',
)

Collection_to_query_from: ['Wales']
The Collection: Wales is empty(0)/Not Empty(1): False
The retrieved documents are:
0 - Content: (6)
Regulations under subsection (1)(b) may make c... - MetaData: {'title': 'Parliamentary Buildings (Restoration and Renewal) Act 2019.txt', 'legislation': 'UK Public General Acts', 'legislationType': 'May contain legislation that applies to Wales', 'year': '2019', 'country': 'Wales'}
1 - Content: (3)
Regulations under this Act are to be made by s... - MetaData: {'year': '2024', 'legislation': 'UK Public General Acts', 'title': 'Leasehold and Freehold Reform Act 2024.txt', 'legislationType': 'May contain legislation that applies to Wales', 'country': 'Wales'}


The response is from the collection: Wales
The relevant section of the document that addresses labor laws and limitations in Wales is section 7 of the Welsh Ministers (Transfer of Functions, Protection of Rights etc.) Act 2019 (Anawim 2019 27) (Cymru). This section provides the power to make regula

In [None]:
# Pure vector search against the collection named in the query.
# Change the query text below to ask your own question.
bot.query(
    query='what does wales have?',
    k=2,
    search_type='Vector',
)

In [None]:
# The query names two collections ('scotland' and 'uk'), so both are searched.
# Change the query text below to ask your own question.
bot.query(
    query='scotty from scotland uk/',
    k=1,
)

In [None]:
# Only 'scotland' matches a configured collection name here.
# Change the query text below to ask your own question.
bot.query(
    query='scotty from scotland USA',
    k=1,
)