# Import All Required Libraries and Packages

In [None]:
import os
import warnings
# Suppress every warning category from this point on.
warnings.filterwarnings("ignore")
# The blanket filter above already matches DeprecationWarning (its category
# defaults to the base Warning class), so this call adds no extra suppression.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from RAG_v1 import *

# Instantiate the Chatbot Class Object to:
## - Ingest Data into the Vector DB of the Respective Country

In [None]:
# Create the chatbot object. The first argument lists the collection names,
# which are passed exactly as is.
RAG_App_Object = RAG_Bot(
    ['Uk', 'Wales', 'NothernIreland', 'Scotland'],
    text_splitter='SpaCy',
    embedding_model="SentenceTransformers",
)

# Announce the check, then ask the vector DB wrapper to validate its collections.
print('\nValidating the liveness of the collections:\n')
RAG_App_Object.vector_db.validate_collection()

# Empty the Vector DB to Make Sure It Holds No Old Data Before Ingesting New Data into It

In [None]:
# RAG_App_Object.vector_db.delete_all_collections()

# Define Some Important Paths for the Ingestion Function/Code To Use

In [None]:
# Root folder of the scraped content, relative to the working directory.
path_scraped_content_home = './Scrapper/Scraped_Content'
# Bare expression so the notebook displays the resolved value.
path_scraped_content_home

## All Required Functions for Ingestion

In [None]:
def read_file_text_content(file_path: str = None, encoding: str = 'utf-8') -> str:
    """Return the full text content of the file at *file_path*.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The entire file content as a single string.

    Raises:
        ValueError: If *file_path* is None.
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content cannot be decoded with *encoding*.
    """
    if file_path is None:
        raise ValueError('File path cannot be None. Check the txt file path')
    # An explicit encoding keeps the decoded text identical across operating systems.
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

# Cell that Iterates over the Scraped Content and Adds the Files to the Vector DB

In [None]:
log_file = 'processed_files.log'


def _list_subdirs(parent):
    """Return the sorted names of the directories directly inside *parent*."""
    # Stray non-directory entries (e.g. OS metadata files) are skipped so that
    # a later os.listdir() on them cannot abort the whole ingestion run.
    return sorted(
        entry for entry in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, entry))
    )


def _iter_scraped_files(root):
    """Yield (Country, LegislationType, Legislation, Year, File, Path_File).

    Walks the fixed layout root/Country/LegislationType/Legislation/Year/File
    in sorted order so that repeated runs visit files deterministically.
    """
    for country_dir in _list_subdirs(root):
        country_path = os.path.join(root, country_dir)
        for leg_type in _list_subdirs(country_path):
            leg_type_path = os.path.join(country_path, leg_type)
            for legislation in _list_subdirs(leg_type_path):
                legislation_path = os.path.join(leg_type_path, legislation)
                for year in _list_subdirs(legislation_path):
                    year_path = os.path.join(legislation_path, year)
                    for file_name in sorted(os.listdir(year_path)):
                        yield (
                            country_dir,
                            leg_type,
                            legislation,
                            year,
                            file_name,
                            os.path.join(year_path, file_name),
                        )


# Load the identifiers of already processed files so a re-run resumes
# where the previous one stopped.
if os.path.exists(log_file):
    with open(log_file, 'r') as f:
        processed_files = set(f.read().splitlines())
else:
    processed_files = set()

with open(log_file, 'a') as f:  # Append so earlier log entries are preserved
    for Country, LegislationType, Legislation, Year, File, Path_File in _iter_scraped_files(
        path_scraped_content_home
    ):
        # The scraped folder is named 'UK' but the collection is named 'Uk'.
        country = 'Uk' if Country == 'UK' else Country

        # Unique identifier for the file; this is the format stored in the log.
        file_id = f"{country}|{LegislationType}|{Legislation}|{Year}|{File}"

        # Skip files that a previous run already ingested.
        if file_id in processed_files:
            continue

        try:
            RAG_App_Object.add_text(
                collection_name=country,
                text=read_file_text_content(Path_File),
                metadata={
                    'Country': country,
                    'LegislationType': LegislationType,
                    'Legislation': Legislation,
                    'Year': Year,
                    'Title': File,
                },
            )
            # Record the file only after it was added successfully.
            f.write(f'{file_id}\n')
            f.flush()  # Persist immediately so an interrupted run loses nothing
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {file_id}: {str(e)}")
