In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import json
import re
import uuid

In [3]:
def pipeline_chunking_corpus(prepared_data_dir):
    """Split each machine's corpus text file into bullet chunks and save them as JSON.

    Expected layout under ``prepared_data_dir`` (as read by this function):

    * ``plant_list.csv`` with columns ``PLANT_TAG`` and ``PLANT_NAME``
    * ``<PLANT_TAG>/machine_list.csv`` with columns ``MACHINE_TAG`` and
      ``MACHINE_NAME``
    * ``<PLANT_TAG>/<MACHINE_TAG>/<PLANT_TAG>_<MACHINE_TAG>_corpus.txt``

    For every machine, each ``"* "`` bullet item found in the corpus text becomes
    one chunk, and the result is written to
    ``<PLANT_TAG>/<MACHINE_TAG>/<PLANT_TAG>_<MACHINE_TAG>_chunks.json``.

    Args:
        prepared_data_dir: Path of the directory that holds the prepared data.

    Returns:
        None. Progress is printed and the results are written to disk.

    Notes:
        * If ``plant_list.csv`` is missing, the function prints an error and
          returns without doing anything.
        * If a plant's ``machine_list.csv`` is missing, that plant is skipped and
          the remaining plants are still processed.
        * If a machine's corpus file is missing, a JSON file with an empty
          ``"chunks"`` list is still written for that machine.
        * Text files are read and written as UTF-8 so that non-ASCII content is
          handled the same way on every platform.
    """
    # Read the plant list.
    plant_list_file = os.path.join(prepared_data_dir, "plant_list.csv")
    if not os.path.isfile(plant_list_file):
        print(":: Failed ❌")
        print(f"File not found: {plant_list_file}")
        return  # Nothing can be processed without the plant list.
    plant_df = pd.read_csv(plant_list_file)

    # A chunk is the text that follows "* " up to the end of its line
    # (or the end of the file for a final line without a trailing newline).
    bullet_list_pattern = re.compile(r"\* (.+?)(?=\n|\Z)")

    for _, plant_row in plant_df.iterrows():
        plant_tag = plant_row["PLANT_TAG"]
        plant_name = plant_row["PLANT_NAME"]

        # Read the machine list for this plant.
        machine_list_file = os.path.join(
            prepared_data_dir, plant_tag, "machine_list.csv"
        )
        if not os.path.isfile(machine_list_file):
            print(":: Failed ❌")
            print(f"File not found: {machine_list_file}")
            # Skip only this plant; the other plants may still be complete.
            continue
        machine_df = pd.read_csv(machine_list_file)

        for idx, machine_row in machine_df.iterrows():
            machine_tag = machine_row["MACHINE_TAG"]
            machine_name = machine_row["MACHINE_NAME"]

            print("\n" + "=" * 100)
            print(
                f"#{idx+1} Processing data for {plant_name} (TAG: {plant_tag}) - {machine_name} (TAG: {machine_tag})"
            )
            print("=" * 100)

            # Load the corpus text for this machine.
            file_name = f"{plant_tag}_{machine_tag}_corpus.txt"
            print(f"\n>> Process: Corpus text dataa - File: {file_name}")
            corpus_file = os.path.join(
                prepared_data_dir, plant_tag, machine_tag, file_name
            )
            if os.path.isfile(corpus_file):
                # Explicit UTF-8: the platform default (e.g. a Windows code page)
                # can mis-decode or reject non-ASCII corpus text.
                with open(corpus_file, "r", encoding="utf-8") as file:
                    corpus_content = file.read()
                print(":: Complete ✔️")
            else:
                print(":: Failed ❌")
                print(f"File not found: {corpus_file}")
                corpus_content = ""

            # Split the content into chunks, one per bullet item.
            print("\n>> Chunks content")
            chunks_content_list = bullet_list_pattern.findall(corpus_content)

            doc_id = f"{plant_tag}_{machine_tag}"

            # Version-3 UUID: deterministic for a given doc_id.
            uuid3 = uuid.uuid3(uuid.NAMESPACE_DNS, doc_id)
            metadata = [{
                "doc_id": doc_id,
                "original_uuid": str(uuid3),
                "corpus_source": corpus_file,
                "chunks": [
                    {
                        "chunk_id": f"{doc_id}_chunk_{chunk_idx}",
                        "original_index": chunk_idx,
                        "content": content,
                    }
                    for chunk_idx, content in enumerate(chunks_content_list)
                ],
            }]
            print(":: Complete ✔️")

            # Save the result next to the corpus file inside prepared_data_dir.
            file_name = f"{plant_tag}_{machine_tag}_chunks.json"

            # Create the target directory if it does not exist yet.
            save_to_dir = os.path.join(prepared_data_dir, plant_tag, machine_tag)
            os.makedirs(save_to_dir, exist_ok=True)

            chunks_content_full_file_path = os.path.join(save_to_dir, file_name)

            # ensure_ascii=False writes raw non-ASCII characters, so the file
            # must be opened with an encoding that can represent them.
            with open(chunks_content_full_file_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            print("\n>> JSON file created successfully.")
            print(f":: File name: {file_name}")
            print(f":: Data successfully saved to: {chunks_content_full_file_path}")
            print(":: Complete ✔️")

            print("=" * 100)
            print("Complete all Process")
            print("=" * 100)


In [4]:
# Set the project root directory.
# Raw string literal: the Windows path contains backslashes (e.g. "\D", "\E")
# that would otherwise be parsed as (invalid) escape sequences.
ROOT_DIRECTORY = r"D:\Data_sci_internship\Exploring Generative AI for Predictive Maintenance Applications"

# Specify the subpath project.
PROJECT_DIRECTORY = "predictive-maintenance-chatbot"
DATA_ROOT_DIRECTORY = "data"
RAW_DATA_DIRECTORY = "raw_data"
PREPARED_DATA_DIRECTORY = "prepared_data"

# <root>/<project>/data/prepared_data
prepared_data_dir = os.path.join(
    ROOT_DIRECTORY, PROJECT_DIRECTORY, DATA_ROOT_DIRECTORY, PREPARED_DATA_DIRECTORY
)

# Run the corpus chunking pipeline on the prepared data directory.
pipeline_chunking_corpus(prepared_data_dir)


#1 Processing data for Natural Gas Processing Plant (TAG: PLANT_01) - Sale Gas Compressor (TAG: COMP_SG01)

>> Process: Corpus text dataa - File: PLANT_01_COMP_SG01_corpus.txt
:: Complete ✔️

>> Chunks content
:: Complete ✔️

>> JSON file created successfully.
:: File name: PLANT_01_COMP_SG01_chunks.json
:: Data successfully saved to: D:\Data_sci_internship\Exploring Generative AI for Predictive Maintenance Applications\predictive-maintenance-chatbot\data\prepared_data\PLANT_01\COMP_SG01\PLANT_01_COMP_SG01_chunks.json
:: Complete ✔️
Complete all Process

#1 Processing data for Everflow Utility Plant (TAG: PLANT_02) - Dual Fuel Generator A (TAG: GEN_DF_01)

>> Process: Corpus text dataa - File: PLANT_02_GEN_DF_01_corpus.txt
:: Complete ✔️

>> Chunks content
:: Complete ✔️

>> JSON file created successfully.
:: File name: PLANT_02_GEN_DF_01_chunks.json
:: Data successfully saved to: D:\Data_sci_internship\Exploring Generative AI for Predictive Maintenance Applications\predictive-mainten