In [None]:
import os
import shutil
import fitz  # PyMuPDF for PDF processing
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from docx import Document as WordDocument

CHROMA_PATH = "chroma"  # Path to Chroma database
DATA_PATH = "SOPS"  # Path to SOPS directory
MANUAL_FILE = "summit-oa-user-guide-en.pdf"  # Full path to the manual
OUTPUT_FILE = "Generated_SOP_for_Gas_Chromatography.docx"  # Output path for SOP document
API_KEY = "private key"
# Step 1: Load documents
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

# Step 2: Split documents into chunks
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80, length_function=len)
    return text_splitter.split_documents(documents)

# Step 3: Create or update Chroma database
def add_to_chroma(chunks: list[Document]):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())
    db.add_documents(chunks)
    db.persist()

# Embedding function
def get_embedding_function():
    return OpenAIEmbeddings(openai_api_key=API_KEY)

# Step 4: Generate a single SOP section
def generate_sop_section(query_text: str, manual_path: str, section: str):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve relevant chunks
    results = db.similarity_search_with_score(query_text, k=3)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _ in results])

    # Extract manual content
    manual_text = extract_manual_content(manual_path, max_pages=10)
    context_text += f"\n\nManual Content:\n{manual_text[:2000]}"  # Include snippet of manual content

    # Generate the section
    prompt = f"""
    Use the following context to create the '{section}' section for the SOP:
    Context:
    {context_text}

    Section: {section}
    """
    model = ChatOpenAI(model="gpt-4", openai_api_key=API_KEY)
    response = model.invoke(prompt)

    return response.content if response else "Error generating this section"

# Step 5: Extract content from PDF (without images)
def extract_manual_content(pdf_path, max_pages=10):
    doc = fitz.open(pdf_path)
    text = []

    for page_num in range(min(len(doc), max_pages)):
        page = doc[page_num]
        text.append(page.get_text())

    return "\n".join(text)

# Step 6: Save SOP to Word (without images)
def save_sop_to_word(sop_text: str, output_path: str):
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    doc = WordDocument()
    doc.add_heading("Standard Operating Procedure (SOP)", level=1)

    # Split SOP text into sections
    sections = sop_text.split("\n\n")
    for section in sections:
        if ":" in section:
            header, content = section.split(":", 1)
            doc.add_heading(header.strip(), level=2)
            doc.add_paragraph(content.strip())
        else:
            doc.add_paragraph(section.strip())

    doc.save(output_path)
    print(f"SOP successfully saved to {output_path}")

# Step 7: Generate the full SOP
def generate_full_sop(query_text: str, manual_path: str, output_path: str):
    sop_text = ""
    sop_sections = [
        "Scope",
        "Background",
        "Safety",
        "Materials Required",
        "Standards and Controls",
        "Calibration",
        "Procedures",
        "Sampling",
        "Calculations",
        "Uncertainty of Measurement",
        "Limitations",
        "Documentation",
        "References",
    ]

    for section in sop_sections:
        print(f"Generating section: {section}")
        section_text = generate_sop_section(query_text, manual_path, section)
        sop_text += f"\n\n{section}:\n{section_text}"

    save_sop_to_word(sop_text, output_path)
    print("SOP generation completed.")

# Main function
def main():
    print("Step 1: Loading documents...")
    documents = load_documents()

    print("Step 2: Splitting documents into chunks...")
    chunks = split_documents(documents)

    print("Step 3: Adding chunks to Chroma database...")
    add_to_chroma(chunks)

    print("Step 4: Generating SOP for an instrument...")
    instrument_query = "Generate SOP for FTIR Spectrometer for the brand thermo Fisher for polymer"
    generate_full_sop(instrument_query, MANUAL_FILE, OUTPUT_FILE)

if __name__ == "__main__":
    main()

In [16]:
import os
import shutil
import fitz  # PyMuPDF for PDF processing
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from docx import Document as WordDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CHROMA_PATH = "chroma"  # Path to Chroma database
DATA_PATH = "SOPS"  # Path to SOPS directory
HIDDEN_SOP = "GCMS-6.pdf"  # Name of the hidden SOP file
MANUAL_FILE = "c10g-e080.pdf"  # Manual file
OUTPUT_FILE = "Generated_SOP_for_Gas_Chromatography.docx"  # Output file
API_KEY = "private key"

# Step 1: Load documents
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

# Step 2: Split documents into chunks
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80, length_function=len)
    return text_splitter.split_documents(documents)

# Step 3: Create or update Chroma database
def add_to_chroma(chunks: list[Document]):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())
    db.add_documents(chunks)
    db.persist()

# Embedding function
def get_embedding_function():
    return OpenAIEmbeddings(openai_api_key=API_KEY)

# Step 4: Generate a single SOP section
def generate_sop_section(query_text: str, manual_file: str, section: str):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve relevant chunks
    results = db.similarity_search_with_score(query_text, k=3)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _ in results])

    # Extract manual content
    manual_text = extract_manual_content(manual_file, max_pages=10)
    context_text += f"\n\nManual Content:\n{manual_text[:2000]}"  # Include snippet of manual content

    # Generate the section
    prompt = f"""
     # Updated prompt
    prompt = f""
    Use the following context to create a detailed Standard Operating Procedure (SOP). 
    Include comprehensive details for each section, including examples, explanations, and visuals. Don't summarize the details and keep the ideas unique and not redundant. 
        Ensure the content follows these guidelines:
        - Use formal and technical language typical of SOPs.
        - Align the structure and terminology with the provided context.
        - Maintain clarity and conciseness.
        Context:
    Structure:
    1. Scope: Defines the purpose and applicability of the procedure, often focusing on specific uses, such as analyzing air samples, tobacco content, or hydrocarbons in automotive exhaust.
    2. Background: Provides context or rationale for the method being standardized, often referencing related research or industry needs.
    3. Safety: Highlights safety precautions, including the use of personal protective equipment (PPE), handling hazardous chemicals, and waste disposal protocols.
    4. Materials and Equipment: Lists all necessary reagents, solvents, instrumentation, and ancillary tools required for conducting the procedure.
    5. Standards and Calibration: Details the required reference materials, standards for quality control, and steps for instrument calibration to ensure accuracy.
    6. Procedure: Offers step-by-step instructions for conducting the analysis, including sample preparation, injection, instrument operation, and troubleshooting.
    7. Quality Control and Data Validation: Focuses on performance checks, validation criteria, and measures to ensure data reliability.
    8. Data Analysis and Reporting: Describes how results should be processed, analyzed, and documented, often including specific formats or tools.
    9. Limitations and Interferences: Identifies constraints of the method, such as compounds that may not be detected or interferences that could affect results.
    10. Health and Environmental Safety: Emphasizes precautions to protect operators and minimize environmental impact during the procedure.
    11. Documentation and References: Specifies record-keeping requirements and cites foundational literature or external standards that the SOP builds upon.
    12. Special Precautions and Handling: Covers specific handling instructions for sens
    Use the following context to create the '{section}' section for the SOP:
    Context:
    {context_text}

    Section: {section}
    """
    model = ChatOpenAI(model="gpt-4", openai_api_key=API_KEY)
    response = model.invoke(prompt)

    return response.content if response else "Error generating this section"

# Step 5: Extract content from PDF (without images)
def extract_manual_content(pdf_path, max_pages=10):
    doc = fitz.open(pdf_path)
    text = []

    for page_num in range(min(len(doc), max_pages)):
        page = doc[page_num]
        text.append(page.get_text())

    return "\n".join(text)

# Step 6: Save SOP to Word (without images)
def save_sop_to_word(sop_text: str, output_file: str):
    doc = WordDocument()
    doc.add_heading("Standard Operating Procedure (SOP)", level=1)

    # Split SOP text into sections
    sections = sop_text.split("\n\n")
    for section in sections:
        if ":" in section:
            header, content = section.split(":", 1)
            doc.add_heading(header.strip(), level=2)
            doc.add_paragraph(content.strip())
        else:
            doc.add_paragraph(section.strip())

    doc.save(output_file)
    print(f"SOP successfully saved to {output_file}")

# Step 7: Generate the full SOP
def generate_full_sop(query_text: str, manual_file: str, output_file: str):
    sop_text = ""
    sop_sections = [
        "Scope",
        "Background",
        "Safety",
        "Materials Required",
        "Standards and Controls",
        "Calibration",
        "Procedures",
        "Sampling",
        "Calculations",
        "Uncertainty of Measurement",
        "Limitations",
        "Documentation",
        "References",
    ]

    for section in sop_sections:
        print(f"Generating section: {section}")
        section_text = generate_sop_section(query_text, manual_file, section)
        sop_text += f"\n\n{section}:\n{section_text}"

    save_sop_to_word(sop_text, output_file)
    print("SOP generation completed.")

# Step 8: Hide one SOP
def hide_sop(sop_name: str, source_dir: str, hidden_dir: str):
    sop_path = os.path.join(source_dir, sop_name)
    hidden_path = os.path.join(hidden_dir, sop_name)

    if not os.path.exists(hidden_dir):
        os.makedirs(hidden_dir)

    if os.path.exists(sop_path):
        try:
            shutil.copy(sop_path, hidden_path)
            os.remove(sop_path)
            print(f"SOP '{sop_name}' has been hidden.")
        except PermissionError as e:
            print(f"Permission error: {e}. Ensure the file is not open in another program.")
        except Exception as e:
            print(f"An unexpected error occurred while hiding SOP: {e}")
    else:
        print(f"SOP '{sop_name}' not found in '{source_dir}'.")

# Step 9: Extract and Compare File Content
def extract_text_from_docx(docx_path):
    doc = WordDocument(docx_path)
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return "\n".join(text)

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return "\n".join(text)

def calculate_similarity(file1_path: str, file2_path: str):
    if file1_path.endswith('.docx'):
        file1_content = extract_text_from_docx(file1_path)
    elif file1_path.endswith('.pdf'):
        file1_content = extract_text_from_pdf(file1_path)
    else:
        raise ValueError(f"Unsupported file format: {file1_path}")

    if file2_path.endswith('.docx'):
        file2_content = extract_text_from_docx(file2_path)
    elif file2_path.endswith('.pdf'):
        file2_content = extract_text_from_pdf(file2_path)
    else:
        raise ValueError(f"Unsupported file format: {file2_path}")

    vectorizer = TfidfVectorizer().fit_transform([file1_content, file2_content])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors)
    return similarity[0, 1]

# Main function
def main():
    hide_sop(HIDDEN_SOP, DATA_PATH, "hidden_sops")

    print("Step 1: Loading documents...")
    documents = load_documents()

    print("Step 2: Splitting documents into chunks...")
    chunks = split_documents(documents)

    print("Step 3: Adding chunks to Chroma database...")
    add_to_chroma(chunks)

    print("Step 4: Generating SOP for an instrument...")
    instrument_query = "Write the SOP for Shimadzu Gas chromatography mass spectrometer QP2010 SE Standard Operating Procedure"
    generate_full_sop(instrument_query, MANUAL_FILE, OUTPUT_FILE)

    hidden_sop_path = os.path.join("hidden_sops", HIDDEN_SOP)
    similarity_score = calculate_similarity(OUTPUT_FILE, hidden_sop_path)
    print(f"Similarity score between generated and actual SOP: {similarity_score:.2f}")

if __name__ == "__main__":
    main()


SOP 'GCMS-6.pdf' not found in 'SOPS'.
Step 1: Loading documents...


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)


Step 2: Splitting documents into chunks...
Step 3: Adding chunks to Chroma database...
Step 4: Generating SOP for an instrument...
Generating section: Scope
Generating section: Background
Generating section: Safety
Generating section: Materials Required
Generating section: Standards and Controls
Generating section: Calibration
Generating section: Procedures
Generating section: Sampling
Generating section: Calculations
Generating section: Uncertainty of Measurement
Generating section: Limitations
Generating section: Documentation
Generating section: References
SOP successfully saved to Generated_SOP_for_Gas_Chromatography.docx
SOP generation completed.
Similarity score between generated and actual SOP: 0.80


In [5]:
import os
import shutil
import fitz  # PyMuPDF for PDF processing
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from docx import Document as WordDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CHROMA_PATH = "chroma"  # Path to Chroma database
DATA_PATH = "SOPS"  # Path to SOPS directory
HIDDEN_SOP = "GCMS-6.pdf"  # Name of the hidden SOP file
MANUAL_FILE = "c10g-e080.pdf"  # Manual file
OUTPUT_FILE = "Generated_SOP_for_Gas_Chromatography.docx"  # Output file
API_KEY = "private key"
# Step 1: Load documents
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

# Step 2: Split documents into chunks
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80, length_function=len)
    return text_splitter.split_documents(documents)

# Step 3: Create or update Chroma database
def add_to_chroma(chunks: list[Document]):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())
    db.add_documents(chunks)
    db.persist()

# Embedding function
def get_embedding_function():
    return OpenAIEmbeddings(openai_api_key=API_KEY)

# Step 4: Generate a single SOP section
def generate_sop_section(query_text: str, manual_file: str, section: str):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve relevant chunks
    results = db.similarity_search_with_score(query_text, k=3)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _ in results])

    # Extract manual content
    manual_text = extract_manual_content(manual_file, max_pages=10)
    context_text += f"\n\nManual Content:\n{manual_text[:2000]}"  # Include snippet of manual content

    # Generate the section
    prompt = f"""
     # Updated prompt
    prompt = f""
    Use the following context to create a detailed Standard Operating Procedure (SOP). 
    Include comprehensive details for each section, including examples, explanations, and visuals. Don't summarize the details and keep the ideas unique and not redundant. 
        Ensure the content follows these guidelines:
        - Use formal and technical language typical of SOPs.
        - Align the structure and terminology with the provided context.
        - Maintain clarity and conciseness.
        Context:
    Structure:
    1. Scope: Defines the purpose and applicability of the procedure, often focusing on specific uses, such as analyzing air samples, tobacco content, or hydrocarbons in automotive exhaust.
    2. Background: Provides context or rationale for the method being standardized, often referencing related research or industry needs.
    3. Safety: Highlights safety precautions, including the use of personal protective equipment (PPE), handling hazardous chemicals, and waste disposal protocols.
    4. Materials and Equipment: Lists all necessary reagents, solvents, instrumentation, and ancillary tools required for conducting the procedure.
    5. Standards and Calibration: Details the required reference materials, standards for quality control, and steps for instrument calibration to ensure accuracy.
    6. Procedure: Offers step-by-step instructions for conducting the analysis, including sample preparation, injection, instrument operation, and troubleshooting.
    7. Quality Control and Data Validation: Focuses on performance checks, validation criteria, and measures to ensure data reliability.
    8. Data Analysis and Reporting: Describes how results should be processed, analyzed, and documented, often including specific formats or tools.
    9. Limitations and Interferences: Identifies constraints of the method, such as compounds that may not be detected or interferences that could affect results.
    10. Health and Environmental Safety: Emphasizes precautions to protect operators and minimize environmental impact during the procedure.
    11. Documentation and References: Specifies record-keeping requirements and cites foundational literature or external standards that the SOP builds upon.
    12. Special Precautions and Handling: Covers specific handling instructions for sens
    Use the following context to create the '{section}' section for the SOP:
    Context:
    {context_text}

    Section: {section}
    """
    model = ChatOpenAI(model="gpt-4", openai_api_key=API_KEY)
    response = model.invoke(prompt)

    return response.content if response else "Error generating this section"

# Step 5: Extract content from PDF (without images)
def extract_manual_content(pdf_path, max_pages=10):
    doc = fitz.open(pdf_path)
    text = []

    for page_num in range(min(len(doc), max_pages)):
        page = doc[page_num]
        text.append(page.get_text())

    return "\n".join(text)

# Step 6: Save SOP to Word (without images)
def save_sop_to_word(sop_text: str, output_file: str):
    doc = WordDocument()
    doc.add_heading("Standard Operating Procedure (SOP)", level=1)

    # Split SOP text into sections
    sections = sop_text.split("\n\n")
    for section in sections:
        if ":" in section:
            header, content = section.split(":", 1)
            doc.add_heading(header.strip(), level=2)
            doc.add_paragraph(content.strip())
        else:
            doc.add_paragraph(section.strip())

    doc.save(output_file)
    print(f"SOP successfully saved to {output_file}")

# Step 7: Generate the full SOP
def generate_full_sop(query_text: str, manual_file: str, output_file: str):
    sop_text = ""
    sop_sections = [
        "Scope",
        "Background",
        "Safety",
        "Materials Required",
        "Standards and Controls",
        "Calibration",
        "Procedures",
        "Sampling",
        "Calculations",
        "Uncertainty of Measurement",
        "Limitations",
        "Documentation",
        "References",
    ]

    for section in sop_sections:
        print(f"Generating section: {section}")
        section_text = generate_sop_section(query_text, manual_file, section)
        sop_text += f"\n\n{section}:\n{section_text}"

    save_sop_to_word(sop_text, output_file)
    print("SOP generation completed.")

# Step 8: Hide one SOP
def hide_sop(sop_name: str, source_dir: str, hidden_dir: str):
    sop_path = os.path.join(source_dir, sop_name)
    hidden_path = os.path.join(hidden_dir, sop_name)

    if not os.path.exists(hidden_dir):
        os.makedirs(hidden_dir)

    if os.path.exists(sop_path):
        try:
            shutil.copy(sop_path, hidden_path)
            os.remove(sop_path)
            print(f"SOP '{sop_name}' has been hidden.")
        except PermissionError as e:
            print(f"Permission error: {e}. Ensure the file is not open in another program.")
        except Exception as e:
            print(f"An unexpected error occurred while hiding SOP: {e}")
    else:
        print(f"SOP '{sop_name}' not found in '{source_dir}'.")

# Step 9: Extract and Compare File Content
def extract_text_from_docx(docx_path):
    doc = WordDocument(docx_path)
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return "\n".join(text)

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return "\n".join(text)

def calculate_similarity(file1_path: str, file2_path: str):
    if file1_path.endswith('.docx'):
        file1_content = extract_text_from_docx(file1_path)
    elif file1_path.endswith('.pdf'):
        file1_content = extract_text_from_pdf(file1_path)
    else:
        raise ValueError(f"Unsupported file format: {file1_path}")

    if file2_path.endswith('.docx'):
        file2_content = extract_text_from_docx(file2_path)
    elif file2_path.endswith('.pdf'):
        file2_content = extract_text_from_pdf(file2_path)
    else:
        raise ValueError(f"Unsupported file format: {file2_path}")

    vectorizer = TfidfVectorizer().fit_transform([file1_content, file2_content])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors)
    return similarity[0, 1]

# Main function
def main():
    hide_sop(HIDDEN_SOP, DATA_PATH, "hidden_sops")

    print("Step 1: Loading documents...")
    documents = load_documents()

    print("Step 2: Splitting documents into chunks...")
    chunks = split_documents(documents)

    print("Step 3: Adding chunks to Chroma database...")
    add_to_chroma(chunks)

    print("Step 4: Generating SOP for an instrument...")
    instrument_query = "Write the SOP for Shimadzu Gas chromatography mass spectrometer QP2010 SE Standard Operating Procedure"
    generate_full_sop(instrument_query, MANUAL_FILE, OUTPUT_FILE)

    hidden_sop_path = os.path.join("hidden_sops", HIDDEN_SOP)
    similarity_score = calculate_similarity(OUTPUT_FILE, hidden_sop_path)
    print(f"Similarity score between generated and actual SOP: {similarity_score:.2f}")

if __name__ == "__main__":
    main()




SOP 'GCMS-6.pdf' not found in 'SOPS'.
Step 1: Loading documents...


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
invalid pdf header: b'\r\n%PD'
incorrect startxref pointer(1)
parsing for Object Streams
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)


Step 2: Splitting documents into chunks...
Step 3: Adding chunks to Chroma database...


  return OpenAIEmbeddings(openai_api_key=API_KEY)
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())
  db.persist()


Step 4: Generating SOP for an instrument...
Generating section: Scope


  model = ChatOpenAI(model="gpt-4", openai_api_key=API_KEY)


Generating section: Background
Generating section: Safety
Generating section: Materials Required
Generating section: Standards and Controls
Generating section: Calibration
Generating section: Procedures
Generating section: Sampling
Generating section: Calculations
Generating section: Uncertainty of Measurement
Generating section: Limitations
Generating section: Documentation
Generating section: References
SOP successfully saved to Generated_SOP_for_Gas_Chromatography.docx
SOP generation completed.
Similarity score between generated and actual SOP: 0.83


In [6]:
import os
import shutil
import fitz  # PyMuPDF for PDF processing
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from docx import Document as WordDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CHROMA_PATH = "chroma"  # Path to Chroma database
DATA_PATH = "SOPS"  # Path to SOPS directory
HIDDEN_SOP = "SOP_FTIR.pdf"  # Name of the hidden SOP file
MANUAL_FILE = ""  # Manual file
OUTPUT_FILE = "Genrated_SOP_FTIR.docx"  # Output file
API_KEY = "private key"

# Step 1: Load documents
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

# Step 2: Split documents into chunks
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80, length_function=len)
    return text_splitter.split_documents(documents)

# Step 3: Create or update Chroma database
def add_to_chroma(chunks: list[Document]):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())
    db.add_documents(chunks)
    db.persist()

# Embedding function
def get_embedding_function():
    return OpenAIEmbeddings(openai_api_key=API_KEY)

# Step 4: Generate a single SOP section
def generate_sop_section(query_text: str, manual_file: str, section: str):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve relevant chunks
    results = db.similarity_search_with_score(query_text, k=3)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _ in results])

    # Extract manual content
    manual_text = extract_manual_content(manual_file, max_pages=10)
    context_text += f"\n\nManual Content:\n{manual_text[:2000]}"  # Include snippet of manual content

    # Generate the section
    prompt = f"""
     # Updated prompt
    prompt = f""
    Use the following context to create a detailed Standard Operating Procedure (SOP). 
    Include comprehensive details for each section, including examples, explanations, and visuals. Don't summarize the details and keep the ideas unique and not redundant. 
        Ensure the content follows these guidelines:
        - Use formal and technical language typical of SOPs.
        - Align the structure and terminology with the provided context.
        - Maintain clarity and conciseness.
        Context:
    Structure:
    1. Scope: Defines the purpose and applicability of the procedure, often focusing on specific uses, such as analyzing air samples, tobacco content, or hydrocarbons in automotive exhaust.
    2. Background: Provides context or rationale for the method being standardized, often referencing related research or industry needs.
    3. Safety: Highlights safety precautions, including the use of personal protective equipment (PPE), handling hazardous chemicals, and waste disposal protocols.
    4. Materials and Equipment: Lists all necessary reagents, solvents, instrumentation, and ancillary tools required for conducting the procedure.
    5. Standards and Calibration: Details the required reference materials, standards for quality control, and steps for instrument calibration to ensure accuracy.
    6. Procedure: Offers step-by-step instructions for conducting the analysis, including sample preparation, injection, instrument operation, and troubleshooting.
    7. Quality Control and Data Validation: Focuses on performance checks, validation criteria, and measures to ensure data reliability.
    8. Data Analysis and Reporting: Describes how results should be processed, analyzed, and documented, often including specific formats or tools.
    9. Limitations and Interferences: Identifies constraints of the method, such as compounds that may not be detected or interferences that could affect results.
    10. Health and Environmental Safety: Emphasizes precautions to protect operators and minimize environmental impact during the procedure.
    11. Documentation and References: Specifies record-keeping requirements and cites foundational literature or external standards that the SOP builds upon.
    12. Special Precautions and Handling: Covers specific handling instructions for sens
    Use the following context to create the '{section}' section for the SOP:
    Context:
    {context_text}

    Section: {section}
    """
    model = ChatOpenAI(model="gpt-4", openai_api_key=API_KEY)
    response = model.invoke(prompt)

    return response.content if response else "Error generating this section"

# Step 5: Extract content from PDF (without images)
def extract_manual_content(pdf_path, max_pages=10):
    doc = fitz.open(pdf_path)
    text = []

    for page_num in range(min(len(doc), max_pages)):
        page = doc[page_num]
        text.append(page.get_text())

    return "\n".join(text)

# Step 6: Save SOP to Word (without images)
def save_sop_to_word(sop_text: str, output_file: str):
    doc = WordDocument()
    doc.add_heading("Standard Operating Procedure (SOP)", level=1)

    # Split SOP text into sections
    sections = sop_text.split("\n\n")
    for section in sections:
        if ":" in section:
            header, content = section.split(":", 1)
            doc.add_heading(header.strip(), level=2)
            doc.add_paragraph(content.strip())
        else:
            doc.add_paragraph(section.strip())

    doc.save(output_file)
    print(f"SOP successfully saved to {output_file}")

# Step 7: Generate the full SOP
def generate_full_sop(query_text: str, manual_file: str, output_file: str):
    sop_text = ""
    sop_sections = [
        "Scope",
        "Background",
        "Safety",
        "Materials Required",
        "Standards and Controls",
        "Calibration",
        "Procedures",
        "Sampling",
        "Calculations",
        "Uncertainty of Measurement",
        "Limitations",
        "Documentation",
        "References",
    ]

    for section in sop_sections:
        print(f"Generating section: {section}")
        section_text = generate_sop_section(query_text, manual_file, section)
        sop_text += f"\n\n{section}:\n{section_text}"

    save_sop_to_word(sop_text, output_file)
    print("SOP generation completed.")

# Step 8: Hide one SOP
def hide_sop(sop_name: str, source_dir: str, hidden_dir: str):
    sop_path = os.path.join(source_dir, sop_name)
    hidden_path = os.path.join(hidden_dir, sop_name)

    if not os.path.exists(hidden_dir):
        os.makedirs(hidden_dir)

    if os.path.exists(sop_path):
        try:
            shutil.copy(sop_path, hidden_path)
            os.remove(sop_path)
            print(f"SOP '{sop_name}' has been hidden.")
        except PermissionError as e:
            print(f"Permission error: {e}. Ensure the file is not open in another program.")
        except Exception as e:
            print(f"An unexpected error occurred while hiding SOP: {e}")
    else:
        print(f"SOP '{sop_name}' not found in '{source_dir}'.")

# Step 9: Extract and Compare File Content
def extract_text_from_docx(docx_path):
    doc = WordDocument(docx_path)
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return "\n".join(text)

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return "\n".join(text)

def calculate_similarity(file1_path: str, file2_path: str):
    if file1_path.endswith('.docx'):
        file1_content = extract_text_from_docx(file1_path)
    elif file1_path.endswith('.pdf'):
        file1_content = extract_text_from_pdf(file1_path)
    else:
        raise ValueError(f"Unsupported file format: {file1_path}")

    if file2_path.endswith('.docx'):
        file2_content = extract_text_from_docx(file2_path)
    elif file2_path.endswith('.pdf'):
        file2_content = extract_text_from_pdf(file2_path)
    else:
        raise ValueError(f"Unsupported file format: {file2_path}")

    vectorizer = TfidfVectorizer().fit_transform([file1_content, file2_content])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors)
    return similarity[0, 1]

# Main function
def main():
    hide_sop(HIDDEN_SOP, DATA_PATH, "hidden_sops")

    print("Step 1: Loading documents...")
    documents = load_documents()

    print("Step 2: Splitting documents into chunks...")
    chunks = split_documents(documents)

    print("Step 3: Adding chunks to Chroma database...")
    add_to_chroma(chunks)

    print("Step 4: Generating SOP for an instrument...")
    instrument_query = "Write the SOP for the Perkin Elmer model 1600 FTIR."
    generate_full_sop(instrument_query, MANUAL_FILE, OUTPUT_FILE)

    hidden_sop_path = os.path.join("hidden_sops", HIDDEN_SOP)
    similarity_score = calculate_similarity(OUTPUT_FILE, hidden_sop_path)
    print(f"Similarity score between generated and actual SOP: {similarity_score:.2f}")

if __name__ == "__main__":
    main()




SOP 'SOP_FTIR.pdf' has been hidden.
Step 1: Loading documents...


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
invalid pdf header: b'\r\n%PD'
incorrect startxref pointer(1)
parsing for Object Streams
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)


Step 2: Splitting documents into chunks...
Step 3: Adding chunks to Chroma database...
Step 4: Generating SOP for an instrument...
Generating section: Scope
Generating section: Background
Generating section: Safety
Generating section: Materials Required
Generating section: Standards and Controls
Generating section: Calibration
Generating section: Procedures
Generating section: Sampling
Generating section: Calculations
Generating section: Uncertainty of Measurement
Generating section: Limitations
Generating section: Documentation
Generating section: References
SOP successfully saved to Genrated_SOP_FTIR.docx
SOP generation completed.
Similarity score between generated and actual SOP: 0.79


In [11]:
import os
import shutil
import fitz  # PyMuPDF for PDF processing
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from docx import Document as WordDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CHROMA_PATH = "chroma"  # Path to Chroma database
DATA_PATH = "SOPS"  # Path to SOPS directory
HIDDEN_SOP = "ICPMS_SOPs_Liquids.pdf"  # Name of the hidden SOP file
MANUAL_FILE = "8510230100_700SeriesICP_UserManual.pdf"  # Manual file
OUTPUT_FILE = "Generated_SOP_for_ICP.docx"  # Output file
API_KEY = "private key"

# Step 1: Load documents
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

# Step 2: Split documents into chunks
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80, length_function=len)
    return text_splitter.split_documents(documents)

# Step 3: Create or update Chroma database
def add_to_chroma(chunks: list[Document]):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())
    db.add_documents(chunks)
    db.persist()

# Embedding function
def get_embedding_function():
    return OpenAIEmbeddings(openai_api_key=API_KEY)

# Step 4: Generate a single SOP section
def generate_sop_section(query_text: str, manual_file: str, section: str):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve relevant chunks
    results = db.similarity_search_with_score(query_text, k=3)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _ in results])

    # Extract manual content
    manual_text = extract_manual_content(manual_file, max_pages=10)
    context_text += f"\n\nManual Content:\n{manual_text[:2000]}"  # Include snippet of manual content

    # Generate the section
    prompt = f"""
     # Updated prompt
    prompt = f""
    Use the following context to create a detailed Standard Operating Procedure (SOP). 
    Include comprehensive details for each section, including examples, explanations, and visuals. Don't summarize the details and keep the ideas unique and not redundant. 
        Ensure the content follows these guidelines:
        - Use formal and technical language typical of SOPs.
        - Align the structure and terminology with the provided context.
        - Maintain clarity and conciseness.
        Context:
    Structure:
    1. Scope: Defines the purpose and applicability of the procedure, often focusing on specific uses, such as analyzing air samples, tobacco content, or hydrocarbons in automotive exhaust.
    2. Background: Provides context or rationale for the method being standardized, often referencing related research or industry needs.
    3. Safety: Highlights safety precautions, including the use of personal protective equipment (PPE), handling hazardous chemicals, and waste disposal protocols.
    4. Materials and Equipment: Lists all necessary reagents, solvents, instrumentation, and ancillary tools required for conducting the procedure.
    5. Standards and Calibration: Details the required reference materials, standards for quality control, and steps for instrument calibration to ensure accuracy.
    6. Procedure: Offers step-by-step instructions for conducting the analysis, including sample preparation, injection, instrument operation, and troubleshooting.
    7. Quality Control and Data Validation: Focuses on performance checks, validation criteria, and measures to ensure data reliability.
    8. Data Analysis and Reporting: Describes how results should be processed, analyzed, and documented, often including specific formats or tools.
    9. Limitations and Interferences: Identifies constraints of the method, such as compounds that may not be detected or interferences that could affect results.
    10. Health and Environmental Safety: Emphasizes precautions to protect operators and minimize environmental impact during the procedure.
    11. Documentation and References: Specifies record-keeping requirements and cites foundational literature or external standards that the SOP builds upon.
    12. Special Precautions and Handling: Covers specific handling instructions for sens
    Use the following context to create the '{section}' section for the SOP:
    Context:
    {context_text}

    Section: {section}
    """
    model = ChatOpenAI(model="gpt-4", openai_api_key=API_KEY)
    response = model.invoke(prompt)

    return response.content if response else "Error generating this section"

# Step 5: Extract content from PDF (without images)
def extract_manual_content(pdf_path, max_pages=10):
    doc = fitz.open(pdf_path)
    text = []

    for page_num in range(min(len(doc), max_pages)):
        page = doc[page_num]
        text.append(page.get_text())

    return "\n".join(text)

# Step 6: Save SOP to Word (without images)
def save_sop_to_word(sop_text: str, output_file: str):
    doc = WordDocument()
    doc.add_heading("Standard Operating Procedure (SOP)", level=1)

    # Split SOP text into sections
    sections = sop_text.split("\n\n")
    for section in sections:
        if ":" in section:
            header, content = section.split(":", 1)
            doc.add_heading(header.strip(), level=2)
            doc.add_paragraph(content.strip())
        else:
            doc.add_paragraph(section.strip())

    doc.save(output_file)
    print(f"SOP successfully saved to {output_file}")

# Step 7: Generate the full SOP
def generate_full_sop(query_text: str, manual_file: str, output_file: str):
    sop_text = ""
    sop_sections = [
        "Scope",
        "Background",
        "Safety",
        "Materials Required",
        "Standards and Controls",
        "Calibration",
        "Procedures",
        "Sampling",
        "Calculations",
        "Uncertainty of Measurement",
        "Limitations",
        "Documentation",
        "References",
    ]

    for section in sop_sections:
        print(f"Generating section: {section}")
        section_text = generate_sop_section(query_text, manual_file, section)
        sop_text += f"\n\n{section}:\n{section_text}"

    save_sop_to_word(sop_text, output_file)
    print("SOP generation completed.")

# Step 8: Hide one SOP
def hide_sop(sop_name: str, source_dir: str, hidden_dir: str):
    sop_path = os.path.join(source_dir, sop_name)
    hidden_path = os.path.join(hidden_dir, sop_name)

    if not os.path.exists(hidden_dir):
        os.makedirs(hidden_dir)

    if os.path.exists(sop_path):
        try:
            shutil.copy(sop_path, hidden_path)
            os.remove(sop_path)
            print(f"SOP '{sop_name}' has been hidden.")
        except PermissionError as e:
            print(f"Permission error: {e}. Ensure the file is not open in another program.")
        except Exception as e:
            print(f"An unexpected error occurred while hiding SOP: {e}")
    else:
        print(f"SOP '{sop_name}' not found in '{source_dir}'.")

# Step 9: Extract and Compare File Content
def extract_text_from_docx(docx_path):
    doc = WordDocument(docx_path)
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return "\n".join(text)

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return "\n".join(text)

def calculate_similarity(file1_path: str, file2_path: str):
    if file1_path.endswith('.docx'):
        file1_content = extract_text_from_docx(file1_path)
    elif file1_path.endswith('.pdf'):
        file1_content = extract_text_from_pdf(file1_path)
    else:
        raise ValueError(f"Unsupported file format: {file1_path}")

    if file2_path.endswith('.docx'):
        file2_content = extract_text_from_docx(file2_path)
    elif file2_path.endswith('.pdf'):
        file2_content = extract_text_from_pdf(file2_path)
    else:
        raise ValueError(f"Unsupported file format: {file2_path}")

    vectorizer = TfidfVectorizer().fit_transform([file1_content, file2_content])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors)
    return similarity[0, 1]

# Main function
def main():
    hide_sop(HIDDEN_SOP, DATA_PATH, "hidden_sops")

    print("Step 1: Loading documents...")
    documents = load_documents()

    print("Step 2: Splitting documents into chunks...")
    chunks = split_documents(documents)

    print("Step 3: Adding chunks to Chroma database...")
    add_to_chroma(chunks)

    print("Step 4: Generating SOP for an instrument...")
    instrument_query = "Write the SOP for ICPMS Liquids"
    generate_full_sop(instrument_query, MANUAL_FILE, OUTPUT_FILE)

    hidden_sop_path = os.path.join("hidden_sops", HIDDEN_SOP)
    similarity_score = calculate_similarity(OUTPUT_FILE, hidden_sop_path)
    print(f"Similarity score between generated and actual SOP: {similarity_score:.2f}")

if __name__ == "__main__":
    main()




SOP 'ICPMS_SOPs_Liquids.pdf' not found in 'SOPS'.
Step 1: Loading documents...


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
invalid pdf header: b'\r\n%PD'
incorrect startxref pointer(1)
parsing for Object Streams
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)


Step 2: Splitting documents into chunks...
Step 3: Adding chunks to Chroma database...
Step 4: Generating SOP for an instrument...
Generating section: Scope
Generating section: Background
Generating section: Safety
Generating section: Materials Required
Generating section: Standards and Controls
Generating section: Calibration
Generating section: Procedures
Generating section: Sampling
Generating section: Calculations
Generating section: Uncertainty of Measurement
Generating section: Limitations
Generating section: Documentation
Generating section: References
SOP successfully saved to Generated_SOP_for_ICP.docx
SOP generation completed.
Similarity score between generated and actual SOP: 0.82
