In [None]:
!pip install pandas openpyxl PyPDF2 langchain huggingface_hub faiss-cpu groq sympy transformers pypdf langchain-community langchain-groq


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting groq
  Downloading groq-0.19.0-py3-none-any.whl.metadata (15 kB)
Collecting pypdf
  Downloading pypdf-5.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dat

In [None]:
# Colab-only step: ask the user to upload the input files for this notebook.
# The recorded output shows two files being saved under their original names:
# the guideline PDF and the "Site transfer - EU - data base.xlsx" workbook.
from google.colab import files
# `uploaded` holds whatever files.upload() returns; no later visible cell reads it.
uploaded = files.upload()


Saving CELEX_52013XC0802(04)_EN_TXT (1) GUIDELINE.pdf to CELEX_52013XC0802(04)_EN_TXT (1) GUIDELINE.pdf
Saving Site transfer - EU - data base.xlsx to Site transfer - EU - data base.xlsx


In [None]:
import os
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

In [None]:
def _cell_or_default(row, column, default="Not specified"):
    """Return the row's value for `column`, or `default` if the column is absent or the cell is blank."""
    value = row.get(column, default)
    # Blank Excel cells are read as NaN/None; without this check they would be
    # formatted as the literal text "nan" in the generated descriptions.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def _describe_operation(row, column, prefix=""):
    """Format the description / specific conditions / required documentation columns of one operation."""
    return (
        f"{prefix}Description: {_cell_or_default(row, column)}\n"
        f"{prefix}Specific Conditions: {_cell_or_default(row, column + ': specific conditions')}\n"
        f"{prefix}Required Documentation: {_cell_or_default(row, column + ': required documentation')}"
    )


def load_excel_metadata(file_path="Site transfer - EU - data base.xlsx"):
    """
    Load metadata from the Excel file and structure it into a dictionary.

    Every row of sheet "Feuil1" becomes one entry keyed by its "Country/Region"
    value. Missing columns and blank cells are both reported as "Not specified".
    Rows whose "Country/Region" cell is blank are skipped.

    Parameters:
        file_path (str): Path to the Excel file.

    Returns:
        dict: A structured dictionary containing metadata for each operation.
              An empty dict is returned (after printing the reason) when the
              file cannot be read or has no "Country/Region" column.
    """
    try:
        # Load the Excel file
        excel_data = pd.read_excel(file_path, sheet_name="Feuil1", header=0)
    except Exception as e:
        print(f"Error loading Excel file: {e}")
        return {}

    # Same best-effort contract as a failed load: report and return nothing,
    # instead of raising KeyError on the first row.
    if "Country/Region" not in excel_data.columns:
        print("Error loading Excel file: missing 'Country/Region' column")
        return {}

    metadata = {}

    for _, row in excel_data.iterrows():
        region = row["Country/Region"]
        # A blank region would otherwise become a NaN dictionary key.
        if pd.api.types.is_scalar(region) and pd.isna(region):
            continue

        # Extract and consolidate grouped operations
        addition_replacement = _describe_operation(
            row, "Change of FP manufacturing site (addition/replacement)"
        )
        deletion = _describe_operation(
            row, "Deletion (suppression) of FP manufacturing site"
        )
        packaging_batch_operations = (
            _describe_operation(
                row,
                "Change of FP manufacturing site (for packaging operations)",
                prefix="Packaging Operations ",
            )
            + "\n\n"
            + _describe_operation(
                row,
                "Change of FP manufacturing site (for batch release/control operations)",
                prefix="Batch Release/Control Operations ",
            )
        )

        # Add the structured metadata to the dictionary
        metadata[region] = {
            "Is registration of multiple drug manufacturers allowed?": _cell_or_default(
                row, "Is registration of multiple drug manufacturers allowed?"
            ),
            "Change of FP Manufacturing Site (Addition/Replacement)": addition_replacement,
            "Deletion of FP Manufacturing Site": deletion,
            "Change of FP Manufacturing Site (Packaging/Batch Operations)": packaging_batch_operations,
            "How to submit the documentation?": _cell_or_default(
                row, "How to submit the documentation? (generic information on submission package)"
            ),
            "Timelines": _cell_or_default(row, "Timelines (agency procedural/sponsor implementation)"),
            "Regulatory Summary": _cell_or_default(row, "Regulatory Summary"),
            "Reference Documents": _cell_or_default(row, "Reference Document(s)"),
        }

    return metadata




In [None]:
# Load data from the PDF file
from pypdf import PdfReader
def load_pdf_content(file_path="/content/CELEX_52013XC0802(04)_EN_TXT (1) GUIDELINE.pdf"):
    """
    Read a PDF and return the text of all its pages as one string.

    Parameters:
        file_path (str): Path to the PDF file.

    Returns:
        str: The extracted text of every page, joined with newlines. An empty
             string is returned (after printing the error) if anything goes
             wrong while importing the reader, opening or parsing the file.
    """
    try:
        # Imported inside the try so that a missing pypdf package is reported
        # through the same print-and-return-empty path as a bad file.
        from pypdf import PdfReader

        with open(file_path, "rb") as handle:
            page_texts = []
            for page in PdfReader(handle).pages:
                page_texts.append(page.extract_text())
            return "\n".join(page_texts)
    except Exception as error:
        print(f"Error loading PDF file: {error}")
        return ""

In [None]:
def load_data_from_files():
    """
    Combine content from the Excel and PDF files.

    Every field of every region returned by load_excel_metadata() becomes its
    own Document, so that packaging/batch-release, deletion, timelines and the
    other fields are searchable and not only the addition/replacement text.
    The PDF text is split into overlapping chunks, one Document per chunk.

    Returns:
        list: List of Document objects containing combined content. Excel
              documents come first, followed by the PDF chunks. Each document
              carries a "source" metadata entry ("excel" or "pdf"); Excel
              documents also record their "region" and "field".
    """
    excel_metadata = load_excel_metadata()
    pdf_content = load_pdf_content()

    # Convert Excel metadata to structured text: one document per region and
    # field, prefixed with both so the retrieved text is self-describing.
    excel_documents = [
        Document(
            page_content=f"{region} - {field}:\n{value}",
            metadata={"source": "excel", "region": region, "field": field},
        )
        for region, details in excel_metadata.items()
        for field, value in details.items()
    ]

    # Convert PDF content to documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pdf_documents = [
        Document(page_content=chunk, metadata={"source": "pdf"})
        for chunk in text_splitter.split_text(pdf_content)
    ]

    # Combine Excel and PDF documents
    return excel_documents + pdf_documents

In [None]:
def create_vector_database(documents):
    """
    Create a vector database using FAISS and sentence-transformers embeddings.

    Parameters:
        documents (list): List of Document objects.

    Returns:
        FAISS: Vector database object.

    Raises:
        ValueError: If `documents` is empty or None. Both file loaders in this
                    notebook return empty results on failure, so this is the
                    point where a failed load would otherwise surface as an
                    unrelated indexing error.
    """
    documents = list(documents or [])
    # Check before loading the embedding model so the failure is immediate and
    # the message points at the real cause.
    if not documents:
        raise ValueError(
            "Cannot build a vector database from an empty document list; "
            "check that the Excel and PDF files were loaded correctly."
        )

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(documents, embeddings)

In [None]:
def create_qa_chain(vector_db, api_key, model_name):
    """
    Assemble a RetrievalQA chain backed by a Groq chat model.

    The chain retrieves documents from `vector_db`, places them into a fixed
    answer-format prompt ("stuff" chain type) and also returns the retrieved
    source documents alongside the answer.

    Parameters:
        vector_db (FAISS): Vector database object; its as_retriever() is used.
        api_key (str): API key for Groq LLM.
        model_name (str): Model name for Groq LLM.

    Returns:
        RetrievalQA: QA chain object.
    """
    # Prompt text with {context} and {question} placeholders. The indentation
    # inside the literal is part of the text handed to PromptTemplate.
    answer_format = """You are an AI assistant that provides medical regulatory information.
    Use the following context to answer the question at the end.
    Context: {context}

    Ensure your answers STRICTLY follow this format, replacing placeholders with actual information:

    - Addition of primary packaging site:
      product_type:
      Country:
      climatic_zone:
      classification_number:
      Required conditions:
      variation_type:
      Lead time:
      CMC dossier required:
      CMC dossier type:
      M1 required documents:
      M2 required documents:
      M3 required documents:
      M4 required documents:
      M5 required documents:
      Local Specific required documents:
      Fees requested:
      Samples requirement:
      Valid Period of MA required:

    - Addition of secondary packaging site:
      product_type:
      Country:
      climatic_zone:
      classification_number:
      Required conditions:
      variation_type:
      Lead time:
      CMC dossier required:
      CMC dossier type:
      M1 required documents:
      M2 required documents:
      M3 required documents:
      M4 required documents:
      M5 required documents:
      Local Specific required documents:
      Fees requested:
      Samples requirement:
      Valid Period of MA required:

    - Addition of a manufacturer responsible for batch release (not including batch control/testing):
      product_type:
      Country:
      climatic_zone:
      classification_number:
      Required conditions:
      variation_type:
      Lead time:
      CMC dossier required:
      CMC dossier type:
      M1 required documents:
      M2 required documents:
      M3 required documents:
      M4 required documents:
      M5 required documents:
      Local Specific required documents:
      Fees requested:
      Samples requirement:
      Valid Period of MA required:
    - Addition of a manufacturer responsible for batch release (including batch control/testing):
      product_type:
      Country:
      climatic_zone:
      classification_number:
      Required conditions:
      variation_type:
      Lead time:
      CMC dossier required:
      CMC dossier type:
      M1 required documents:
      M2 required documents:
      M3 required documents:
      M4 required documents:
      M5 required documents:
      Local Specific required documents:
      Fees requested:
      Samples requirement:
      Valid Period of MA required:

    - Grouping:
      product_type:
      Country:
      climatic_zone:
      classification_number:
      Required conditions:
      variation_type:
      Lead time:
      CMC dossier required:
      CMC dossier type:
      M1 required documents:
      M2 required documents:
      M3 required documents:
      M4 required documents:
      M5 required documents:
      Local Specific required documents:
      Fees requested:
      Samples requirement:
      Valid Period of MA required:

    Question: {question}
    Answer:
    """

    # Groq-hosted chat model that will generate the answer
    chat_model = ChatGroq(api_key=api_key, model_name=model_name)

    # Wrap the text so the chain can fill in the two placeholders
    answer_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=answer_format,
    )

    # Collect the chain settings, then build the chain in one call
    chain_settings = {
        "llm": chat_model,
        "chain_type": "stuff",
        "retriever": vector_db.as_retriever(),
        "return_source_documents": True,
        "chain_type_kwargs": {"prompt": answer_prompt},
    }
    return RetrievalQA.from_chain_type(**chain_settings)


def ask_chatbot(qa_chain, question):
    """
    Send one question through the QA chain and return the chain's response.

    Parameters:
        qa_chain (RetrievalQA): QA chain object; its invoke() method is called.
        question (str): The question to ask, passed under the "query" key.

    Returns:
        The object returned by qa_chain.invoke(), unchanged. In the recorded
        run of this notebook it is a dict containing the original 'query' and
        the model's 'result'.
    """
    payload = {"query": question}
    return qa_chain.invoke(payload)




In [None]:
def main():
    """
    Run the end-to-end demo: load the files, index them, build the QA chain,
    ask one sample question and print the chain's response.

    The Groq API key is read from the GROQ_API_KEY environment variable. If
    that is unset or empty, the user is prompted for it without echo. The key
    is never stored in the notebook.
    """
    # Resolve the credential first so a missing key is noticed before the
    # slower loading and embedding steps run.
    # NOTE(review): a key was previously committed in this cell; it should be
    # treated as leaked and revoked.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        from getpass import getpass

        api_key = getpass("Groq API key: ")

    # Step 1: Load and preprocess data
    documents = load_data_from_files()

    # Step 2: Create vector database
    vector_db = create_vector_database(documents)

    # Step 3: Create QA chain with Groq LLM
    model_name = "llama3-8b-8192"
    qa_chain = create_qa_chain(vector_db, api_key, model_name)

    # Step 4: Ask a question
    question = "What are the regulatory requirements in Europe for adding an alternative site for drug product packaging and release for Chemicals (Small Molecules)"
    answer = ask_chatbot(qa_chain, question)
    print(answer)




In [None]:
# Entry point: run the demo only when this code executes as the top-level
# program. The recorded output below this cell shows the guard was satisfied
# in the notebook run, since main() printed the chain's response.
if __name__ == "__main__":
    main()

{'query': 'What are the regulatory requirements in Europe for adding an alternative site for drug product packaging and release for Chemicals (Small Molecules)', 'result': 'Here is the answer:\n\n**Addition of primary packaging site:**\n\n* product_type: Chemicals (Small Molecules)\n* Country: Any country within the European Union (EU) or European Economic Area (EEA)\n* climatic_zone: N/A\n* classification_number: N/A\n* Required conditions: The site must be appropriately authorized for the pharmaceutical form or product concerned.\n* variation_type: Type IB\n* Lead time: N/A (dependent on the complexity of the dossier)\n* CMC dossier required: Yes\n* CMC dossier type: Type II\n* M1 required documents: 1. Proof that the proposed site is appropriately authorized for the pharmaceutical form or product concerned.\n* M2 required documents: 2. Where relevant, the batch numbers, corresponding batch size, and the manufacturing date of batches (> 3) used in the validation study should be indic

In [None]:
def save_vector_database(vector_db, index_path="vector_db.index", metadata_path="vector_db_metadata.pkl", index_to_docstore_id_path="vector_db_index_to_docstore_id.pkl"):
    """
    Persist the three parts of a FAISS vector store as separate files.

    Parameters:
        vector_db (FAISS): Vector database object.
        index_path (str): Destination for the raw FAISS index.
        metadata_path (str): Destination for the pickled docstore dictionary.
        index_to_docstore_id_path (str): Destination for the pickled
            index_to_docstore_id mapping.

    Returns:
        None. A confirmation line is printed once all three files are written.
    """
    import faiss
    import pickle

    # 1) the FAISS index itself, written by faiss
    faiss.write_index(vector_db.index, index_path)

    # 2) the docstore contents (reads the docstore's private `_dict` attribute)
    with open(metadata_path, "wb") as docstore_file:
        docstore_file.write(pickle.dumps(vector_db.docstore._dict))

    # 3) the mapping from index positions to docstore ids
    with open(index_to_docstore_id_path, "wb") as mapping_file:
        mapping_file.write(pickle.dumps(vector_db.index_to_docstore_id))

    print("Vector database saved successfully.")



In [None]:
    # Build the document list and vector database at notebook level and write
    # the database to disk. main() keeps its own copies in local variables, so
    # this cell loads the files and computes the embeddings again.
    # NOTE(review): these statements are indented although no enclosing
    # definition is visible here — confirm the indentation is intentional.

    # Step 1: Load and preprocess data
    documents = load_data_from_files()

    # Step 2: Create vector database
    vector_db = create_vector_database(documents)

    # Step 3: Save the vector database (uses the default output file names)
    save_vector_database(vector_db)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector database saved successfully.
