In [7]:
!/opt/anaconda3/bin/python -m pip install python-docx

python(46544) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting python-docx
  Using cached python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Using cached python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [2]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [1]:
%pip install -Uq "unstructured[all-docs]" pillow lxml 

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests 
import json
import docx
import os
import pandas as pd
import json
from tabulate import tabulate
import signal
from contextlib import contextmanager

In [3]:
# Smoke test: send a single non-streaming prompt to the local Ollama HTTP
# endpoint and print the generated text.
url = "http://127.0.0.1:11434/api/generate"

payload = {
    "model": "llama3.1",
    "prompt": "Hello! Who are you?",
    "stream": False,
}

try:
    # Without a timeout, requests waits indefinitely when the server accepts
    # the connection but never answers, which would hang this cell. Timeout
    # errors are RequestException subclasses, so they are reported below.
    response = requests.post(url, json=payload, timeout=200)
    response.raise_for_status()
    result = response.json()
    print(result["response"])
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."


In [2]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.response_synthesizers import CompactAndRefine 
from llama_index.core.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#!brew install poppler tesseract libmagic

In [4]:
#%pip uninstall -y charset_normalizer pdfminer.six unstructured unstructured-inference unstructured-client

In [5]:
#%pip install -Uq "unstructured[all-docs]" "unstructured[pdf]" pdfminer.six lxml Pillow

In [6]:
from unstructured.partition.pdf import partition_pdf

ModuleNotFoundError: No module named 'charset_normalizer'

In [7]:
def extract_text_from_docx(file_path):
    """Extracts text from a DOCX file, including text stored in tables.

    Paragraph text is emitted first, one line per paragraph. The rows of
    every table in ``doc.tables`` follow, one line per row, with the cell
    texts joined by " | ". Documents without tables produce exactly the
    paragraph text.

    Args:
        file_path: Path of the DOCX file to open with ``docx.Document``.

    Returns:
        The extracted text as a string, or None when the file could not be
        read (an error message is printed in that case).
    """
    try:
        doc = docx.Document(file_path)
        lines = [paragraph.text for paragraph in doc.paragraphs]
        # doc.paragraphs does not include table cells, so a document that is
        # mostly tables would otherwise come back nearly empty.
        # NOTE(review): per the python-docx docs, doc.tables lists top-level
        # tables only; nested tables are not visited here.
        for table in doc.tables:
            for row in table.rows:
                lines.append(" | ".join(cell.text.strip() for cell in row.cells))
        return "".join(line + "\n" for line in lines)
    except FileNotFoundError:
        print(f"Error: DOCX file not found at {file_path}")
        return None
    except Exception as e:
        print(f"Error extracting text from DOCX: {e}")
        return None

In [8]:
def read_document_content(file_path):
    """
    Reads text from a DOCX file, selected by its file extension.

    The extension check is case-insensitive. Only ``.docx`` is handled; any
    other extension (including ``.pdf``) is reported as unsupported.

    Args:
        file_path: Path to the document, as a string or path-like object.

    Returns:
        The result of ``extract_text_from_docx`` for ``.docx`` files, or None
        when the extension is not supported.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings.
    if str(file_path).lower().endswith('.docx'):
        return extract_text_from_docx(file_path)

    # The message names only .docx: asking for a .pdf would lead the user
    # straight back to this same branch.
    print("Unsupported file format. Please provide a .docx file.")
    return None

In [9]:
# Configure the llama-index global Settings object; the RAG cell further down
# reads both Settings.embed_model and Settings.llm from it.
# Embeddings: the HuggingFace model "BAAI/bge-base-en-v1.5".
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# LLM: "llama3.1" served through Ollama.
# NOTE(review): request_timeout=200.0 is presumably in seconds — confirm.
Settings.llm = Ollama(model="llama3.1", request_timeout=200.0)

In [10]:
@contextmanager
def timeout_context(seconds):
    """Context manager that raises TimeoutError if its body runs too long.

    Installs a SIGALRM handler and arms a real-time timer for ``seconds``.
    On exit (normal or via exception) the timer is cancelled and the SIGALRM
    handler that was installed before entry is put back.

    Args:
        seconds: Time limit for the ``with`` body. May be fractional; a value
            of 0 leaves the timer disarmed.

    Raises:
        TimeoutError: Raised inside the ``with`` body when the timer fires.

    Note:
        Relies on ``signal.SIGALRM``, so it only works where that signal
        exists and when called from the main thread.
    """
    def timeout_handler(signum, frame):
        raise TimeoutError(f"Operation timed out after {seconds} seconds")

    # Keep the previous handler so nesting or unrelated SIGALRM users are not
    # left with this function's handler after the block ends.
    previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
    # setitimer accepts fractional seconds, unlike signal.alarm.
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Cancel the timer first so it cannot fire while handlers are swapped.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal returns None for handlers not installed from Python;
        # those cannot be re-installed through this API.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

In [11]:
# End-to-end driver: read the DOCX table shell, index it with llama-index,
# and run the example queries against it, printing each answer as JSON when
# the LLM reply parses as JSON.
if __name__ == '__main__':
    base_directory_path = "/Users/Sravya/Desktop/AI_model_table_shells"
    file_name = "Table shell standard.docx" # Make sure this matches your file

    file_path = os.path.join(base_directory_path, file_name)

    if not os.path.exists(file_path):
        print(f"Error: Document file not found at {file_path}. Please ensure the file exists and the 'file_name' variable is correct.")
        # Stop here: trying to read a missing file would only print a second,
        # misleading extraction error.
        document_content_text = None
    else:
        print(f"Reading document: {file_path}")
        document_content_text = read_document_content(file_path)

    if document_content_text:
        print("\n--- Document Content Extracted (partial view) ---")
        # Show at most the first 500 characters, with "..." marking truncation.
        print(document_content_text[:500] + "..." if len(document_content_text) > 500 else document_content_text)
        print("--------------------------------------------------\n")

        print("--- Setting up RAG Pipeline ---")

        document_for_rag = Document(text=document_content_text)
        print("Created LlamaIndex Document object.")

        print("Creating VectorStoreIndex (this involves embedding the document)...")
        index = VectorStoreIndex.from_documents([document_for_rag], embed_model=Settings.embed_model)
        print("VectorStoreIndex created.")

        print("Creating QueryEngine...")

        # --- ADJUSTED CUSTOM_QA_TEMPLATE ---
        # Define the template string. Doubled braces ({{ }}) are literal
        # braces; {context_str} and {query_str} are the template variables.
        custom_qa_template_str = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information, answer the query.

If the query asks for a general characteristic (e.g., "height", "weight", "ethnicity"), provide all associated metrics (e.g., 'n', 'Mean', 'Standard Deviation', 'Median', 'Min, Max' for height/weight) and their values across all relevant columns ('Treatment A', 'Treatment B', 'Total'), strictly in JSON format. Organize the JSON by the characteristic, then by metric, then by column.

If the query asks for a specific metric or cell value (e.g., "Mean height for Treatment A", "Q1 Sales for Product A"), provide only that specific value in JSON.

If you cannot find the information, return an empty JSON object {{}}.

Example JSON for a general characteristic query (e.g., "height"):
{{
  "Height (cm)": {{
    "n": {{ "Treatment A": "xx", "Treatment B": "xx", "Total": "xx" }},
    "Mean": {{ "Treatment A": "xxx.xx", "Treatment B": "xxx.xx", "Total": "xxx.xx" }},
    "Standard Deviation": {{ "Treatment A": "xxx.xxx", "Treatment B": "xxx.xxx", "Total": "xxx.xxx" }},
    "Median": {{ "Treatment A": "xxx.xx", "Treatment B": "xxx.xx", "Total": "xxx.xx" }},
    "Min, Max": {{ "Treatment A": "xxx, xxx", "Treatment B": "xxx, xxx", "Total": "xxx, xxx" }}
  }}
}}

Example JSON for a specific metric query: {{"Mean height for Treatment A": "xxx.xx"}}

Query: {query_str}
"""
        # Wrap the string in a PromptTemplate object
        custom_qa_template = PromptTemplate(custom_qa_template_str)

        query_engine = index.as_query_engine(
            llm=Settings.llm,
            response_synthesizer=CompactAndRefine(text_qa_template=custom_qa_template)
        )
        print("QueryEngine created.")

        # --- Example Queries (Adjusted for the new output expectation) ---
        queries = [
            "Give me the height table", # General characteristic query
        ]

        print("\n--- Running Queries with RAG ---")
        for i, query_text in enumerate(queries):
            print(f"--- Query {i+1}: {query_text} ---")

            try:
                response = query_engine.query(query_text)
                llm_response_text = response.response

                print("LLM Response (attempted JSON):")
                try:
                    # Pretty-print when the reply is valid JSON ...
                    parsed_json = json.loads(llm_response_text)
                    print(json.dumps(parsed_json, indent=2))
                except json.JSONDecodeError:
                    # ... otherwise fall back to the raw text.
                    print("Could not parse as JSON. Raw response:")
                    print(llm_response_text)

            except Exception as e:
                # Report the failure and continue with the next query.
                print(f"Error during query {i+1}: {e}")
            print("------------------------------------------\n")

Reading document: /Users/Sravya/Desktop/AI_model_table_shells/Table shell standard.docx

--- Document Content Extracted (partial view) ---

Table 14.1.2.1: Demographic and Baseline Characteristics
Full Analysis Set

Notes: The baseline value is defined as the last non-missing value before initial administration of study drug.
[a] Age in years is calculated using the date of birth and date of informed consent.
Source Data: adam.adsl; Listing 16.2.4.x.x.
. Table 14.1.2.1: Demographic and Baseline Characteristics
Full Analysis Set

Notes: The baseline value is defined as the last non-missing value before initial administration of stud...
--------------------------------------------------

--- Setting up RAG Pipeline ---
Created LlamaIndex Document object.
Creating VectorStoreIndex (this involves embedding the document)...
VectorStoreIndex created.
Creating QueryEngine...
QueryEngine created.

--- Running Queries with RAG ---
--- Query 1: Give me the height table ---
LLM Response (attempte