In [0]:
# NOTE(review): %pylab injects numpy and matplotlib names into the notebook namespace
# (star-import style). Later cells import matplotlib.pyplot explicitly, so a plain
# `%matplotlib inline` may be enough -- confirm nothing relies on the injected names.
%pylab inline

In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Example: load the DSS dataset "data_chunked" as a pandas DataFrame.
# `mydataset` is the dataset handle; `mydataset_df` is the result of get_dataframe().
mydataset = dataiku.Dataset("data_chunked")
mydataset_df = mydataset.get_dataframe()

In [0]:
import dataiku
# Get an API client and the notebook's default project, then list the LLMs
# the project exposes, printing one "- <description> (id: <id>)" line per entry.
client = dataiku.api_client()
project = client.get_default_project()
llm_list = project.list_llms()
for llm in llm_list:
    print(f"- {llm.description} (id: {llm.id})")

In [0]:
# Inspect the models configured on the custom LLM connection.
# Relies on `client` (the DSS API client) created in an earlier cell.
connection_name = "iliad-plugin-conn-prod"
connection = client.get_connection(connection_name)
connection_info = connection.get_info()
connection_params = connection_info["params"]
models = connection_params['models']
for model in models:
    # .get() rather than ['capability']: an entry without a capability prints as
    # "None ..." instead of aborting the whole listing with a KeyError.
    print(f"{model.get('capability')} {model} \n")

# Query an image with a multipart message

In [0]:
import dataiku
import os
import base64
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt

# Create a handle for the LLM model in the notebook's default project
client = dataiku.api_client()
project = client.get_default_project()
LLM_MODEL_ID = "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet"
# Alternative model ids on the same connection, kept for quick switching:
# LLM_MODEL_ID = "custom:iliad-plugin-conn-prod:stable-diffusion-3-large"
# LLM_MODEL_ID = "custom:iliad-plugin-conn-prod:gpt-4o"
llm_model = project.get_llm(LLM_MODEL_ID)

# llm = llm_model.as_langchain_llm()
# Managed folder and file name of the image that is sent to the model below
folder = dataiku.Folder("input_images_extracted_custom")
image_filename = "Phase 1 Requirements - PSIT Patient Journey AI Model_image1.jpeg"

import base64
# Alias IPython's Image so it does not shadow PIL.Image, which is imported
# under the name `Image` at the top of this cell.
from IPython.display import Image as IPImage, display

def display_base64_image(base64_code):
    """Render a base64-encoded image inline in the notebook.

    Args:
        base64_code: Base64 text of an encoded image file (for example JPEG or PNG).
    """
    # Decode the base64 text back into the raw image bytes.
    image_bytes = base64.b64decode(base64_code)
    # Hand the bytes to IPython's rich display.
    display(IPImage(data=image_bytes))


# Read the raw image bytes from the managed folder
with folder.get_download_stream(image_filename) as stream:
    image_data = stream.read()

# Base64 text form of the image; in this cell it is only used for the inline preview
img_base64 = base64.b64encode(image_data).decode("utf-8")

# Show the image in the notebook before sending it to the model
display_base64_image(img_base64)

# --- Step 2: Create and execute the completion request ---
# One multipart message is built: a text part (the prompt) plus an inline image part.
completion = llm_model.new_completion()
mp_message = completion.new_multipart_message()
# NOTE(review): the prompt asks the model to "convert the base64 image", but the
# image is attached below as raw bytes via with_inline_image(image_data), not as
# base64 text -- confirm whether the wording should be updated.
prompt_text = (
    """First convert the base64 image into original image and then Extract all the metrics and text present 
    in the provided image and create a detailed summary of the image 
    even if its blurry i want you to just extract every character present in image."""
)
mp_message.with_text(prompt_text)
# mp_message.with_text(f"Here is the image in base64 format:\n{img_base64}")

# Attach the image with with_inline_image; the raw bytes read from the folder are passed.
mp_message.with_inline_image(image_data)
# mp_message.with_inline_image(img_base64)

# Add the finished multipart message to the completion request.
mp_message.add()

# Execute the completion request
print("Executing LLM request...")
resp = completion.execute()

# Debugging: print the response type and whether the call succeeded
print("Raw response object type:", type(resp))
print("Success:", resp.success)

# Only read the text of a successful response; the success flag (not the mere
# presence of a `text` attribute) is what tells a usable answer from a failure.
if resp.success:
    print("Response from LLM:", resp.text)
else:
    print("LLM request failed. Full response:", resp)

# Best-effort dump of the JSON form of the response.
# NOTE(review): depending on the dataikuapi version, `json` may be a property rather
# than a method -- both shapes are handled; confirm against the installed client.
try:
    json_attr = resp.json
    response_json = json_attr() if callable(json_attr) else json_attr
    print("Full JSON response:", response_json)
except Exception as e:
    print("Error extracting JSON response:", e)


In [0]:
# Exploration aid: print the built-in help for the multipart message object.
# Requires `mp_message` from the cell that builds the completion request.
help(mp_message)

In [0]:
# # Initialize the processor
# folder_id = "Knowledge_bank"
# output_folder_id = "images_extracted"
# processor = FileProcessor(folder_id, output_folder_id)

# # Get list of files
# folder = dataiku.Folder(folder_id)
# file_list = folder.list_paths_in_partition()

# # Process files
# results_df = processor.process_all_files(file_list)
    

In [0]:
import dataiku
import os
import base64
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from IPython.display import Image as IPImage, display

# Create a handle for the LLM model in the notebook's default project
client = dataiku.api_client()
project = client.get_default_project()
LLM_MODEL_ID = "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet"
llm_model = project.get_llm(LLM_MODEL_ID)

# Managed folder and file name of the image to analyse
folder = dataiku.Folder("input_images_extracted_custom")
image_filename = "Phase 1 Requirements - PSIT Patient Journey AI Model_image1.jpeg"

# Read the raw image bytes
with folder.get_download_stream(image_filename) as stream:
    image_data = stream.read()

# Base64 text form of the image; embedded in the data URL sent to the model further down
img_base64 = base64.b64encode(image_data).decode("utf-8")
print(f"Image {image_filename} loaded successfully")

# Wrap the Dataiku LLM handle as a LangChain LLM and report the wrapper's type
print("Converting to Langchain LLM...")
llm = llm_model.as_langchain_llm()
print(f"LLM type: {type(llm)}")

# Import Langchain components for message handling
from langchain.schema import HumanMessage, SystemMessage

# System message: fixes the assistant's role and the extraction rules
# (keep everything, preserve layout and tables, mark uncertain text).
system_message = SystemMessage(content="""You are a precise document analysis assistant specializing in text extraction from images.
Your task is to extract ALL text content visible in the provided image with 100% accuracy.
- Include ALL text, even if partially visible, small, or in different orientations
- Maintain the original structure, formatting, and layout as much as possible
- Extract all numbers, metrics, and values exactly as they appear
- If text is in tables, preserve the table structure
- Be comprehensive and detailed - do not summarize or omit any text
- If you're uncertain about any text, indicate this with [unclear text] but make your best guess
""")

# Human message with two content parts: the extraction instructions and the image,
# the latter passed as a base64 data URL labelled image/jpeg.
human_message = HumanMessage(
    content=[
        {"type": "text", "text": """Extract ALL text content from this document image. 
Be extremely thorough and extract every single character, number, heading, and label visible in the image.
Format your response to preserve the original document layout as much as possible.
Use formatting like headings, bullet points, and indentation to maintain the structure.
DO NOT summarize or generalize - I need the EXACT text content as written in the document.
This appears to be a technical requirements document - please extract all requirements, metrics, and technical details completely."""},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}}
    ]
)

import mimetypes  # stdlib; used to pick the data-URL MIME type per image


def _image_data_url(file_name, raw_bytes):
    """Return a base64 ``data:`` URL for ``raw_bytes``.

    The MIME type is guessed from the extension of ``file_name`` so that PNG files
    are not labelled as JPEG; unknown extensions fall back to ``image/jpeg``.
    """
    mime_type, _encoding = mimetypes.guess_type(file_name)
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return f"data:{mime_type or 'image/jpeg'};base64,{encoded}"


def _extraction_row(file_name, extracted_text):
    """Build one output row in the shape written to 'extracted_text_claude'."""
    return {
        "filename": file_name,
        "extracted_text": extracted_text,
        "extraction_method": "Claude with Enhanced Prompting",
    }


# Set to True to also process every other image in the folder.
process_all_images = False

# Rows are collected first and written in ONE writer session at the end: opening a
# fresh writer per image would (outside append mode) replace what the previous
# writer wrote, leaving only the last image in the dataset.
extraction_rows = []

print("Sending request to Claude via Langchain...")
try:
    # NOTE(review): `config` is LangChain's runnable config; confirm that the
    # temperature passed here actually reaches the model.
    response = llm.invoke([system_message, human_message], config={"temperature": 0})
except Exception as e:
    # Only the request itself is reported as a request error.
    print(f"Error sending request: {e}")
else:
    print("\n===== CLAUDE'S ANALYSIS =====")
    print(response)  # printed as returned by the LangChain LLM wrapper
    print("=============================\n")
    extraction_rows.append(_extraction_row(image_filename, response))

    if process_all_images:
        print("\nProcessing all images in the folder...")
        all_files = folder.list_paths_in_partition()
        image_files = [f for f in all_files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        for img_file in image_files:
            # Folder listings may carry a leading "/", so compare without it;
            # otherwise the image processed above would be sent a second time.
            if img_file.lstrip("/") == image_filename.lstrip("/"):
                continue

            print(f"Processing {img_file}...")
            try:
                with folder.get_download_stream(img_file) as stream:
                    img_data = stream.read()

                human_msg = HumanMessage(
                    content=[
                        {"type": "text", "text": f"Extract ALL text from this image ({img_file}). Be comprehensive and extract every single character, including numbers, headings, and labels."},
                        {"type": "image_url", "image_url": {"url": _image_data_url(img_file, img_data)}}
                    ]
                )
                img_response = llm.invoke([system_message, human_msg], config={"temperature": 0})
                extraction_rows.append(_extraction_row(img_file, img_response))
                print(f"Successfully processed {img_file}")
            except Exception as e:
                # One bad image must not abort the rest of the batch.
                print(f"Error processing {img_file}: {e}")

    # Persist every collected row in a single writer session.
    try:
        output_dataset = dataiku.Dataset("extracted_text_claude")
        with output_dataset.get_writer() as writer:
            for row in extraction_rows:
                writer.write_row_dict(row)
        print("Results saved to dataset 'extracted_text_claude'")
    except Exception as e:
        print(f"Error saving to dataset: {e}")

print("\nProcessing complete!")

In [0]:
# Exploration aid: print the built-in help for the LangChain LLM wrapper.
# Requires `llm` from the cell that calls as_langchain_llm().
help(llm)

Testing - VS

In [0]:
# Summarise each model configured on the LLM connection: id, capability and type,
# followed by a blank separator line. Relies on `client` from an earlier cell.
connection_name = "iliad-plugin-conn-prod"
connection = client.get_connection(connection_name)
connection_info = connection.get_info()
connection_params = connection_info["params"]
models = connection_params['models']
for model in models:
    print(
        f"Model ID: {model.get('id')}",
        f"Capability: {model.get('capability')}",
        f"Type: {model.get('type')}",
        "",
        sep="\n",
    )

In [0]:
import dataikuapi

# Call the "generate" function of the "mlopsChatbot" service on the API node.
# A dedicated name is used instead of rebinding `client`: other cells expect `client`
# to be the DSS API client (they call get_connection / get_project on it), and an
# APINodeClient stored under that name would break them when re-run.
api_node_client = dataikuapi.APINodeClient("https://cdl-dku-genai01.commercial-datalake-prod.awscloud.abbvienet.com/", "mlopsChatbot")

result = api_node_client.run_function("generate",
        user_query = "What are support programs currently available under each drug brand?")
print("Function result: %s" % result.get("response"))


In [0]:
import requests
import json

# Replace with your actual Dataiku webapp URL (webapp id: LmdRX0E_qaapi)
api_url = "https://cdl-dku-desi-p.commercial-datalake-prod.awscloud.abbvienet.com/projects/GENAIPOC/webapps/qa_api"

# requests has no default timeout: without one, an unresponsive server would block
# the cell indefinitely. The query gets a longer budget than the health probe.
HEALTH_TIMEOUT_SECONDS = 30
QUERY_TIMEOUT_SECONDS = 120

# Health check
health_response = requests.get(f"{api_url}/health", timeout=HEALTH_TIMEOUT_SECONDS)
print("Health check:", health_response.json())

# Query the system
# NOTE(review): this payload uses the key "query" while the backend-client cells
# post {'message': ...} to /query -- confirm which key the endpoint expects.
query_payload = {"query": "Who to contact for rinvoq reactions?"}
query_response = requests.post(
    f"{api_url}/query",
    json=query_payload,  # requests serialises the body and sets the JSON Content-Type
    timeout=QUERY_TIMEOUT_SECONDS,
)
print("Query response:", query_response.json())

In [0]:
import dataiku
import json
import traceback

# Import the DataikuQASystem class - if this is a custom class you've created
from DataikuQASystem import DataikuQASystem

# Environment settings
PROJECT_KEY = "GENAIPOC"
WEBAPP_ID = "LmdRX0E_qaapi"  # Your original QA API webapp ID

# Hardcoded configuration values for QA system
KB_ID = "dV3dIQCo"  # Your knowledge bank ID
EMBEDDING_MODEL = "custom:iliad-plugin-conn-prod:text-embedding-ada-002"
LLM_MODEL = "custom:iliad-plugin-conn-prod:gpt-4o"
NUM_DOCS = 5  # Number of documents to retrieve

# Initialize the QA system.
# `system_ready` records whether construction succeeded; on failure the error and
# traceback are printed and execution continues with system_ready = False.
try:
    qa_system = DataikuQASystem(
        kb_id=KB_ID,
        embedding_model_name=EMBEDDING_MODEL,
        llm_model_name=LLM_MODEL,
        k=NUM_DOCS
    )
    print(f"QA System successfully initialized with KB: {KB_ID}")
    system_ready = True
except Exception as e:
    print(f"Error initializing QA system: {str(e)}")
    traceback.print_exc()
    system_ready = False

# Connect to the project, the webapp and the webapp backend through the Dataiku
# API client. Each step prints which stage failed and then re-raises: in a notebook,
# exit(1) asks the kernel to shut down, whereas re-raising stops the cell with the
# full traceback and leaves the kernel (and its state) available for debugging.
try:
    # Use the client that works in your environment
    client = dataiku.api_client()
    project = client.get_project(PROJECT_KEY)
    print(f"Successfully connected to project {PROJECT_KEY}")
except Exception as e:
    print(f"Error initializing project: {str(e)}")
    raise

try:
    webapp = project.get_webapp(WEBAPP_ID)
    print(f"Successfully connected to webapp {WEBAPP_ID}")
except Exception as e:
    print(f"Error initializing webapp: {str(e)}")
    raise

try:
    backend = webapp.get_backend_client()
    # All requests sent through this backend session carry a JSON content type.
    backend.session.headers['Content-Type'] = 'application/json'
    print("Backend client successfully initialized")
except Exception as e:
    print(f"Error initializing backend: {str(e)}")
    raise

def check_health():
    """Call the backend's /health endpoint and return the response body.

    Returns:
        The response text on success. If anything raises, a JSON string of the
        form {"status": "error", "message": <error text>} is returned instead,
        after printing the error and its traceback.
    """
    try:
        body = backend.session.get(backend.base_url + '/health').text
        print(f"Health check response: {body}")
        return body
    except Exception as err:
        reason = str(err)
        print(f"Health check failed: {reason}")
        traceback.print_exc()
        return json.dumps({"status": "error", "message": reason})

def query_qa_system(question):
    """Post a question to the backend's /query endpoint and return the response body.

    Args:
        question: Sent as the value of the 'message' key in the JSON payload.

    Returns:
        The response text on success. If anything raises, a JSON string of the
        form {"status": "error", "message": <error text>} is returned instead,
        after printing the error and its traceback.
    """
    try:
        payload = {'message': question}
        reply_text = backend.session.post(backend.base_url + '/query', json=payload).text
        print(f"Query response: {reply_text}")
        return reply_text
    except Exception as err:
        reason = str(err)
        print(f"Query failed: {reason}")
        traceback.print_exc()
        return json.dumps({"status": "error", "message": reason})

# Example usage: run a health check, send one test question, and pretty-print
# the "response" field when the answer parses as JSON.
if __name__ == "__main__":
    print("\n--- Health Check ---")
    health_result = check_health()

    print("\n--- Test Query ---")
    question = "Who to contact for rinvoq reactions?"
    answer = query_qa_system(question)

    # The answer is raw response text; only JSON decoding failures are handled here.
    try:
        answer_json = json.loads(answer)
    except json.JSONDecodeError:
        print("Could not parse response as JSON")
    else:
        if "response" in answer_json:
            print(f"\nFormatted Answer: {answer_json['response']}")

In [0]:
import dataiku, dataikuapi


import os  # stdlib; used to read the API key from the environment

# Environment settings
DSS_LOCATION = "https://cdl-dku-desi-p.commercial-datalake-prod.awscloud.abbvienet.com"
# Secrets must not be stored in the notebook: the key is read from the environment.
# It is only needed by the external DSSClient variant below; the active code calls
# dataiku.api_client() without a key. The key that used to be hard-coded here
# should be treated as leaked and rotated.
API_KEY = os.environ.get("DSS_API_KEY")
PROJECT_KEY = "GENAIPOC"
WEBAPP_ID = "LmdRX0E"

# Depending on your case, use one of the following

#client = dataikuapi.DSSClient(DSS_LOCATION, API_KEY)
client = dataiku.api_client()

# Each step reports which stage failed and re-raises, so a failure stops here
# instead of surfacing later as a NameError on `project`, `webapp` or `backend`.
try:
    project = client.get_project(PROJECT_KEY)
except Exception:
    print("error initializing project")
    raise
try:
    webapp = project.get_webapp(WEBAPP_ID)
except Exception:
    print("error initializing webapp")
    raise
try:
    backend = webapp.get_backend_client()
except Exception:
    print("error initializing backend")
    raise

backend.session.headers['Content-Type'] = 'application/json'

# Send the question to the webapp backend's /query endpoint
resp = backend.session.post(backend.base_url + '/query', json={'message':'Who to contact for rinvoq reactions?'})

# Show the URL that was called, then the raw response body
print(backend.base_url + '/query')
print(resp.text)

# test-headless

In [5]:
import dataiku, dataikuapi


import os  # stdlib; used to read the API key from the environment

# Environment settings
DSS_LOCATION = "https://cdl-dku-desi-p.commercial-datalake-prod.awscloud.abbvienet.com"
# Secrets must not be stored in the notebook: the key is read from the environment.
# It is only needed by the external DSSClient variant below; the active code calls
# dataiku.api_client() without a key. The key that used to be hard-coded here
# should be treated as leaked and rotated.
API_KEY = os.environ.get("DSS_API_KEY")
PROJECT_KEY = "GENAIPOC"
WEBAPP_ID = "gq110S2"

# Depending on your case, use one of the following

#client = dataikuapi.DSSClient(DSS_LOCATION, API_KEY)
client = dataiku.api_client()

# Each step reports which stage failed and re-raises, so a failure stops here
# instead of surfacing later as a NameError on `project`, `webapp` or `backend`.
try:
    project = client.get_project(PROJECT_KEY)
except Exception:
    print("error initializing project")
    raise
try:
    webapp = project.get_webapp(WEBAPP_ID)
except Exception:
    print("error initializing webapp")
    raise
try:
    backend = webapp.get_backend_client()
except Exception:
    print("error initializing backend")
    raise

backend.session.headers['Content-Type'] = 'application/json'


# Prepare and send the payload to /update_kb.
# Each file entry carries its name and its content as base64 text
# (the two samples decode to "Hello world!" and "This is a test file.").
payload = {
   "files": [
      {
         "filename": "file1.txt",
         "content": "SGVsbG8gd29ybGQh"
      },
      {
         "filename": "file2.txt",
         "content": "VGhpcyBpcyBhIHRlc3QgZmlsZS4="
      }
   ]
}
response = backend.session.post(backend.base_url + '/update_kb', json=payload)

# Print the backend's raw reply
print(response.text)

{"combined_text":"--- file1.txt ---\nHello world!\n\n--- file2.txt ---\nThis is a test file.\n","file_count":2,"result":{"chunks_added":0,"message":"No new data was added because all files already exist.","status":"no_new_chunks","total_rows":1272}}



In [3]:
# Display the base URL that the webapp backend client sends its requests to.
backend.base_url

'http://10.242.92.241:10000/web-apps-backends/GENAIPOC/gq110S2/'