In [0]:
%pylab inline

# usual code

In [1]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Example: load a DSS dataset as a Pandas dataframe
# NOTE(review): "mydataset" looks like the notebook-template placeholder name —
# confirm a dataset with that name exists in the project, otherwise this cell
# fails on Restart & Run All. `mydataset_df` is not referenced by the cells
# visible below; consider deleting this cell if it is unused elsewhere.
mydataset = dataiku.Dataset("mydataset")
mydataset_df = mydataset.get_dataframe()

In [0]:
import os
import shutil
import zipfile
import tempfile
import logging
import base64
from io import BytesIO
import pandas as pd
import concurrent.futures

import dataiku
from docx import Document
import pdfplumber
from pptx import Presentation
from PIL import Image


# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class VisionLLMExtractor:
    """
    Uses a vision-capable LLM (resolved through the Dataiku project) to extract
    text from images.

    If the model cannot be resolved at construction time the failure is logged,
    ``self.llm_model`` is set to ``None`` and every extraction call returns an
    error string instead of raising.
    """

    # Instruction sent to the model together with each image.
    _PROMPT = (
        "Extract all text content visible in this image. "
        "Return only the extracted text without any additional commentary."
    )

    def __init__(self, llm_model_id="custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet"):
        """
        Initialize the vision LLM extractor with the specified model.

        Args:
            llm_model_id: ID of the LLM model to use
        """
        try:
            client = dataiku.api_client()
            project = client.get_default_project()
            self.llm_model = project.get_llm(llm_model_id)
            logger.info(f"Initialized VisionLLMExtractor with model: {llm_model_id}")
        except Exception as e:
            # Best effort: keep the pipeline alive, extraction calls will
            # report the missing model instead.
            logger.error(f"Error initializing VisionLLMExtractor: {e}")
            self.llm_model = None

    def extract_text_from_image(self, image_data):
        """
        Extract text from an image using the vision LLM.

        Args:
            image_data: Raw image data bytes

        Returns:
            str: Extracted text from the image, or a string starting with
            "Error" when the model is unavailable, the input is empty, or the
            request fails. This method never raises.
        """
        if self.llm_model is None:
            return "Error: LLM model not initialized"
        if not image_data:
            return "Error: No image data provided"

        try:
            img_base64 = base64.b64encode(image_data).decode("utf-8")

            completion = self.llm_model.new_completion()
            mp_message = completion.new_multipart_message()
            mp_message.with_text(self._PROMPT)
            # Attach the picture as an image part. Pasting the base64 string
            # into a text part only makes the model read a long character
            # sequence; it never sees the image itself.
            mp_message.with_inline_image(img_base64)
            mp_message.add()

            logger.info("Executing LLM request for image text extraction...")
            resp = completion.execute()

            extracted_text = getattr(resp, "text", None)
            if extracted_text is None:
                # Covers both a response without a `text` attribute and a
                # response whose text is None.
                logger.error("LLM response did not contain any text")
                return "Error: Failed to extract text from image"

            logger.info(f"Successfully extracted text from image: {extracted_text[:50]}...")
            return extracted_text

        except Exception as e:
            logger.error(f"Error extracting text from image: {e}")
            return f"Error extracting text from image: {e}"


class BaseExtractor:
    """
    Common parent of the extractors: binds a Dataiku Folder and offers a
    helper that loads a file's raw content from it.
    """
    def __init__(self, folder_id):
        # Handle on the folder that every read goes through.
        self.data_source = dataiku.Folder(folder_id)

    def get_file_data(self, file_path):
        """
        Return everything read from the folder's download stream for `file_path`.
        """
        stream = self.data_source.get_download_stream(file_path)
        with stream as handle:
            payload = handle.read()
        return payload


class WordExtractor(BaseExtractor):
    """
    Extracts text, tables, and images from Word documents.
    """
    def __init__(self, folder_id, vision_llm=None):
        """
        Args:
            folder_id: ID of the Dataiku folder holding the documents
            vision_llm: Optional object exposing `extract_text_from_image(bytes)`;
                when falsy, images are saved without text extraction
        """
        super().__init__(folder_id)
        self.vision_llm = vision_llm

    def extract_text_and_tables(self, file_path):
        """
        Extracts text and table content from a Word document.

        Returns:
            dict: {"text": str, "tables": list of tables, each a list of rows,
            each a list of stripped cell strings}; on failure {"error": str}.
        """
        try:
            doc = Document(BytesIO(self.get_file_data(file_path)))

            # Keep only paragraphs that still have content once stripped.
            stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
            text_content = [text for text in stripped_paragraphs if text]

            table_data = [
                [[cell.text.strip() for cell in row.cells] for row in table.rows]
                for table in doc.tables
            ]

            return {"text": "\n".join(text_content), "tables": table_data}
        except Exception as e:
            logger.error(f"Error reading Word document {file_path}: {e}")
            return {"error": f"Error reading Word document: {e}"}

    def extract_images(self, file_path, output_folder_id):
        """
        Extracts images from a Word document and saves them to a managed folder.
        Uses vision LLM to extract text from images if available.

        The .docx archive is read directly from memory, so no temporary
        directory is created (and none can be leaked when an error occurs).

        Args:
            file_path: Path of the document inside the input folder
            output_folder_id: ID of the folder receiving the extracted images

        Returns:
            dict: image path -> extracted text ("" when no vision LLM is set);
            on failure {"error": str}.
        """
        try:
            file_data = self.get_file_data(file_path)
            base_name = os.path.splitext(os.path.basename(file_path))[0]

            image_folder = dataiku.Folder(output_folder_id)

            # Folder listings typically report paths with a leading "/", while
            # the names built below have none. Normalize so the "already
            # exists" check can actually match.
            existing_images = {
                path.lstrip("/") for path in image_folder.list_paths_in_partition()
            }

            image_results = {}

            with zipfile.ZipFile(BytesIO(file_data), "r") as docx_zip:
                for file_info in docx_zip.infolist():
                    if not file_info.filename.startswith("word/media/"):
                        continue

                    image_name = os.path.basename(file_info.filename)
                    if not image_name:
                        # Directory entry ("word/media/"), not an image.
                        continue
                    image_path = f"{base_name}_{image_name}"

                    image_data = docx_zip.read(file_info.filename)

                    extracted_text = ""
                    if self.vision_llm:
                        extracted_text = self.vision_llm.extract_text_from_image(image_data)

                    # Do not rewrite an image that is already in the folder.
                    if image_path not in existing_images:
                        with image_folder.get_writer(image_path) as writer:
                            writer.write(image_data)

                    image_results[image_path] = extracted_text

            return image_results
        except Exception as e:
            logger.error(f"Error extracting images from {file_path}: {e}")
            return {"error": f"Error extracting images: {e}"}


class PDFExtractor(BaseExtractor):
    """
    Extracts text and images from PDF documents.
    """
    def __init__(self, folder_id, vision_llm=None):
        """
        Args:
            folder_id: ID of the Dataiku folder holding the documents
            vision_llm: Optional object exposing `extract_text_from_image(bytes)`;
                when falsy, images are saved without text extraction
        """
        super().__init__(folder_id)
        self.vision_llm = vision_llm

    def extract_text(self, file_path):
        """
        Extracts text content from a PDF file.

        Returns:
            str: Page texts joined by newlines (pages without text are
            skipped), or an "Error reading PDF: ..." string on failure.
        """
        try:
            pdf_stream = BytesIO(self.get_file_data(file_path))
            text_content = []
            with pdfplumber.open(pdf_stream) as pdf:
                for page in pdf.pages:
                    extracted = page.extract_text()
                    if extracted:
                        text_content.append(extracted)
            return "\n".join(text_content)
        except Exception as e:
            logger.error(f"Error reading PDF {file_path}: {e}")
            return f"Error reading PDF: {e}"

    @staticmethod
    def _to_png_bytes(raw_data):
        """
        Re-encode image bytes as PNG when PIL can decode them.

        The bytes of a PDF image stream are not necessarily a PNG file even
        though the output name ends in ".png". When PIL recognizes the data it
        is re-encoded as a real PNG; otherwise the bytes are returned unchanged
        so the caller keeps the previous behavior.
        """
        try:
            with Image.open(BytesIO(raw_data)) as decoded:
                png_ready = decoded
                # PNG cannot store every PIL mode (e.g. CMYK); fall back to RGB.
                if decoded.mode not in ("1", "L", "LA", "I", "P", "RGB", "RGBA"):
                    png_ready = decoded.convert("RGB")
                buffer = BytesIO()
                png_ready.save(buffer, format="PNG")
                return buffer.getvalue()
        except Exception as e:
            logger.warning(f"Could not re-encode PDF image as PNG, keeping raw bytes: {e}")
            return raw_data

    def extract_images(self, file_path, output_folder_id):
        """
        Extracts images from a PDF file and uses vision LLM to extract text.

        Args:
            file_path: Path of the PDF inside the input folder
            output_folder_id: ID of the folder receiving the extracted images

        Returns:
            dict: image path -> extracted text ("" when no vision LLM is set);
            on failure {"error": str}.
        """
        try:
            pdf_stream = BytesIO(self.get_file_data(file_path))
            base_name = os.path.splitext(os.path.basename(file_path))[0]

            image_folder = dataiku.Folder(output_folder_id)

            image_results = {}

            with pdfplumber.open(pdf_stream) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    for image_number, img in enumerate(page.images, start=1):
                        # Normalize to PNG where possible so the stored file
                        # matches its ".png" name and the LLM gets a real image.
                        image_data = self._to_png_bytes(img["stream"].get_data())
                        image_path = f"{base_name}_page{page_number}_image{image_number}.png"

                        with image_folder.get_writer(image_path) as writer:
                            writer.write(image_data)

                        extracted_text = ""
                        if self.vision_llm:
                            extracted_text = self.vision_llm.extract_text_from_image(image_data)

                        image_results[image_path] = extracted_text

            return image_results
        except Exception as e:
            logger.error(f"Error extracting images from PDF {file_path}: {e}")
            return {"error": f"Error extracting images from PDF: {e}"}


class PowerPointExtractor(BaseExtractor):
    """
    Extracts text and images from PowerPoint presentations.
    """
    def __init__(self, folder_id, vision_llm=None):
        """
        Args:
            folder_id: ID of the Dataiku folder holding the presentations
            vision_llm: Optional object exposing `extract_text_from_image(bytes)`;
                when falsy, images are saved without text extraction
        """
        super().__init__(folder_id)
        self.vision_llm = vision_llm

    def extract_text(self, file_path):
        """
        Extracts text from each slide in a PowerPoint file.

        Returns:
            str: Stripped shape texts joined by newlines (shapes whose text is
            empty after stripping are skipped), or an
            "Error reading PowerPoint: ..." string on failure.
        """
        try:
            prs = Presentation(BytesIO(self.get_file_data(file_path)))
            text_content = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    if not hasattr(shape, "text"):
                        continue
                    # Strip first so whitespace-only shapes do not add blank lines.
                    shape_text = (shape.text or "").strip()
                    if shape_text:
                        text_content.append(shape_text)
            return "\n".join(text_content)
        except Exception as e:
            logger.error(f"Error reading PowerPoint {file_path}: {e}")
            return f"Error reading PowerPoint: {e}"

    def extract_images(self, file_path, output_folder_id):
        """
        Extracts images from a PowerPoint file and uses vision LLM to extract text.

        Args:
            file_path: Path of the presentation inside the input folder
            output_folder_id: ID of the folder receiving the extracted images

        Returns:
            dict: image path -> extracted text ("" when no vision LLM is set);
            on failure {"error": str}.
        """
        try:
            prs = Presentation(BytesIO(self.get_file_data(file_path)))
            base_name = os.path.splitext(os.path.basename(file_path))[0]

            image_folder = dataiku.Folder(output_folder_id)

            image_results = {}

            for i, slide in enumerate(prs.slides):
                for j, shape in enumerate(slide.shapes):
                    try:
                        image = shape.image
                    except (AttributeError, ValueError):
                        # AttributeError: the shape has no image at all.
                        # ValueError: presumably a picture shape without an
                        # embedded image (e.g. a linked picture). Skipping it
                        # keeps one such shape from aborting the whole file.
                        continue

                    image_data = image.blob
                    # NOTE(review): the name always ends in ".png" although the
                    # blob is written as-is, whatever its actual format.
                    image_path = f"{base_name}_slide{i+1}_image{j+1}.png"

                    with image_folder.get_writer(image_path) as writer:
                        writer.write(image_data)

                    extracted_text = ""
                    if self.vision_llm:
                        extracted_text = self.vision_llm.extract_text_from_image(image_data)

                    image_results[image_path] = extracted_text

            return image_results
        except Exception as e:
            logger.error(f"Error extracting images from PowerPoint {file_path}: {e}")
            return {"error": f"Error extracting images from PowerPoint: {e}"}


class SpreadsheetExtractor(BaseExtractor):
    """
    Reads Excel workbooks (every sheet) and CSV files into plain Python records.
    """
    def extract_data(self, file_path):
        """
        Load a spreadsheet and return its rows grouped by sheet.

        Returns:
            dict: sheet name -> list of row dicts (a CSV is reported under
            "Sheet1"); {"error": str} for unsupported extensions or failures.
        """
        try:
            raw_bytes = self.get_file_data(file_path)
            _, suffix = os.path.splitext(file_path)
            suffix = suffix.lower()

            if suffix == '.csv':
                # A CSV has no sheets, so expose it as a single one.
                frame = pd.read_csv(BytesIO(raw_bytes))
                return {"Sheet1": frame.to_dict(orient='records')}

            if suffix not in ('.xls', '.xlsx'):
                return {"error": f"Unsupported spreadsheet file type: {suffix}"}

            # sheet_name=None loads every sheet into a {name: DataFrame} mapping.
            workbook = pd.read_excel(BytesIO(raw_bytes), sheet_name=None)
            records_by_sheet = {}
            for name, frame in workbook.items():
                records_by_sheet[name] = frame.to_dict(orient='records')
            return records_by_sheet
        except Exception as e:
            logger.error(f"Error reading spreadsheet {file_path}: {e}")
            return {"error": f"Error reading spreadsheet: {e}"}


class FileProcessor:
    """
    Processes files in a folder by dispatching them to the appropriate extractor
    based on file extension.
    """

    # Keys of every result row, in output column order.
    _RESULT_COLUMNS = ["file_name", "file_path", "text_content", "table_content", "images_extracted"]

    def __init__(self, folder_id, image_output_folder_id=None, max_workers=10,
                 llm_model_id="custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet"):
        """
        Args:
            folder_id: ID of the Dataiku folder holding the input files
            image_output_folder_id: ID of the folder receiving extracted images;
                when falsy, image extraction is skipped
            max_workers: Number of worker threads used by `process_all_files`
            llm_model_id: ID of the LLM model used for image text extraction
        """
        self.folder_id = folder_id
        self.image_output_folder_id = image_output_folder_id
        self.max_workers = int(max_workers)
        # One extractor instance is shared by all files processed here.
        self.vision_llm = VisionLLMExtractor(llm_model_id)
        logger.info(f"Initialized FileProcessor with {self.max_workers} workers and LLM model {llm_model_id}")

    @staticmethod
    def _format_image_results(images_result):
        """Render an image-extraction result as text; error payloads are stringified as-is."""
        if isinstance(images_result, dict) and "error" not in images_result:
            return "\n".join(
                f"Image: {img_path}, Text: {text}" for img_path, text in images_result.items()
            )
        return str(images_result)

    def _images_field(self, extractor, file_path):
        """Run the extractor's image extraction if an output folder is configured, else return ""."""
        if not self.image_output_folder_id:
            return ""
        images_result = extractor.extract_images(file_path, self.image_output_folder_id)
        return self._format_image_results(images_result)

    def process_file(self, file_path):
        """
        Process a single file and return the extracted data.

        Returns:
            dict with keys file_name, file_path, text_content, table_content
            and images_extracted. Failures are reported inside text_content;
            this method does not raise for extraction errors.
        """
        file_name = os.path.basename(file_path)
        ext = os.path.splitext(file_name)[1].lower()
        result = {"file_name": file_name, "file_path": file_path}

        logger.info(f"Processing file: {file_name}")

        try:
            if ext == ".docx":
                word_extractor = WordExtractor(self.folder_id, self.vision_llm)
                content = word_extractor.extract_text_and_tables(file_path)
                result["text_content"] = content.get("text", content.get("error", ""))
                result["table_content"] = str(content.get("tables", ""))
                # Same error-aware formatting as the PDF/PowerPoint branches, so
                # an {"error": ...} result is no longer rendered as an image.
                result["images_extracted"] = self._images_field(word_extractor, file_path)

            elif ext == ".pdf":
                pdf_extractor = PDFExtractor(self.folder_id, self.vision_llm)
                result["text_content"] = pdf_extractor.extract_text(file_path)
                result["table_content"] = ""
                result["images_extracted"] = self._images_field(pdf_extractor, file_path)

            elif ext in [".ppt", ".pptx"]:
                # NOTE(review): python-pptx presumably cannot open legacy
                # binary .ppt files; those would surface as an error string
                # from the extractor. Confirm whether .ppt should stay listed.
                ppt_extractor = PowerPointExtractor(self.folder_id, self.vision_llm)
                result["text_content"] = ppt_extractor.extract_text(file_path)
                result["table_content"] = ""
                result["images_extracted"] = self._images_field(ppt_extractor, file_path)

            elif ext in [".xls", ".xlsx", ".csv"]:
                spreadsheet_extractor = SpreadsheetExtractor(self.folder_id)
                data = spreadsheet_extractor.extract_data(file_path)
                if "error" in data:
                    result["text_content"] = data["error"]
                    result["table_content"] = ""
                else:
                    result["text_content"] = ""
                    result["table_content"] = str(data)
                result["images_extracted"] = ""

            else:
                result["text_content"] = f"Unsupported file type: {file_name}"
                result["table_content"] = ""
                result["images_extracted"] = ""

            logger.info(f"Completed processing file: {file_name}")
            return result

        except Exception as e:
            logger.error(f"Error processing file {file_name}: {e}")
            result["text_content"] = f"Error processing file: {e}"
            result["table_content"] = ""
            result["images_extracted"] = ""
            return result

    def process_all_files(self, file_list):
        """
        Processes all files in parallel and returns a Pandas DataFrame.

        Rows are appended as tasks complete, so their order may differ from
        the order of `file_list`.

        Args:
            file_list: List of file paths to process

        Returns:
            pd.DataFrame: DataFrame containing the extracted data. The columns
            are always present, even when `file_list` is empty.
        """
        logger.info(f"Starting parallel processing of {len(file_list)} files with {self.max_workers} workers")
        extracted_data = []

        # Use ThreadPoolExecutor for I/O bound operations
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_file = {executor.submit(self.process_file, file_path): file_path
                              for file_path in file_list}

            for future in concurrent.futures.as_completed(future_to_file):
                file_path = future_to_file[future]
                try:
                    extracted_data.append(future.result())
                    logger.info(f"Added results for {os.path.basename(file_path)}")
                except Exception as e:
                    logger.error(f"Exception processing {file_path}: {e}")
                    # Keep a row for the failed file so it is visible in the output.
                    extracted_data.append({
                        "file_name": os.path.basename(file_path),
                        "file_path": file_path,
                        "text_content": f"Error in parallel processing: {e}",
                        "table_content": "",
                        "images_extracted": ""
                    })

        logger.info(f"Completed processing all {len(file_list)} files")
        # Explicit columns keep the schema stable when there are no rows.
        return pd.DataFrame(extracted_data, columns=self._RESULT_COLUMNS)

In [0]:
def main():
    """
    Main function to run the document digitization pipeline.
    Example usage in a Dataiku recipe.

    Lists every file of the input folder, runs them through `FileProcessor`,
    prints the resulting DataFrame and writes it to the output dataset.
    """
    # Folder / dataset names are hard-coded here; the commented-out expressions
    # show how they could be read from Dataiku custom variables instead.
    input_folder_id = "Input" #dataiku.get_custom_variables().get("input_folder", "input_documents")
    output_dataset = "tripadvisor_hotel_reviews_summarized" #dataiku.get_custom_variables().get("output_dataset", "extracted_document_data")

    # Get LLM model ID from custom variables or use default
    llm_model_id = "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet" #dataiku.get_custom_variables().get("llm_model_id", "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet")

    # Configure parallel processing
    max_workers = 10 #int(dataiku.get_custom_variables().get("max_workers", 10))

    image_output_folder_id = "input_images_extracted_custom"

    input_folder = dataiku.Folder(input_folder_id)
    # List all files in the input folder
    file_list = input_folder.list_paths_in_partition()

    # Pass the configured worker count rather than a second literal, so that
    # changing `max_workers` above actually takes effect.
    processor = FileProcessor(
        input_folder_id,
        image_output_folder_id,
        max_workers,
        llm_model_id
    )

    # Process all documents
    results_df = processor.process_all_files(file_list)
    print(results_df)

    # Write results to the output dataset
    output = dataiku.Dataset(output_dataset)
    output.write_with_schema(results_df)

    logger.info(f"Document digitization pipeline completed. Processed {len(results_df)} files.")


if __name__ == "__main__":
    main()

# Using unstructured library

In [1]:
import os
import logging
import pandas as pd
from io import BytesIO
from typing import Dict, List, Any, Optional
import traceback

# Configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

class ExcelExtractor:
    """
    Extracts content from Excel files using pandas.

    .xlsx/.xlsm workbooks are read with openpyxl; for any other extension
    (e.g. .xls) pandas is left to pick the engine, since openpyxl cannot read
    the legacy format.
    """

    def __init__(self):
        # Caps applied to the text rendering of each sheet (the sheet itself is
        # still read in full).
        self.max_rows_per_sheet = 500
        self.max_cols_per_sheet = 20

    def extract_excel_content(self, file_data: bytes, file_path: str) -> Dict[str, Any]:
        """
        Extract content from Excel files.

        Args:
            file_data: Raw bytes of the Excel file
            file_path: Path to the Excel file (used for logging and metadata)

        Returns:
            Dict with extracted content:
                - text: Extracted text representation
                - sheets: List of sheet data
                - metadata: File metadata
            On failure, `text` holds a generic error message, `sheets` is empty
            and `metadata` carries an `error` entry. This method does not raise.
        """
        # Bound before the try block so the error handler can always use it.
        file_name = str(file_path)
        try:
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_name)[1].lower()
            logger.info(f"Processing Excel file: {file_name}")

            # openpyxl only understands the XML-based formats; passing None
            # lets pandas choose a suitable engine for everything else.
            engine = "openpyxl" if file_ext in (".xlsx", ".xlsm") else None

            sheets_data = []
            all_text_content = []

            # The context manager closes the workbook handle on every path.
            with pd.ExcelFile(BytesIO(file_data), engine=engine) as excel_file:
                sheet_names = excel_file.sheet_names

                for sheet_name in sheet_names:
                    try:
                        # parse() reuses the engine the ExcelFile was opened with.
                        df = excel_file.parse(sheet_name=sheet_name)

                        if df.empty:
                            sheets_data.append({
                                "sheet_name": sheet_name,
                                "is_empty": True,
                                "text": f"[Sheet '{sheet_name}' is empty]"
                            })
                            all_text_content.append(f"\n--- Sheet: {sheet_name} ---\n[Empty sheet]")
                            continue

                        text_representation = self._dataframe_to_text(df, sheet_name)
                        all_text_content.append(text_representation)

                        sheets_data.append({
                            "sheet_name": sheet_name,
                            "is_empty": False,
                            "row_count": len(df),
                            "column_count": len(df.columns),
                            "columns": df.columns.tolist(),
                            "text": text_representation
                        })
                    except Exception as e:
                        # A broken sheet is reported in place; the remaining
                        # sheets are still processed.
                        logger.error(f"Error processing sheet '{sheet_name}' in {file_name}: {str(e)}")
                        sheets_data.append({
                            "sheet_name": sheet_name,
                            "is_empty": True,
                            "text": f"[Error processing sheet '{sheet_name}': {str(e)}]"
                        })
                        all_text_content.append(f"\n--- Sheet: {sheet_name} ---\n[Error processing sheet: {str(e)}]")

            metadata = {
                "file_name": file_name,
                "file_path": file_path,
                "file_type": file_ext,
                "sheet_count": len(sheet_names),
                "sheet_names": sheet_names
            }

            result = {
                "text": "\n\n".join(all_text_content),
                "sheets": sheets_data,
                "metadata": metadata
            }

            logger.info(f"Successfully processed Excel file: {file_name}")
            return result

        except Exception as e:
            logger.error(f"Error extracting content from Excel file {file_path}: {str(e)}\n{traceback.format_exc()}")
            return {
                "text": "Error extracting Excel content: File could not be processed.",
                "sheets": [],
                "metadata": {"file_name": file_name, "file_path": file_path, "error": str(e)}
            }

    def _dataframe_to_text(self, df: pd.DataFrame, sheet_name: str) -> str:
        """
        Convert a DataFrame to a text representation.

        The output starts with a sheet header and the full row/column counts,
        followed by at most `max_rows_per_sheet` rows and `max_cols_per_sheet`
        columns rendered without the index, with missing values shown as "".

        Args:
            df: The pandas DataFrame to convert
            sheet_name: Name of the sheet

        Returns:
            str: Text representation of the DataFrame, or a bracketed error
            message if the conversion fails
        """
        try:
            rows, cols = df.shape
            text_lines = [
                f"--- Sheet: {sheet_name} ---",
                f"Rows: {rows}, Columns: {cols}",
                "",
            ]

            max_rows = min(self.max_rows_per_sheet, rows)
            max_cols = min(self.max_cols_per_sheet, cols)

            # Only the top-left corner of the sheet is rendered.
            sample_df = df.iloc[:max_rows, :max_cols].fillna("")
            text_lines.append(sample_df.to_string(index=False))

            return "\n".join(text_lines)

        except Exception as e:
            logger.error(f"Error converting DataFrame to text for sheet {sheet_name}: {str(e)}")
            return f"[Error converting DataFrame to text: {str(e)}]"

# Handler function for the document processor
def handle_excel_file(file_data: bytes, file_path: str) -> Dict[str, Any]:
    """
    Adapter between `ExcelExtractor` and the document processor's result shape.

    Args:
        file_data: Raw bytes of the Excel file
        file_path: Path to the Excel file

    Returns:
        Dict with keys `text`, `tables` (one entry per sheet, holding the
        sheet's text and its name under `metadata`), `images` (always an empty
        list) and `metadata`.
    """
    raw_result = ExcelExtractor().extract_excel_content(file_data, file_path)

    # One "table" entry per sheet reported by the extractor.
    table_entries = []
    for sheet_info in raw_result.get("sheets", []):
        table_entries.append({
            "text": sheet_info.get("text", ""),
            "metadata": {"sheet_name": sheet_info.get("sheet_name", "")},
        })

    return {
        "text": raw_result.get("text", ""),
        "tables": table_entries,
        "images": [],
        "metadata": raw_result.get("metadata", {}),
    }


In [2]:
import os
import logging
from io import BytesIO
import base64
import pandas as pd
import concurrent.futures
from typing import Dict, List, Any, Optional, Tuple

import dataiku
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from unstructured.staging.base import elements_to_json
from unstructured.partition.xlsx import partition_xlsx



# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class BaseDigitizer:
    """
    Foundation for the digitizers: holds a Dataiku Folder and knows how to
    load a file's bytes from it.
    """
    def __init__(self, folder_id: str):
        """
        Bind the digitizer to a Dataiku folder.

        Args:
            folder_id: The ID of the Dataiku folder containing the documents
        """
        self.data_source = dataiku.Folder(folder_id)

    def get_file_data(self, file_path: str) -> bytes:
        """
        Load one file from the bound folder.

        Args:
            file_path: Path to the file within the Dataiku folder

        Returns:
            bytes: Everything read from the file's download stream
        """
        stream = self.data_source.get_download_stream(file_path)
        with stream as handle:
            content = handle.read()
        return content

class UnstructuredDigitizer(BaseDigitizer):
    """
    Digitizes documents using the Unstructured library to extract content.

    Each document is split into text, tables and images. When a Dataiku LLM could be
    initialized, every extracted image that carries a base64 payload is sent to it
    for a textual description.
    """

    # Element categories collected as body text. Unstructured reports its generic
    # ``Text`` element under the category "UncategorizedText"; the literal "Text"
    # is kept so the previous filter remains a subset of this one.
    TEXT_CATEGORIES = ("Title", "NarrativeText", "UncategorizedText", "Text", "ListItem", "Header")

    def __init__(self, folder_id: str, llm_model_id: str = "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet"):
        """
        Initialize the Unstructured digitizer.
        
        Args:
            folder_id: The ID of the Dataiku folder containing the documents
            llm_model_id: The ID of the LLM model to use for image description
        """
        super().__init__(folder_id)
        self.llm_model_id = llm_model_id
        
        # Initialize Dataiku LLM client
        self.client = dataiku.api_client()
        self.project = self.client.get_default_project()
        try:
            self.llm_model = self.project.get_llm(self.llm_model_id)
            logger.info(f"Successfully initialized LLM model: {self.llm_model_id}")
        except Exception as e:
            # Best effort: extraction still works without image descriptions.
            logger.warning(f"Failed to initialize LLM model: {e}")
            self.llm_model = None
    
    def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Extract all content from a document using Unstructured.
        
        Excel files (.xlsx / .xls) are delegated to ``handle_excel_file``; every
        other file type goes through ``unstructured.partition.auto.partition``.
        
        Args:
            file_path: Path to the file within the Dataiku folder
            
        Returns:
            Dict with keys:
                - text: Extracted text content
                - tables: Extracted tables
                - images: List of extracted images with descriptions
                - metadata: Document metadata. On failure it contains an
                  ``error`` key and ``text`` holds the error message; this
                  method does not raise.
        """
        try:
            file_data = self.get_file_data(file_path)
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_name)[1].lower()
            
            # Handle Excel files separately
            if file_ext in ['.xlsx', '.xls']:
                return handle_excel_file(file_data, file_path)
            
            # Extract elements using Unstructured
            elements = partition(
                file=BytesIO(file_data),
                file_filename=file_name,
                strategy="auto",
                include_metadata=True,
                extract_images_in_pdf=True,
                extract_image_block_types=["Image"],
                # Ask Unstructured to embed extracted images as base64 in the element
                # metadata (image_base64). Without it the image bytes are not placed
                # in the payload, so there is nothing to hand to the LLM.
                extract_image_block_to_payload=True,
                extract_tables=True
            )
            
            # Sort elements into the three buckets we report on
            text_elements = []
            table_elements = []
            image_elements = []
            for element in elements:
                element_type = element.category
                if element_type == "Table":
                    table_elements.append(element)
                elif element_type == "Image":
                    image_elements.append(element)
                elif element_type in self.TEXT_CATEGORIES:
                    text_elements.append(element)
            
            # Process text content
            text_content = "\n".join(element.text for element in text_elements)
            
            # Process tables
            tables = [
                {
                    "text": table_element.text,
                    "metadata": table_element.metadata.to_dict() if hasattr(table_element, "metadata") else {}
                }
                for table_element in table_elements
            ]
            
            # Process images and get descriptions using Dataiku LLM
            images = [self._build_image_record(image_element) for image_element in image_elements]
            
            # Distinct page numbers actually reported by the elements. Elements with
            # no page information (page_number is None) are not counted as a page,
            # so formats without pagination report 0.
            page_numbers = {
                page_number
                for page_number in (
                    getattr(getattr(element, "metadata", None), "page_number", None)
                    for element in elements
                )
                if page_number is not None
            }
            
            # Get document metadata
            metadata = {
                "file_name": file_name,
                "file_path": file_path,
                "file_type": file_ext,
                "page_count": len(page_numbers),
            }
            
            # Create structured output
            return {
                "text": text_content,
                "tables": tables,
                "images": images,
                "metadata": metadata
            }
        except Exception as e:
            logger.error(f"Error extracting content from {file_path}: {e}")
            return {
                "text": f"Error extracting content: {str(e)}",
                "tables": [],
                "images": [],
                "metadata": {"file_name": os.path.basename(file_path), "file_path": file_path, "error": str(e)}
            }
    
    def _build_image_record(self, image_element) -> Dict[str, Any]:
        """
        Build the output record for one extracted image element.
        
        Args:
            image_element: An Unstructured element whose category is "Image"
            
        Returns:
            Dict: Empty when the element carries no base64 payload; otherwise
            ``image_base64`` plus a ``description`` (LLM output or a placeholder).
        """
        image_data = {}
        # A metadata attribute can exist and still be None, so test the value
        # itself rather than relying on hasattr().
        image_base64 = getattr(getattr(image_element, "metadata", None), "image_base64", None)
        if image_base64:
            image_data["image_base64"] = image_base64
            if self.llm_model:
                image_data["description"] = self._describe_image(image_base64)
            else:
                image_data["description"] = "Image description not available (LLM model not configured)"
        return image_data
    
    def _describe_image(self, image_base64: str) -> str:
        """
        Generate a description for an image using Dataiku's LLM integration.
        
        The image is attached as an inline image part of a multipart message so the
        model receives it as an image rather than as a long base64 string in the prompt.
        
        Args:
            image_base64: Base64 encoded image
            
        Returns:
            str: Description of the image, or a human-readable message when no
            description could be produced (this method does not raise).
        """
        try:
            if not self.llm_model:
                return "Image description not available (LLM model not configured)"
            if not image_base64:
                return "Image description not available (no image data)"
            
            # Create a completion request
            completion = self.llm_model.new_completion()
            mp_message = completion.new_multipart_message()
            
            # Add text instructions and the image itself
            mp_message.with_text("Please provide a detailed description of this image, including all visible text, elements, and context.")
            mp_message.with_inline_image(image_base64)
            mp_message.add()
            
            # Execute the completion request
            logger.info("Executing LLM request for image description...")
            resp = completion.execute()
            
            # Extract response text
            if resp.success and hasattr(resp, "text"):
                return resp.text
            logger.warning(f"LLM request failed or unexpected response format: {resp}")
            return "Unable to generate image description"
        except Exception as e:
            logger.error(f"Error describing image: {e}")
            return f"Error generating image description: {str(e)}"
    
    def combine_extracted_data(self, extracted_data: Dict[str, Any]) -> str:
        """
        Combine all extracted data into a single text column.
        
        Sections are emitted in a fixed order: metadata (always), then text, tables
        and images (each only when non-empty).
        
        Args:
            extracted_data: Dictionary containing text, tables, and images
            
        Returns:
            str: Combined extracted data as a single text string
        """
        combined = []
        
        # Add metadata
        metadata = extracted_data.get("metadata", {})
        combined.append("DOCUMENT METADATA:")
        for key, value in metadata.items():
            combined.append(f"{key}: {value}")
        combined.append("\n")
        
        # Add text content
        text_content = extracted_data.get("text", "")
        if text_content:
            combined.append("TEXT CONTENT:")
            combined.append(text_content)
            combined.append("\n")
        
        # Add tables
        tables = extracted_data.get("tables", [])
        if tables:
            combined.append("TABLE CONTENT:")
            for index, table in enumerate(tables, start=1):
                combined.append(f"Table {index}:")
                combined.append(table.get("text", ""))
                combined.append("")
            combined.append("\n")
        
        # Add images with descriptions
        images = extracted_data.get("images", [])
        if images:
            combined.append("IMAGE CONTENT:")
            for index, image in enumerate(images, start=1):
                combined.append(f"Image {index} Description:")
                combined.append(image.get("description", "No description available"))
                combined.append("")
        
        return "\n".join(combined)


class DocumentProcessor:
    """
    Processes documents in a folder by extracting content using the UnstructuredDigitizer.

    Every processed file, successful or not, yields one record with the same set of
    keys (``RESULT_COLUMNS``), so the resulting DataFrame has a stable schema.
    """

    # Column order of the DataFrame returned by process_all_files().
    RESULT_COLUMNS = [
        "file_name",
        "file_path",
        "extracted_text",
        "extracted_tables",
        "extracted_images",
        "extracted_data",
        "metadata",
        "processing_status",
    ]

    def __init__(self, input_folder_id: str, llm_model_id: str = "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet", 
                 max_workers: int = 1):
        """
        Initialize the document processor.
        
        Args:
            input_folder_id: The ID of the Dataiku folder containing the documents
            llm_model_id: The ID of the LLM model to use for image description
            max_workers: Maximum number of worker threads for parallel processing.
                A falsy value (0 / None) falls back to ``os.cpu_count()``.
        """
        self.input_folder_id = input_folder_id
        self.digitizer = UnstructuredDigitizer(input_folder_id, llm_model_id)
        self.max_workers = max_workers or os.cpu_count()
        logger.info(f"Initialized DocumentProcessor with {self.max_workers} workers")
    
    @staticmethod
    def _error_record(file_path: str, message: str) -> Dict[str, Any]:
        """Build an error record carrying the same keys as a successful one."""
        return {
            "file_name": os.path.basename(file_path),
            "file_path": file_path,
            "extracted_text": "",
            "extracted_tables": "[]",
            "extracted_images": "[]",
            "extracted_data": message,
            "metadata": "{}",
            "processing_status": "error"
        }
    
    def process_file(self, file_path: str) -> Dict[str, Any]:
        """
        Process a single document file.
        
        Args:
            file_path: Path to the file within the Dataiku folder
            
        Returns:
            Dict: Processed document data with the keys listed in ``RESULT_COLUMNS``.
            ``processing_status`` is "error" when extraction raised or when the
            digitizer reported an ``error`` entry in the document metadata.
        """
        try:
            file_name = os.path.basename(file_path)
            logger.info(f"Processing file: {file_name}")
            
            # Extract content using the digitizer
            extracted_data = self.digitizer.extract_content(file_path)
            
            # Combine all extracted data into a single text field
            combined_data = self.digitizer.combine_extracted_data(extracted_data)
            
            # The digitizer catches its own exceptions and signals them through
            # metadata["error"]; surface that instead of reporting a false success.
            metadata = extracted_data.get("metadata", {})
            extraction_failed = isinstance(metadata, dict) and bool(metadata.get("error"))
            
            # Prepare result
            result = {
                "file_name": file_name,
                "file_path": file_path,
                "extracted_text": str(extracted_data.get("text", "")),
                "extracted_tables": str(extracted_data.get("tables", [])),
                "extracted_images": str(extracted_data.get("images", [])),
                "extracted_data": combined_data,
                "metadata": str(metadata),
                "processing_status": "error" if extraction_failed else "success"
            }
            
            if extraction_failed:
                logger.warning(f"Extraction reported an error for file: {file_name}")
            else:
                logger.info(f"Completed processing file: {file_name}")
            return result
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}")
            return self._error_record(file_path, f"Error processing file: {str(e)}")
    
    def process_all_files(self, file_list: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Process all files in the input folder in parallel.
        
        Args:
            file_list: List of file paths to process. If None, all files in the folder are processed.
            
        Returns:
            pd.DataFrame: One row per input file, in the same order as ``file_list``,
            with the columns in ``RESULT_COLUMNS`` (also when the list is empty).
        """
        # If no file list is provided, get all files from the folder
        if file_list is None:
            data_source = dataiku.Folder(self.input_folder_id)
            file_list = data_source.list_paths_in_partition()
        file_list = list(file_list)
        
        logger.info(f"Starting parallel processing of {len(file_list)} files with {self.max_workers} workers")
        processed_data = []
        
        # Use ThreadPoolExecutor for parallel processing
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit everything first so the work runs concurrently ...
            futures = [executor.submit(self.process_file, file_path) for file_path in file_list]
            
            # ... then collect in submission order so rows line up with file_list
            # regardless of which worker finishes first.
            for file_path, future in zip(file_list, futures):
                try:
                    processed_data.append(future.result())
                    logger.info(f"Added results for {os.path.basename(file_path)}")
                except Exception as e:
                    logger.error(f"Exception processing {file_path}: {e}")
                    # Add error information to the results
                    processed_data.append(
                        self._error_record(file_path, f"Error in parallel processing: {str(e)}")
                    )
        
        logger.info(f"Completed processing all {len(file_list)} files")
        return pd.DataFrame(processed_data, columns=self.RESULT_COLUMNS)




In [0]:
def main(input_folder: str = "Input",
         output_dataset: str = "tripadvisor_hotel_reviews_summarized",
         llm_model_id: str = "custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet",
         max_workers: int = 1):
    """
    Main function to run the document digitization pipeline.
    Example usage in a Dataiku recipe.

    All settings are parameters whose defaults match the values this notebook was
    run with, so ``main()`` behaves as before. A recipe can pass values read from
    ``dataiku.get_custom_variables()`` instead of editing this function.

    Args:
        input_folder: ID of the Dataiku folder containing the documents
        output_dataset: Name of the Dataiku dataset the results are written to
        llm_model_id: ID of the LLM model used for image descriptions
        max_workers: Number of worker threads used by the DocumentProcessor
    """
    logging.basicConfig(level=logging.INFO)
    
    # Initialize the document processor
    processor = DocumentProcessor(
        input_folder_id=input_folder,
        llm_model_id=llm_model_id,
        max_workers=max_workers
    )
    
    # Process all documents
    results_df = processor.process_all_files()
    
    # Show a per-status count rather than printing the whole frame: the rows hold
    # full extracted text (and stringified image records), which floods the output.
    if "processing_status" in results_df.columns:
        print(results_df["processing_status"].value_counts().to_string())
    
    # Write results to the output dataset
    output = dataiku.Dataset(output_dataset)
    output.write_with_schema(results_df)
    
    logger.info(f"Document digitization pipeline completed. Processed {len(results_df)} files.")


# Entry-point guard: run the pipeline only when this code is executed as the
# top-level program, not when it is imported from another module.
if __name__ == "__main__":
    main()

2025-04-29 02:07:39,888 - INFO - Successfully initialized LLM model: custom:iliad-plugin-conn-prod:Claude_3_5_Sonnet
2025-04-29 02:07:39,889 - INFO - Initialized DocumentProcessor with 1 workers
2025-04-29 02:07:39,925 - INFO - Starting parallel processing of 71 files with 1 workers
2025-04-29 02:07:39,926 - INFO - Processing file: RC Cover Email - Peds.pdf
  from .autonotebook import tqdm as notebook_tqdm
2025-04-29 02:07:42,832 - INFO - pikepdf C++ to Python logger bridge initialized
2025-04-29 02:07:45,028 - INFO - Reading PDF for file: /tmp/tmppphrnrsa/document.pdf ...
2025-04-29 02:08:14,685 - INFO - Completed processing file: RC Cover Email - Peds.pdf
2025-04-29 02:08:14,686 - INFO - Processing file: Phase 1 Requirements - PSIT Patient Journey AI Model.docx
2025-04-29 02:08:14,686 - INFO - Added results for RC Cover Email - Peds.pdf
2025-04-29 02:08:15,482 - INFO - Completed processing file: Phase 1 Requirements - PSIT Patient Journey AI Model.docx
2025-04-29 02:08:15,483 - INFO 

2025-04-29 02:12:42,162 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,171 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,179 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,188 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,196 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,204 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,212 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,221 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,229 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,237 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,245 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,253 - INFO - Executing LLM request for image description...
2025-04-29 02:12:42,261 - INFO - Executi

  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
2025-04-29 02:16:40,363 - INFO - Successfully processed Excel file: PSIT-Rinvoq-AD-Data Triggers and Personalization.xlsx
2025-04-29 02:16:40,364 - INFO - Completed processing file: PSIT-Rinvoq-AD-Data Triggers and Personalization.xlsx
2025-04-29 02:16:40,364 - INFO - Processing file: Pharmacy Services Call Flows.pdf
2025-04-29 02:16:40,364 - INFO - Added results for PSIT-Rinvoq-AD-Data Triggers and Personalization.xlsx
2025-04-29 02:16:40,637 - INFO - Reading PDF for file: /tmp/tmpc4uf4i23/document.pdf ...
2025-04-29 02:16:51,070 - INFO - Executing LLM request for image description...
2025-04-29 02:16:51,083 - INFO - Executing LLM request for image description...
2025-04-29 02:16:51,093 - INFO - Executing LLM request for image description...
2025-04-29 02:16:51,103 - INFO - Completed processing file: Pharmacy Services Call Flows.pdf
2025-04-29 02:16:51,104 - INFO - Processing file: RC I Can't

2025-04-29 02:19:55,551 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,560 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,569 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,578 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,587 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,596 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,604 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,612 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,621 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,630 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,639 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,647 - INFO - Executing LLM request for image description...
2025-04-29 02:19:55,656 - INFO - Executi

2025-04-29 02:22:17,865 - INFO - Processing file: Pharmacy Solutions Delivery Text Messages.xlsx
2025-04-29 02:22:17,865 - INFO - Added results for RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1).pdf
2025-04-29 02:22:17,934 - INFO - Processing Excel file: Pharmacy Solutions Delivery Text Messages.xlsx
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
2025-04-29 02:22:17,951 - INFO - Successfully processed Excel file: Pharmacy Solutions Delivery Text Messages.xlsx
2025-04-29 02:22:17,951 - INFO - Completed processing file: Pharmacy Solutions Delivery Text Messages.xlsx
2025-04-29 02:22:17,952 - INFO - Processing file: RINVOQ Complete SMS Wave 1 Doctor Follow-Up Use Case.pdf
2025-04-29 02:22:17,952 - INFO - Added results for Pharmacy Solutions Delivery Text Messages.xlsx
2025-04-29 02:22:18,420 - INFO - Reading PDF for file: /tmp/tmpjrn0oz1r/document.pdf ...
2025-04-29 02:22:28,180 - INFO - Executing LLM request for image des

2025-04-29 02:25:47,187 - INFO - Executing LLM request for image description...
2025-04-29 02:25:47,196 - INFO - Executing LLM request for image description...
2025-04-29 02:25:47,206 - INFO - Executing LLM request for image description...
2025-04-29 02:25:47,214 - INFO - Executing LLM request for image description...
2025-04-29 02:25:47,223 - INFO - Executing LLM request for image description...
2025-04-29 02:25:47,232 - INFO - Completed processing file: Rcomplete_App Leave Behind_Print_Savings Card T&C Update.pdf
2025-04-29 02:25:47,233 - INFO - Processing file: ARS RC Intro to ARS Email (from ARS).pdf
2025-04-29 02:25:47,233 - INFO - Added results for Rcomplete_App Leave Behind_Print_Savings Card T&C Update.pdf
2025-04-29 02:25:48,731 - INFO - Reading PDF for file: /tmp/tmpmcaiaqre/document.pdf ...
2025-04-29 02:26:24,019 - INFO - Executing LLM request for image description...
2025-04-29 02:26:24,032 - INFO - Executing LLM request for image description...
2025-04-29 02:26:24,043 - I

# Using Hugging Face

In [0]:
# Quick inspection of one file from the "Input" folder.
input_folder = dataiku.Folder("Input")
paths = input_folder.list_paths_in_partition()
# paths[1:2] selects only the second listed file (and nothing if fewer than two exist).
for path in paths[1:2]:
    print(path)
    with input_folder.get_download_stream(path) as f:
        data = f.read()
    # Print the size and a short preview instead of the full raw bytes, which
    # would flood the notebook output for any real document.
    print(f"{len(data)} bytes; first 200 bytes: {data[:200]!r}")