In [0]:
%pylab inline

In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Example: load a DSS dataset as a Pandas dataframe
mydataset = dataiku.Dataset("mydataset")
mydataset_df = mydataset.get_dataframe()

In [1]:
import os
import pandas as pd
import dataiku
from PIL import Image
import pytesseract
import re

class ImageTextExtractor:
    """
    A class to extract text from images stored in a Dataiku managed folder
    and write the results to a Dataiku dataset using a DataFrame.
    Only images with extracted text containing more than 5 words are included.
    """
    
    def __init__(self, input_folder_name, output_dataset_name):
        """
        Initialize the extractor with input folder and output dataset names.
        
        Args:
            input_folder_name: Name of the Dataiku managed folder containing images
            output_dataset_name: Name of the Dataiku dataset to write results to
        """
        self.input_folder = dataiku.Folder(input_folder_name)
        self.output_dataset = dataiku.Dataset(output_dataset_name)
        
    def extract_text_from_image(self, image_path):
        """
        Extract text from a single image using pytesseract OCR.
        
        Args:
            image_path: Path to the image file
            
        Returns:
            Extracted text as string with cleaned whitespace
        """
        try:
            image = Image.open(image_path)
            text = pytesseract.image_to_string(image)
            
            # Clean up the extracted text
            # Replace multiple whitespaces with a single space
            cleaned_text = re.sub(r'\s+', ' ', text)
            # Remove leading and trailing whitespace
            cleaned_text = cleaned_text.strip()
            
            return cleaned_text
        except Exception as e:
            print(f"Error extracting text from {image_path}: {str(e)}")
            return ""
    
    def process_all_images(self):
        """
        Process all images in the input folder and create a DataFrame with the results.
        Only include images with more than 5 words in the extracted text.
        Then write the DataFrame to the output dataset.
        """
        # Get list of files in the managed folder
        file_list = self.input_folder.list_paths_in_partition()
        
        # Prepare data for DataFrame
        image_names = []
        extracted_texts = []
        
        # Track statistics
        total_images = 0
        included_images = 0
        
        # Process each image file
        for file_path in file_list:
            if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
                total_images += 1
                # Get the image name from the path
                image_name = os.path.basename(file_path)
                
                # Get the full path for the file in the managed folder
                with self.input_folder.get_download_stream(file_path) as stream:
                    # Save temporarily
                    temp_path = f"/tmp/{image_name}"
                    with open(temp_path, 'wb') as f:
                        f.write(stream.read())
                    
                    # Extract text
                    extracted_text = self.extract_text_from_image(temp_path)
                    
                    # Clean up
                    os.remove(temp_path)
                
                # Check if extracted text has more than 5 words
                word_count = len(extracted_text.split())
                if word_count > 5:
                    # Append to lists
                    image_names.append(image_name)
                    extracted_texts.append(extracted_text)
                    included_images += 1
                    print(f"Processed {image_name} - {word_count} words - INCLUDED")
                else:
                    print(f"Processed {image_name} - {word_count} words - SKIPPED (less than 5 words)")
        
        # Create DataFrame
        results_df = pd.DataFrame({
            "image_name": image_names,
            "extracted_text": extracted_texts,
            "word_count": [len(text.split()) for text in extracted_texts]
        })
        
        # Write DataFrame to output dataset
        self.output_dataset.write_with_schema(results_df)
        
        print(f"Total images processed: {total_images}")
        print(f"Images included in dataset: {included_images}")
        print(f"Images excluded (less than 5 words): {total_images - included_images}")
        
        return results_df

# Example usage:
def main():
    # Replace with your actual folder and dataset names
    extractor = ImageTextExtractor(
        input_folder_name="input_images_extracted_custom",
        output_dataset_name="images_data_extracted"
    )
    
    df = extractor.process_all_images()
    print(f"Text extraction completed! Added {len(df)} images to dataset.")

if __name__ == "__main__":
    main()

Processed HCP Affordability Bridge Eligible Letter_B2H_image1.jpeg - 191 words - INCLUDED
Processed HCP Affordability Bridge Eligible Letter_B2H_image2.png - 2 words - SKIPPED (less than 5 words)
Processed HCP Affordability Bridge Eligible Letter_B2H_image3.png - 0 words - SKIPPED (less than 5 words)
Processed Phase 1 Requirements - PSIT Patient Journey AI Model_image1.jpeg - 101 words - INCLUDED
Processed Phase 1 Requirements - PSIT Patient Journey AI Model_image2.png - 28 words - INCLUDED
Processed Phase 1 Requirements - PSIT Patient Journey AI Model_image3.png - 50 words - INCLUDED
Processed Phase 1 Requirements - PSIT Patient Journey AI Model_image4.png - 85 words - INCLUDED
Processed Phase 1 Requirements - PSIT Patient Journey AI Model_image5.png - 48 words - INCLUDED
Processed AMB RC Intro to ARS Email_page1_image1.png - 2451 words - INCLUDED
Processed ARTS RC I Can't Reach You Email_page1_image1.png - 2138 words - INCLUDED
Processed ARS RC Sorry I Missed You Email_page1_image1.p

Processed 2025-02-26-Disruption Workshop_slide7_image6.png - 0 words - SKIPPED (less than 5 words)
Processed 2025-01-21 Bridge Program Workshop_slide7_image1.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/Rcomplete_App Leave Behind_Print_Savings Card T&C Update_page1_image5.png: cannot identify image file '/tmp/Rcomplete_App Leave Behind_Print_Savings Card T&C Update_page1_image5.png'
Processed Rcomplete_App Leave Behind_Print_Savings Card T&C Update_page1_image5.png - 0 words - SKIPPED (less than 5 words)
Processed Bridge Status field in iEngage-Quick Reference Guide_slide1_image36.png - 0 words - SKIPPED (less than 5 words)
Processed Draft_PSIT_Patient_Journey_AI_Model_slide14_image2.png - 0 words - SKIPPED (less than 5 words)
Processed 2025-01-21 Bridge Program Workshop_slide6_image29.png - 9 words - INCLUDED
Processed 2025-02-26-Disruption Workshop_slide8_image3.png - 0 words - SKIPPED (less than 5 words)
Processed Bridge Status field in iEngage-Quick R

Processed Screenshot 2025-04-15 170036.png - 130 words - INCLUDED
Processed 2025-02-26-Disruption Workshop_slide19_image3.png - 394 words - INCLUDED
Error extracting text from /tmp/Bridge Workshop Deck_page3_image6.png: cannot identify image file '/tmp/Bridge Workshop Deck_page3_image6.png'
Processed Bridge Workshop Deck_page3_image6.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image3.png: cannot identify image file '/tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image3.png'
Processed RC My RINVOQ Routine Print - CD LC Update_page1_image3.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image4.png: cannot identify image file '/tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image4.png'
Processed RC My RINVOQ Routine Print - CD LC Update_page1_image4.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/

Processed 2024-12-2 Bridge Workshop Agenda and Insights_slide7_image1.png - 776 words - INCLUDED
Error extracting text from /tmp/Rinvoq AD PE Tactics Business Design Executive Summaries_slide4_image3.png: Unsupported image format/type
Processed Rinvoq AD PE Tactics Business Design Executive Summaries_slide4_image3.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image9.png: cannot identify image file '/tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image9.png'
Processed RC My RINVOQ Routine Print - CD LC Update_page1_image9.png - 0 words - SKIPPED (less than 5 words)
Processed RIN AD Patient Engagement System Use Case Overview_page1_image2.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image10.png: cannot identify image file '/tmp/RC My RINVOQ Routine Print - CD LC Update_page1_image10.png'
Processed RC My RINVOQ Routine Print - CD

Processed Rinvoq AD PE Tactics Business Design Executive Summaries_slide9_image17.png - 7 words - INCLUDED
Processed Pre Fill PA Summary 1_slide2_image3.png - 1 words - SKIPPED (less than 5 words)
Processed 5_6_FAS CSS IS  Overview_Lunch & Learn_slide18_image3.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RC My RINVOQ Routine Print - CD LC Update_page2_image5.png: cannot identify image file '/tmp/RC My RINVOQ Routine Print - CD LC Update_page2_image5.png'
Processed RC My RINVOQ Routine Print - CD LC Update_page2_image5.png - 0 words - SKIPPED (less than 5 words)
Processed RIN AD Patient Engagement System Use Case Overview_page1_image4.png - 0 words - SKIPPED (less than 5 words)
Processed Rinvoq AD PA Denied Discovery_slide2_image7.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/Rinvoq AD PA Denied Discovery_slide4_image3.png: Unsupported image format/type
Processed Rinvoq AD PA Denied Discovery_slide4_image3.png - 0 words - SKIP

Processed RIN AD Patient Engagement System Use Case Overview_slide3_image24.png - 9 words - INCLUDED
Processed RIN AD Patient Engagement System Use Case Overview_slide5_image5.png - 1 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1)_page4_image3.png: cannot identify image file '/tmp/RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1)_page4_image3.png'
Processed RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1)_page4_image3.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1)_page4_image4.png: cannot identify image file '/tmp/RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1)_page4_image4.png'
Processed RComplete_Kaiser Rebate Brochure_Print_Savings Card T&C Update (1)_page4_image4.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RC My RINV

Processed 2024-12-2 Bridge Workshop Agenda and Insights_slide12_image9.png - 0 words - SKIPPED (less than 5 words)
Processed 2024-12-2 Bridge Workshop Agenda and Insights_slide13_image33.png - 0 words - SKIPPED (less than 5 words)
Processed 2024-12-2 Bridge Workshop Agenda and Insights_slide13_image34.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image2.png: cannot identify image file '/tmp/Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image2.png'
Processed Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image2.png - 0 words - SKIPPED (less than 5 words)
Processed RIN AD Patient Engagement System Use Case Overview_page4_image3.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/2024-12-2 Bridge Workshop Agenda and Insights_slide17_image37.png: Unsupported image format/type
Processed 2024-12-2 Bridge Work

Processed 2024-12-2 Bridge Workshop Agenda and Insights_slide30_image15.png - 1 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image7.png: cannot identify image file '/tmp/Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image7.png'
Processed Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image7.png - 0 words - SKIPPED (less than 5 words)
Processed RIN AD Patient Engagement System Use Case Overview_page4_image6.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image8.png: cannot identify image file '/tmp/Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image8.png'
Processed Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page3_image8.png - 0 words - SKIPPED (less than 5 words)
Error extracting

Processed RIN AD Patient Engagement System Use Case Overview_page4_image14.png - 0 words - SKIPPED (less than 5 words)
Error extracting text from /tmp/RComplete_Intro to AD Adol Brochure_Print_Savings Card T&C Update (1)_page8_image1.png: cannot identify image file '/tmp/RComplete_Intro to AD Adol Brochure_Print_Savings Card T&C Update (1)_page8_image1.png'
Processed RComplete_Intro to AD Adol Brochure_Print_Savings Card T&C Update (1)_page8_image1.png - 0 words - SKIPPED (less than 5 words)
Processed RIN AD Patient Engagement System Use Case Overview_page4_image15.png - 0 words - SKIPPED (less than 5 words)
Processed RIN AD Bridge Program Optimization Starter Document_slide21_image3.png - 232 words - INCLUDED
Processed RIN AD Patient Engagement System Use Case Overview_page4_image16.png - 0 words - SKIPPED (less than 5 words)
Processed Rcomplete_Ambassador Services Brochure_Print_Savings Card T&C Update_page5_image1.png - 0 words - SKIPPED (less than 5 words)
Processed Rcomplete_Ambas