In [4]:
# Step 1: Install OpenAI SDK
#!pip install --upgrade openai

# Step 2: Import required libraries
import base64
import mimetypes
import re
from getpass import getpass

import openai
from google.colab import files

# Step 3: Define the OCR + TTS function
# The speech endpoint limits the length of a single `input`; longer text is
# split into several requests of at most this many characters.
_TTS_MAX_CHARS = 4096


def _image_data_url(image_path):
    """Return the file at *image_path* encoded as a base64 ``data:`` URL.

    The MIME type is guessed from the file extension so a PNG upload is not
    labelled as JPEG; unknown or non-image guesses fall back to
    ``image/jpeg``.
    """
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    with open(image_path, "rb") as img_file:
        encoded = base64.b64encode(img_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


def _ask_gpt4o_about_image(system_prompt, data_url, max_tokens):
    """Send one image to GPT-4o under *system_prompt* and return the reply text.

    The reply is stripped of surrounding whitespace; a missing reply body is
    returned as an empty string.
    """
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": data_url}},
            ]},
        ],
        max_tokens=max_tokens,
    )
    # `content` is optional in the SDK's response type, so guard against None
    # before calling .strip().
    return (response.choices[0].message.content or "").strip()


def _parse_word_estimate(reply):
    """Return the first integer found in *reply*, or ``None`` if there is none.

    Thousands separators are accepted ("1,250" -> 1250). Only the first
    number is used, so a range such as "150-200" yields 150 rather than the
    digits of both bounds run together.
    """
    match = re.search(r"\d+(?:,\d{3})*", reply)
    if match is None:
        return None
    return int(match.group().replace(",", ""))


def _split_for_tts(text, limit=_TTS_MAX_CHARS):
    """Split *text* into chunks of at most *limit* characters.

    Breaks are made at the last space or newline that fits; a run with no
    such break point is cut hard at *limit*. Empty or whitespace-only text
    yields an empty list.
    """
    chunks = []
    remaining = text.strip()
    while len(remaining) > limit:
        # Look one character past the limit so a separator sitting exactly
        # at the boundary still counts as a valid break point.
        window = remaining[:limit + 1]
        cut = max(window.rfind(" "), window.rfind("\n"))
        if cut <= 0:
            cut = limit
        chunks.append(remaining[:cut].rstrip())
        remaining = remaining[cut:].lstrip()
    if remaining:
        chunks.append(remaining)
    return chunks


def run_ocr_to_audio():
    """Interactively OCR an uploaded image with GPT-4o and read it aloud.

    Steps: prompt for an OpenAI API key, request an image upload through
    Colab, ask GPT-4o for a word-count estimate and for the full text, print
    both, then synthesise the text with ``tts-1-hd`` (voice ``nova``), save
    it as ``output_audio.mp3`` and trigger a download.

    Returns:
        None. Results are printed and written to disk. The function returns
        early (after printing a notice) when nothing is uploaded or when no
        text could be extracted.
    """
    # Securely get OpenAI API Key (getpass keeps it out of the cell output)
    openai.api_key = getpass("🔐 Enter your OpenAI API Key: ")

    # Upload the image
    print("📤 Upload an image file (JPG or PNG)...")
    uploaded = files.upload()
    if not uploaded:
        # The upload dialog was cancelled; indexing the result would fail.
        print("⚠️ No file was uploaded; nothing to do.")
        return
    image_path = next(iter(uploaded))
    data_url = _image_data_url(image_path)

    # Estimate total visible words in image using GPT-4o
    print("🔎 Estimating total visible words in the image...")
    estimate_reply = _ask_gpt4o_about_image(
        "You are a visual inspector. Estimate the total number of readable words in this image.",
        data_url,
        max_tokens=50,
    )
    estimated_words = _parse_word_estimate(estimate_reply)
    total_words_estimated = "Unknown" if estimated_words is None else estimated_words

    # Extract text using GPT-4o
    print("📝 Extracting text from the image using GPT-4o...")
    extracted_text = _ask_gpt4o_about_image(
        "You are an OCR expert. Extract all readable text from the image.",
        data_url,
        max_tokens=2048,
    )
    extracted_word_count = len(extracted_text.split())

    # Display result
    print("\n🧾 OCR Result:")
    print("----------------------------------------------------")
    print(extracted_text)
    print("----------------------------------------------------")
    print(f"🔢 Estimated Words in Image: {total_words_estimated}")
    print(f"✅ Words Extracted by GPT-4o: {extracted_word_count}")

    if not extracted_text:
        # Do not send an empty input to the speech endpoint.
        print("⚠️ No text was extracted; skipping speech synthesis.")
        return

    # Convert extracted text to audio using TTS-1 HD
    print("🎙️ Converting extracted text to speech...")
    # All segments are synthesised before anything is written, so a failed
    # request does not leave a partial file behind. The MP3 segments are
    # simply written back to back into one file.
    audio_segments = [
        openai.audio.speech.create(
            model="tts-1-hd",
            voice="nova",
            input=chunk,
        ).content
        for chunk in _split_for_tts(extracted_text)
    ]

    # Save the audio output
    output_audio_file = "output_audio.mp3"
    with open(output_audio_file, "wb") as f:
        f.write(b"".join(audio_segments))

    print("✅ Audio saved as:", output_audio_file)
    files.download(output_audio_file)

# Step 4: Call the function
# Runs the whole interactive flow as soon as this cell executes:
# API-key prompt, image upload, OCR, text-to-speech and download.
run_ocr_to_audio()


🔐 Enter your OpenAI API Key: ··········
📤 Upload an image file (JPG or PNG)...


Saving book_Page2.jpg to book_Page2.jpg
🔎 Estimating total visible words in the image...
📝 Extracting text from the image using GPT-4o...

🧾 OCR Result:
----------------------------------------------------
understand why thousands of ambitious people hadn't already discovered it. It was this: I would study how the famous actors of that day—John Drew, Walter Hampden, and Otis Skinner—got their effects. Then I would imitate the best point of each one of them and make myself into a shining, triumphant combination of all of them. How silly! How absurd! I had to waste years of my life imitating other people before it penetrated through my thick Missouri skull that I had to be myself, and that I couldn't possibly be anyone else.

That distressing experience ought to have taught me a lasting lesson. But it didn't. Not me. I was too dumb. I had to learn it all over again. Several years later, I set out to write what I hoped would be the best book on public speaking for business men that had ev

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>