# Insta Caption Generator

Load your HF API key and relevant Python libraries

In [1]:
# Standard-library, display and image helpers used throughout the notebook.
import os
import io
from IPython.display import Image, display, HTML
# NOTE: this rebinds `Image` to PIL's `Image` module, shadowing the
# IPython.display `Image` imported on the previous line. Later cells rely on
# the PIL binding (`Image.open(...)`), so the order of these two lines matters.
from PIL import Image
import base64 

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Raises KeyError if HF_API_KEY is still missing from the environment after
# the .env file has been loaded.
hf_api_key = os.environ['HF_API_KEY']

In [2]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


#### Downloading the model for the first time

In [7]:
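# One-time setup: uncomment this cell to download the BLIP model and save it to the local directory used by the loading cell.
# from transformers import BlipProcessor, BlipForConditionalGeneration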

# # Specify your local directory where you want to save the model and tokenizer
# save_directory = "./blip_image_captioning_large"

# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# # Save the model and tokenizer to the directory
# processor.save_pretrained(save_directory)
# model.save_pretrained(save_directory)


#### Loading the model from the local directory

In [3]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# Local directory that holds the saved processor and model files.
load_directory = "./blip_image_captioning_large"

# Restore the processor first, then the captioning model, and finally move
# the model onto the GPU.
processor = BlipProcessor.from_pretrained(load_directory)
model = BlipForConditionalGeneration.from_pretrained(load_directory)
model = model.to("cuda")


### Inference

In [4]:
# Image from the web
img_url = 'https://cdn.omahaschoolofmusicanddance.com/wp-content/uploads/2020/01/21095647/playing-piano-blog.jpg'

# Bound the wait with a timeout and surface HTTP errors (404, 403, ...) as an
# HTTPError here, instead of handing an error page to PIL and getting an
# opaque "cannot identify image file" failure. Reading `.content` rather than
# the raw socket stream also lets requests undo any gzip/deflate encoding
# before the bytes reach the image decoder.
img_response = requests.get(img_url, timeout=30)
img_response.raise_for_status()
raw_image = Image.open(io.BytesIO(img_response.content)).convert('RGB')

In [5]:
# Replaces the `raw_image` loaded from the web in the previous cell with a
# local photo, converted to RGB.
# NOTE(review): the absolute path is specific to the author's machine — point
# it at your own file before running.
raw_image = Image.open('/home/thor_01/Downloads/IMG_6063.jpg').convert('RGB')

In [12]:
# Conditional image captioning: the text prefix is passed to the processor
# alongside the image, and the resulting tensors are moved to the GPU.
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")
inputs = inputs.to("cuda")

# Sampling is disabled; length bounds and a repetition penalty shape the output.
out = model.generate(
    **inputs,
    min_length=15,
    max_length=100,
    do_sample=False,
    repetition_penalty=1.5,
)
output_text = processor.decode(out[0], skip_special_tokens=True)
print(output_text)


# Unconditional variant (no text prefix), kept commented out for reference:
# inputs = processor(raw_image, return_tensors="pt").to("cuda")
# out = model.generate(**inputs, min_length=5, max_length=100, do_sample=False, repetition_penalty=1.5)
# output_text = processor.decode(out[0], skip_special_tokens=True)
# print(output_text)

a photography of an empty street with buildings and trees in the background,


there is a street with buildings and trees on both sides


# Loading LLM Model

In [9]:
from ctransformers import AutoModelForCausalLM


In [10]:
# Load the local GGUF chat model through ctransformers.
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# NOTE(review): the path argument names a "Q5_K_M" file while `model_file`
# names a "q4_K_M" file — confirm which quantization is actually intended.
# NOTE(review): the absolute path is specific to the author's machine.
llm = AutoModelForCausalLM.from_pretrained("/home/thor_01/Documents/05_AI_app_Gradio/llama-2-13b-chat.Q5_K_M.gguf", model_file="llama-2-13b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)


### Creating the prompt template

In [13]:

# Build the LLM prompt from the BLIP caption produced earlier (`output_text`).
# "philosophical" and "hashtags" are spelled correctly here so the instruction
# matches the wording used by the Gradio version of this prompt further down.
prompt = f"""
Create 3 alternatives for a philosophical caption of maximum 80 tokens for an instagram post based on the following photo description, add 5 relevant hashtags to each option:
'{output_text}'
"""
response = llm(prompt)
print(response)

option one
'life is full of paths, we choose which ones to follow, some lead to happiness while others to loneliness.' #philosophy #emptystreet #lifelessons 
option two
'the world may seem empty, but our imagination can fill it with endless possibilities' #creativity #imagination #emptyworld
option three
'sometimes the most beautiful things are found in the quietest places' #minimalism # simplicity #emptystreet


### Adding interface

In [15]:
def get_answer(raw_image, style=None):
    """Caption an image with BLIP, then ask the LLM for Instagram captions.

    Args:
        raw_image: Image to describe, in a form the BLIP `processor` accepts.
            When it is `None` (no image supplied) the models are not called.
        style: Adjective describing the caption tone, e.g. "humorous".
            Falls back to "philosophical" when omitted or empty, so the
            function can also be wired to an interface with a single image
            input.

    Returns:
        The value returned by `llm(prompt)`, or a short message asking for an
        image when `raw_image` is `None`.
    """
    # Compare against None explicitly: array-like images have no single truth value.
    if raw_image is None:
        return "Please provide an image."
    # Honour the caller's style instead of overwriting it with a fixed value.
    if not style:
        style = "philosophical"

    # 1. Conditional image captioning, seeded with a fixed text prefix.
    text = "a photography of"
    inputs = processor(raw_image, text, return_tensors="pt").to("cuda")

    out = model.generate(**inputs, min_length=15, max_length=100, do_sample=False, repetition_penalty=1.5)
    output_text = processor.decode(out[0], skip_special_tokens=True)

    # 2. Build the prompt around the generated description and query the LLM.
    prompt = f"""
    Create 3 alternatives for a {style} caption of maximum 80 tokens for an instagram post based on the following photo description, add 5 relevant hashtags to each option:
    '{output_text}'
    """
    response = llm(prompt)
    return response


In [17]:
import gradio as gr

    
# Close any Gradio servers started by earlier runs before creating a new one.
gr.close_all()

# Single-image UI: one image input, one multi-line text box for the result.
demo = gr.Interface(
    fn=get_answer,
    inputs=[gr.Image(label="Image to get capture")],
    outputs=[gr.Textbox(label="Result", lines=5)],
    title="Image to Poetic Caption",
    description="Get poetic caption for any image",
)
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [19]:
import gradio as gr

def get_answer(raw_image, style=None):
    """Generate Instagram caption options for an image in the requested style.

    Args:
        raw_image: Image to describe, in a form the BLIP `processor` accepts.
            When it is `None` (nothing uploaded) the models are not called.
        style: Caption tone chosen by the user, e.g. "humorous". Falls back
            to "philosophical" when nothing was selected, so the prompt never
            contains the literal text "None".

    Returns:
        The value returned by `llm(prompt)`, or a short message asking for an
        image when `raw_image` is `None`.
    """
    # Compare against None explicitly: array-like images have no single truth value.
    if raw_image is None:
        return "Please provide an image."
    # A dropdown without a selection passes None; substitute a sensible default.
    if not style:
        style = "philosophical"

    # 1. Conditional image captioning, seeded with a fixed text prefix.
    text = "a photography of"
    inputs = processor(raw_image, text, return_tensors="pt").to("cuda")

    out = model.generate(**inputs, min_length=15, max_length=100, do_sample=False, repetition_penalty=1.5)
    output_text = processor.decode(out[0], skip_special_tokens=True)

    # 2. Build the prompt around the generated description and query the LLM.
    prompt = f"""
    Create 3 alternatives for a {style} caption of maximum 80 tokens for an Instagram post based on the following photo description, add 5 relevant hashtags to each option:
    '{output_text}'
    """
    response = llm(prompt)
    return response

# Close any Gradio servers started by earlier runs before creating a new one.
gr.close_all()

# Two inputs, handed positionally to get_answer(raw_image, style); one text output.
demo = gr.Interface(
    fn=get_answer,
    inputs=[
        gr.Image(label="Image to get capture"),
        gr.Dropdown(
            choices=[
                "philosophical",
                "humorous",
                "sarcastic",
                "romantic",
                "adventurous",
                "mystical",
                "nostalgic",
                "inspirational",
                "whimsical",
            ],
            label="Caption Style",
        ),
    ],
    outputs=[gr.Textbox(label="Result", lines=5)],
    title="Image to Poetic Caption",
    description="Get poetic caption for any image",
)

# Start the local web server for the demo.
demo.launch()

Closing server running on port: 7860
Closing server running on port: 7860
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


