In [4]:
import kagglehub

# Fetch the paired real/Ghibli image dataset through kagglehub and print the
# local directory that the download call returns.
DATASET_HANDLE = "shubham1921/real-to-ghibli-image-dataset-5k-paired-images"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shubham1921/real-to-ghibli-image-dataset-5k-paired-images?dataset_version_number=1...


100%|██████████| 543M/543M [00:05<00:00, 104MB/s]  

Extracting files...





Path to dataset files: /home/ubuntu/.cache/kagglehub/datasets/shubham1921/real-to-ghibli-image-dataset-5k-paired-images/versions/1


In [None]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
import shutil
import torch
from vllm import LLM, SamplingParams

# Select the "spawn" start method for vLLM workers through its environment
# variable before the engine below is built.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

# Use bfloat16 when the GPU reports support for it, otherwise float16.
if torch.cuda.is_bf16_supported():
    model_dtype = "bfloat16"
else:
    model_dtype = "float16"

# Vision-language captioning engine; prompts are limited to one image each.
llm = LLM(
    model=MODEL_ID,
    dtype=model_dtype,
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

# Source images to caption, and the folder that receives a copy of each image
# together with a same-named .txt caption.
# NOTE(review): this reads ./dataset/images rather than the `path` returned by
# the kagglehub download cell — confirm the dataset was copied or linked here.
input_image_dir = "./dataset/images"
output_description_dir = "ghibli_style_descriptions"
os.makedirs(output_description_dir, exist_ok=True)

image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".gif")

# os.listdir returns entries in arbitrary order; sorting keeps the file list
# (and therefore the batches built from it) identical from run to run.
files = [
    os.path.join(input_image_dir, f)
    for f in sorted(os.listdir(input_image_dir))
    if f.lower().endswith(image_extensions)
]

if not files:
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, which would also throw away the model loaded above.
    raise FileNotFoundError(f"No images found in {input_image_dir}.")

print(f"Found {len(files)} images.")

BATCH_SIZE = 8

# Consecutive chunks of at most BATCH_SIZE paths. Splitting into
# len(files) // BATCH_SIZE equal sections (as np.array_split does) can produce
# batches larger than BATCH_SIZE, e.g. 15 files would become one batch of 15.
batches = [
    files[start:start + BATCH_SIZE]
    for start in range(0, len(files), BATCH_SIZE)
]

sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=128)

# The instruction is the same for every image, so it is built once.
# NOTE(review): hand-written turn markers — confirm they match the chat
# template expected by the model.
prompt = "User: <image>\nPlease provide a detailed description of the image including its style and the way it is drawn. Its drawing style is ghibli style. So give description in accordance with that.\nAssistant:"

for batch_files in tqdm(batches, desc="Processing batches"):
    requests = []

    for image_path in batch_files:
        # convert() returns a new in-memory image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        requests.append({"prompt": prompt, "multi_modal_data": {"image": image}})

    outputs = llm.generate(requests, sampling_params)

    # Pair each result with its source path by position; this assumes
    # llm.generate returns results in request order (the index-based lookup
    # this replaces relied on the same assumption).
    for image_path, output in zip(batch_files, outputs):
        description = output.outputs[0].text.strip()

        # Prefix the caption with the style token.
        description_with_token = "<ghibli-style> " + description

        base_filename = os.path.splitext(os.path.basename(image_path))[0]
        output_txt_path = os.path.join(output_description_dir, base_filename + ".txt")

        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(description_with_token)

        # Keep the image next to its caption so the output folder is a
        # self-contained image/text dataset.
        output_image_path = os.path.join(
            output_description_dir, os.path.basename(image_path)
        )
        shutil.copy(image_path, output_image_path)

print("Done.")

In [9]:
!wget -q https://raw.githubusercontent.com/huggingface/diffusers/main/examples/text_to_image/train_text_to_image_lora.py

In [None]:
import os
import json

# Folder holding the copied images and their .txt captions; metadata.jsonl is
# written into the same folder.
dataset_folder = "./ghibli_style_descriptions"
metadata_file = os.path.join(dataset_folder, "metadata.jsonl")
# Defined here but not referenced anywhere in this cell.
style_token = "<ghibli-style>"


def _collect_caption_entries(folder):
    """Pair every image in ``folder`` with its same-named ``.txt`` caption.

    Args:
        folder: Directory containing images and caption text files.

    Returns:
        A list of ``{"file_name": ..., "text": ...}`` dicts, one per image
        that has a readable caption file, ordered by file name.

    Raises:
        FileNotFoundError: If ``folder`` does not exist (from ``os.listdir``).
    """
    entries = []
    # Sorted so the metadata rows come out in the same order on every run;
    # os.listdir alone returns entries in arbitrary order.
    for filename in sorted(os.listdir(folder)):
        # NOTE(review): the captioning cell also accepts .bmp/.gif images,
        # which this filter leaves out — confirm that is intended.
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        text_name = os.path.splitext(filename)[0] + ".txt"
        text_path = os.path.join(folder, text_name)
        if not os.path.exists(text_path):
            print(f"Warning: No matching text file found for {filename}. Skipping.")
            continue

        try:
            with open(text_path, "r", encoding="utf-8") as f:
                caption = f.read().strip()
        except (OSError, UnicodeDecodeError) as e:
            # Only read/decoding failures are expected here; report and move
            # on so one bad caption file does not abort the whole run.
            print(f"Error processing text file {text_name}: {e}")
            continue

        entries.append({"file_name": filename, "text": caption})
    return entries


data_entries = _collect_caption_entries(dataset_folder)

# One JSON object per line, with the keys "file_name" and "text".
with open(metadata_file, "w", encoding="utf-8") as f:
    for entry in data_entries:
        f.write(json.dumps(entry) + "\n")

print(f"Created metadata.jsonl with {len(data_entries)} entries at {metadata_file}")

Created metadata.jsonl with 2500 entries at ./ghibli_style_descriptions/metadata.jsonl


In [20]:
!accelerate launch train_text_to_image_lora.py \
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
  --train_data_dir="/home/ubuntu/oracle/Research-Papers-Implementation/diffusion_models_learn/basics/ghibli_style_descriptions" \
  --caption_column="text" \
  --resolution=512 \
  --center_crop \
  --random_flip \
  --train_batch_size=4 \
  --gradient_accumulation_steps=1 \
  --max_train_steps=2500 \
  --learning_rate=1e-4 \
  --max_grad_norm=1.0 \
  --lr_scheduler="cosine" \
  --output_dir="lora-ghibli-finetuned" \
  --seed=42 \
  --mixed_precision="bf16" \
  --report_to="wandb" \
  --checkpointing_steps=500 \
  --rank=64 \
  --validation_prompt="a landscape with trees and hills, <ghibli-style>"


[2025-05-07 19:08:30,632] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Multiple distributions found for package optimum. Picked distribution: optimum-quanto
05/07/2025 19:08:33 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: bf16

{'prediction_type', 'variance_type', 'clip_sample_range', 'dynamic_thresholding_ratio', 'timestep_spacing', 'rescale_betas_zero_snr', 'thresholding', 'sample_max_value'} was not found in config. Values will be initialized to default values.
{'use_post_quant_conv', 'use_quant_conv', 'latents_std', 'shift_factor', 'force_upcast', 'scaling_factor', 'latents_mean', 'mid_block_add_attention'} was not found in config. Values will be initialized to default values.
All model checkpoint weights were used when initializing AutoencoderKL.

All the weights of AutoencoderKL were initialized from the model checkpoint at runwayml/stab

In [21]:
import torch
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image
import os

# --- Configuration -----------------------------------------------------------
base_model_path = "runwayml/stable-diffusion-v1-5"
# Output directory of the LoRA training run.
lora_weights_path = "./lora-ghibli-finetuned"

input_image = "test.jpg"
output_image_path = "ghibli-test.jpg"
style_token = "<ghibli-style>"

# NOTE(review): "women" looks like a typo for "woman" — confirm before changing,
# since the text is fed to the model as-is.
base_prompt = "A women with a horse and other animals"

# With strength 0.8 and 50 requested steps, the progress bar in this cell's
# output shows 40 denoising steps being run.
conversion_strength = 0.8
inference_steps = 50
guidance_scale = 7.5
model_input_resolution = 512
# Fixed seed so the same input and prompt give the same image on a re-run.
generation_seed = 42

# --- Pipeline ----------------------------------------------------------------
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
).to("cuda")

pipe.load_lora_weights(lora_weights_path)

# --- Input image -------------------------------------------------------------
# convert() returns a new in-memory image, so the file can be closed right away.
with Image.open(input_image) as source_image:
    image = source_image.convert("RGB")
# Resized straight to a square; a non-square input is stretched, not cropped.
image = image.resize((model_input_resolution, model_input_resolution))

# Final prompt: the description followed by the style token.
prompt = f"{base_prompt}, {style_token}"

# --- Generation --------------------------------------------------------------
generator = torch.Generator(device="cuda").manual_seed(generation_seed)

output_image = pipe(
    prompt=prompt,
    image=image,
    strength=conversion_strength,
    guidance_scale=guidance_scale,
    num_inference_steps=inference_steps,
    generator=generator,
).images[0]

output_image.save(output_image_path)


[2025-05-07 19:47:41,468] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/envs/idm/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/envs/idm/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/opt/conda/envs/idm/compiler_compat/ld: /usr/local/lib/libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/opt/conda/envs/idm/compiler_compat/ld: /usr/local/lib/libstdc++.so.6: undefined reference to `fegetround@GLIBC_2.2.5'
collect2: error: ld returned 1 exit status


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/40 [00:00<?, ?it/s]