In [1]:
import pandas as pd
from diffusers import DiffusionPipeline, UNet2DConditionModel, DDPMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm

# Step 1: Load your dataset (assumes the CSV has a column named 'prompt')
class PromptDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.prompts = self.data['Prompt']

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        return self.prompts.iloc[idx]

# Load your prompt dataset
dataset = PromptDataset("Book12.csv")
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Step 2: Load the diffusion model components
pipeline = DiffusionPipeline.from_pretrained("prompthero/openjourney")
unet = pipeline.unet
text_encoder = pipeline.text_encoder
tokenizer = pipeline.tokenizer
scheduler = pipeline.scheduler

# Freeze all parameters of the text encoder and U-Net (optional, only if you want to fine-tune specific parts)
for param in unet.parameters():
    param.requires_grad = False  

for param in text_encoder.parameters():
    param.requires_grad = False  
    
# Step 3: Set up the optimizer
optimizer = AdamW(list(unet.parameters()) + list(text_encoder.parameters()), lr=5e-5)



Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [3]:
# Step 4: Training loop
device = "cuda" if torch.cuda.is_available() else "cpu"
unet.to(device)
text_encoder.to(device)

num_epochs = 5

for epoch in range(num_epochs):
    for batch in tqdm(dataloader):
        # Tokenize the prompts
        inputs = tokenizer(list(batch), return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(device)

        # Encode the text using the text encoder
        encoder_hidden_states = text_encoder(input_ids)[0]

        ## Generate random noise for diffusion (dummy noise, as you are not generating images)
        noise = torch.randn((input_ids.shape[0], unet.in_channels, 64, 64), requires_grad=True).to(device)


        # Get random timesteps for each sample in the batch
        timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (input_ids.shape[0],), device=device).long()

        # Get model output by passing noise, timesteps, and encoder hidden states
        model_output = unet(noise, timesteps, encoder_hidden_states).sample

        # Define a dummy loss (MSE with random noise for simplicity)
        loss = torch.nn.functional.mse_loss(model_output, noise)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs} completed.")




  noise = torch.randn((input_ids.shape[0], unet.in_channels, 64, 64), requires_grad=True).to(device)
100%|██████████| 3/3 [05:42<00:00, 114.31s/it]


Epoch 1/5 completed.


100%|██████████| 3/3 [06:05<00:00, 121.72s/it]


Epoch 2/5 completed.


100%|██████████| 3/3 [05:32<00:00, 110.93s/it]


Epoch 3/5 completed.


100%|██████████| 3/3 [06:06<00:00, 122.17s/it]


Epoch 4/5 completed.


100%|██████████| 3/3 [05:30<00:00, 110.09s/it]


Epoch 5/5 completed.
Fine-tuning completed and model saved.


In [5]:
# Step 5: Save the fine-tuned model
unet.save_pretrained(r"C:\Users\siddh\Downloads\Case_Study_2")
text_encoder.save_pretrained(r"C:\Users\siddh\Downloads\Case_Study_2")
tokenizer.save_pretrained(r"C:\Users\siddh\Downloads\Case_Study_2")

print("Fine-tuning completed and model saved.")


Fine-tuning completed and model saved.
