In [40]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.ops import roi_align
from torchvision import transforms
from PIL import Image
import os
import numpy as np
from tqdm import tqdm

# Use pretrained Faster R-CNN
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True).eval().cuda()

# Transform
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

# Paths
image_dir = "/home/vitoupro/code/image_captioning/data/raw/animals"
output_dir = "/home/vitoupro/code/image_captioning/region_features"
os.makedirs(output_dir, exist_ok=True)

# Process all images
for cls_folder in os.listdir(image_dir):
    full_cls_path = os.path.join(image_dir, cls_folder)
    for img_name in tqdm(os.listdir(full_cls_path), desc=f"Processing {cls_folder}"):
        img_path = os.path.join(full_cls_path, img_name)
        image = Image.open(img_path).convert("RGB")
        img_tensor = transform(image).unsqueeze(0).cuda()

        with torch.no_grad():
            # Get region proposals (boxes)
            detections = model(img_tensor)[0]
            boxes = detections["boxes"]
            scores = detections["scores"]

            keep = scores > 0.5
            if keep.sum() == 0:
                boxes = boxes[:36]
            else:
                boxes = boxes[keep][:36]


            if boxes.size(0) == 0:
                print(f"[!] Skipping {img_name} due to no confident regions")
                continue

            # Get feature map from backbone
            features = model.backbone(img_tensor.tensors if hasattr(img_tensor, 'tensors') else img_tensor)["0"]  # [B, 256, H, W]

            # Prepare RoIs in format (image_idx, x1, y1, x2, y2)
            image_indices = torch.zeros((boxes.shape[0], 1), device=boxes.device)
            rois = torch.cat([image_indices, boxes], dim=1)

            # RoI Align to extract region features
            region_feats = roi_align(features, rois, output_size=(7, 7), spatial_scale=1/4, aligned=True)  # [N, 256, 7, 7]
            region_feats = torch.nn.functional.adaptive_avg_pool2d(region_feats, (1, 1))  # [N, 256, 1, 1]
            region_feats = region_feats.view(region_feats.size(0), -1)  # [N, 256]
            region_feats = region_feats.cpu().numpy()

        # Save .npy file
        save_name = f"{cls_folder}_{img_name.replace('.jpg', '')}.npy"
        save_path = os.path.join(output_dir, save_name)
        np.save(save_path, region_feats)


Processing wolf: 100%|██████████| 60/60 [00:03<00:00, 18.61it/s]
Processing deer: 100%|██████████| 60/60 [00:03<00:00, 19.32it/s]
Processing rhinoceros: 100%|██████████| 60/60 [00:03<00:00, 18.86it/s]
Processing raccoon: 100%|██████████| 60/60 [00:03<00:00, 18.99it/s]
Processing eagle: 100%|██████████| 60/60 [00:03<00:00, 19.10it/s]
Processing shark: 100%|██████████| 60/60 [00:03<00:00, 19.35it/s]
Processing leopard: 100%|██████████| 60/60 [00:03<00:00, 19.35it/s]
Processing flamingo: 100%|██████████| 60/60 [00:03<00:00, 18.93it/s]
Processing octopus: 100%|██████████| 60/60 [00:03<00:00, 19.32it/s]
Processing lizard: 100%|██████████| 60/60 [00:03<00:00, 19.81it/s]
Processing owl: 100%|██████████| 60/60 [00:03<00:00, 18.80it/s]
Processing horse: 100%|██████████| 60/60 [00:03<00:00, 19.27it/s]
Processing bee: 100%|██████████| 60/60 [00:03<00:00, 19.25it/s]
Processing penguin: 100%|██████████| 60/60 [00:03<00:00, 18.40it/s]
Processing pigeon: 100%|██████████| 60/60 [00:03<00:00, 18.72it/s

In [53]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import os, json, re
from sklearn.model_selection import train_test_split
import jiwer

# ==== Load Vocabulary ====
def load_vocabulary(path):
    with open(path, 'r') as file:
        idx2word = json.load(file)
    word2idx = {v: int(k) for k, v in idx2word.items()}
    return idx2word, word2idx

idx2word, word2idx = load_vocabulary('/home/vitoupro/code/image_captioning/data/processed/idx2word.json')

# ==== Khmer Encoding ====
def encode_khmer_word(word, word2idx):
    return [word2idx.get(ch, word2idx['<UNK>']) for ch in word], None

def decode_indices(indices, idx2word):
    return ''.join([idx2word.get(str(idx), '') for idx in indices]), None

# ==== Attention ====
class Attention(nn.Module):
    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        self.attn = nn.Linear(encoder_dim + decoder_dim, attention_dim)
        self.v = nn.Linear(attention_dim, 1)

    def forward(self, encoder_out, hidden):
        hidden = hidden.unsqueeze(1).repeat(1, encoder_out.size(1), 1)
        energy = torch.tanh(self.attn(torch.cat((encoder_out, hidden), dim=2)))
        alpha = torch.softmax(self.v(energy).squeeze(2), dim=1)
        context = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)
        return context, alpha

# ==== Decoder ====
class BottomUpDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, region_feat_size=256, attention_dim=36):
        super().__init__()
        self.attn = Attention(region_feat_size, hidden_size, attention_dim)  # region_feat_size = 2048
        self.lstm = nn.LSTM(embed_size + region_feat_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_h = nn.Linear(region_feat_size, hidden_size)  # 👈 region_feat_size must match actual input
        self.init_c = nn.Linear(region_feat_size, hidden_size)
        self.embed = nn.Embedding(vocab_size, embed_size)


    def forward(self, region_feats, captions, sampling_probability=1.0):
    # region_feats: [B, num_regions, 2048]
        embedded = self.embed(captions)  # [B, seq_len, embed_size]
          # 🔥 this should be [B, 2048]
        


    # Init LSTM hidden state using average of region features
        mean_feats = region_feats.mean(dim=1) 
       
        h = self.init_h(mean_feats).unsqueeze(0)  # [1, B, hidden]
        c = self.init_c(mean_feats).unsqueeze(0)

        outputs = []
        inputs = embedded[:, 0, :].unsqueeze(1)  # [B, 1, embed]

        for t in range(1, captions.size(1)):
            context, _ = self.attn(region_feats, h[-1])  # [B, 2048]
            lstm_input = torch.cat((inputs.squeeze(1), context), dim=1).unsqueeze(1)  # [B, 1, embed+2048]
            output, (h, c) = self.lstm(lstm_input, (h, c))  # output: [B, 1, hidden]
            output = self.linear(output.squeeze(1))  # [B, vocab_size]
            outputs.append(output)

            teacher_force = torch.rand(1).item() > sampling_probability
            top1 = output.argmax(1)
            inputs = embedded[:, t, :].unsqueeze(1) if teacher_force else self.embed(top1).unsqueeze(1)

        return torch.stack(outputs, dim=1)


# ==== Dataset ====
class BottomUpCaptionDataset(Dataset):
    def __init__(self, img_labels, feature_dir, vocab, max_length=20):
        self.img_labels = img_labels
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        row = self.img_labels.iloc[idx]
        image_path = row['image']
        class_name = os.path.dirname(image_path)
        file_name = os.path.basename(image_path).replace('.jpg', '')
        feature_filename = f"{class_name}_{file_name}.npy"
        feature_path = os.path.join(self.feature_dir, feature_filename)

        if not os.path.exists(feature_path):
            raise FileNotFoundError(f"Missing feature: {feature_path}")
        
        max_regions = 36
        region_feats = np.load(feature_path)  # shape: [N, 256] (N can vary)

        if len(region_feats.shape) == 1:
            region_feats = region_feats.reshape(1, -1)  # Fix shape if it's (256,)

        num_regions = region_feats.shape[0]

        if num_regions < max_regions:
            pad = np.zeros((max_regions - num_regions, region_feats.shape[1]), dtype=np.float32)
            region_feats = np.vstack((region_feats, pad))
        else:
            region_feats = region_feats[:max_regions]

        region_feats = torch.tensor(region_feats).float()


        
# Should be [36, 2048] or similar

        caption = row['caption']
        indices, _ = encode_khmer_word(caption, self.vocab)
        tokens = [self.vocab['<START>']] + indices + [self.vocab['<END>']]
        tokens += [self.vocab['<PAD>']] * (self.max_length - len(tokens))
        return region_feats, torch.tensor(tokens[:self.max_length])



# ==== Load and Split ====
all_df = pd.read_csv('/home/vitoupro/code/image_captioning/data/raw/annotation.txt', delimiter=' ', names=['image', 'caption'])
train_df, eval_df = train_test_split(all_df, test_size=0.2, random_state=42)
feature_dir = '/home/vitoupro/code/image_captioning/region_features'

train_loader = DataLoader(BottomUpCaptionDataset(train_df, feature_dir, word2idx), batch_size=32, shuffle=True)
eval_loader = DataLoader(BottomUpCaptionDataset(eval_df, feature_dir, word2idx), batch_size=32)

# ==== Model ====
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BottomUpDecoder(embed_size=256, hidden_size=512, vocab_size=len(word2idx)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ==== Training ====
def calculate_wer(gt, pred): return jiwer.wer(gt, pred)
def calculate_cer(gt, pred): return jiwer.cer(gt, pred)

def decode_for_metrics(tensor): return decode_indices(tensor.tolist(), idx2word)[0]

def evaluate(model, loader, epoch):
    model.eval()
    total_wer, total_cer, count = 0, 0, 0
    with torch.no_grad():
        for feats, captions in loader:
            feats, captions = feats.to(device), captions.to(device)
            outputs = model(feats, captions[:, :-1])
            preds = outputs.argmax(-1)
            for i in range(len(captions)):
                gt = decode_for_metrics(captions[i])
                pred = decode_for_metrics(preds[i])
                total_wer += calculate_wer(gt, pred)
                total_cer += calculate_cer(gt, pred)
                count += 1
    print(f"[EVAL] Epoch {epoch+1}: WER: {total_wer/count:.2f} CER: {total_cer/count:.2f}")




In [54]:
# ==== Loop ====
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for feats, captions in train_loader:
        feats, captions = feats.to(device), captions.to(device)
        outputs = model(feats, captions, sampling_probability=max(0.1, 1.0 - epoch * 0.05))
        loss = criterion(outputs.view(-1, len(word2idx)), captions[:, 1:].reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"[TRAIN] Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}")
    evaluate(model, eval_loader, epoch)

# ==== Save Weights ====
torch.save(model.state_dict(), 'bottomup_decoder.pth')

[TRAIN] Epoch 1: Loss = 2.9164
[EVAL] Epoch 1: WER: 1.00 CER: 0.52
[TRAIN] Epoch 2: Loss = 2.3474
[EVAL] Epoch 2: WER: 1.00 CER: 0.51
[TRAIN] Epoch 3: Loss = 1.9682
[EVAL] Epoch 3: WER: 1.00 CER: 0.53
[TRAIN] Epoch 4: Loss = 1.6077
[EVAL] Epoch 4: WER: 1.00 CER: 0.54
[TRAIN] Epoch 5: Loss = 1.3030
[EVAL] Epoch 5: WER: 1.00 CER: 0.56
[TRAIN] Epoch 6: Loss = 1.1124
[EVAL] Epoch 6: WER: 1.00 CER: 0.57
[TRAIN] Epoch 7: Loss = 0.8887
[EVAL] Epoch 7: WER: 1.00 CER: 0.53
[TRAIN] Epoch 8: Loss = 0.7512
[EVAL] Epoch 8: WER: 1.00 CER: 0.55
[TRAIN] Epoch 9: Loss = 0.7197
[EVAL] Epoch 9: WER: 1.00 CER: 0.58
[TRAIN] Epoch 10: Loss = 0.5668
[EVAL] Epoch 10: WER: 1.00 CER: 0.55
[TRAIN] Epoch 11: Loss = 0.5046
[EVAL] Epoch 11: WER: 1.00 CER: 0.55
[TRAIN] Epoch 12: Loss = 0.4806
[EVAL] Epoch 12: WER: 1.00 CER: 0.55
[TRAIN] Epoch 13: Loss = 0.3907
[EVAL] Epoch 13: WER: 1.00 CER: 0.55
[TRAIN] Epoch 14: Loss = 0.3617
[EVAL] Epoch 14: WER: 1.00 CER: 0.56
[TRAIN] Epoch 15: Loss = 0.2909
[EVAL] Epoch 15: WER

In [60]:
import torch
import torchvision
from torchvision.ops import roi_align
from torchvision import transforms
from PIL import Image
import numpy as np
import os

def extract_region_features(image_path, output_path, device='cuda', score_threshold=0.5, top_k=36):
    # Load pretrained Faster R-CNN
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True).eval().to(device)

    # Transform for input image
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ])

    # Load and transform image
    image = Image.open(image_path).convert("RGB")
    img_tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Get predictions
        detections = model(img_tensor)[0]
        boxes = detections["boxes"]
        scores = detections["scores"]

        # Keep high-score regions
        keep = scores > score_threshold
        if keep.sum() == 0:
            boxes = boxes[:top_k]
        else:
            boxes = boxes[keep][:top_k]

        if boxes.size(0) == 0:
            print(f"[!] Skipping {os.path.basename(image_path)} due to no confident regions")
            return False

        # Get backbone features
        features = model.backbone(img_tensor)["0"]  # Feature map: [B, 256, H, W]

        # Prepare RoIs: (image_idx, x1, y1, x2, y2)
        image_indices = torch.zeros((boxes.shape[0], 1), device=boxes.device)
        rois = torch.cat([image_indices, boxes], dim=1)

        # RoI Align: [N, 256, 7, 7]
        region_feats = roi_align(features, rois, output_size=(7, 7), spatial_scale=1/4, aligned=True)

        # Average pool: [N, 256]
        region_feats = torch.nn.functional.adaptive_avg_pool2d(region_feats, (1, 1))
        region_feats = region_feats.view(region_feats.size(0), -1)
        region_feats = region_feats.cpu().numpy()

    # Save features
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    np.save(output_path, region_feats)
    print(f"[✓] Saved: {output_path}")
    return True


# === Example usage ===
if __name__ == "__main__":
    image_path = "/home/vitoupro/code/image_captioning/data/00000001_020.jpg"
    output_path = "/home/vitoupro/code/image_captioning/region_features/00000001_020.jpg"

    extract_region_features(image_path, output_path)


[✓] Saved: /home/vitoupro/code/image_captioning/region_features/00000001_020.jpg


In [63]:
import torch
import torch.nn as nn
import numpy as np
import json
from torchvision import transforms
from PIL import Image


# --- Define Attention ---
class Attention(nn.Module):
    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(encoder_dim + decoder_dim, attention_dim)
        self.v = nn.Linear(attention_dim, 1)

    def forward(self, encoder_out, hidden):
        hidden = hidden.unsqueeze(1).repeat(1, encoder_out.size(1), 1)
        attn_input = torch.cat((encoder_out, hidden), dim=2)
        energy = torch.tanh(self.attn(attn_input))
        attention = self.v(energy).squeeze(2)
        alpha = torch.softmax(attention, dim=1)
        context = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)
        return context, alpha


# --- Bottom-Up Decoder ---
class BottomUpDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, region_feat_size=256, attention_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attn = Attention(region_feat_size, hidden_size, attention_dim)
        self.lstm = nn.LSTM(embed_size + region_feat_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_h = nn.Linear(region_feat_size, hidden_size)
        self.init_c = nn.Linear(region_feat_size, hidden_size)

    def forward(self, region_feats, captions, sampling_probability=1.0):
        raise NotImplementedError("This is for inference only")


# --- Vocabulary Loader ---
def load_vocabulary(path):
    with open(path, 'r') as file:
        idx2word = json.load(file)
    word2idx = {v: int(k) for k, v in idx2word.items()}
    return idx2word, word2idx


# --- Prediction Function ---
def predict_caption_bottomup(region_feat_path, decoder, device, idx2word, word2idx, max_length=20):
    decoder.eval()

    region_feats = np.load(region_feat_path)
    if len(region_feats.shape) == 1:
        region_feats = region_feats.reshape(1, -1)
    if region_feats.shape[0] < 36:
        pad = np.zeros((36 - region_feats.shape[0], region_feats.shape[1]), dtype=np.float32)
        region_feats = np.vstack((region_feats, pad))
    else:
        region_feats = region_feats[:36]

    region_feats = torch.tensor(region_feats, dtype=torch.float32).unsqueeze(0).to(device)

    h = decoder.init_h(region_feats.mean(1)).unsqueeze(0)
    c = decoder.init_c(region_feats.mean(1)).unsqueeze(0)

    input_idx = torch.tensor([word2idx['<START>']], dtype=torch.long).to(device)
    predictions = []

    for _ in range(max_length):
        embedded = decoder.embed(input_idx).unsqueeze(1)
        context, _ = decoder.attn(region_feats, h[-1])
        lstm_input = torch.cat((embedded.squeeze(1), context), dim=1).unsqueeze(1)
        output, (h, c) = decoder.lstm(lstm_input, (h, c))
        output = decoder.linear(output.squeeze(1))
        predicted_index = output.argmax(-1).item()

        if predicted_index == word2idx['<END>']:
            break

        predictions.append(idx2word[str(predicted_index)])
        input_idx = torch.tensor([predicted_index], dtype=torch.long).to(device)

    return ''.join(predictions)


# --- Main Execution ---
if __name__ == '__main__':
    # Config
    region_feat_path = '/home/vitoupro/code/image_captioning/region_features/00000001_020.jpg.npy'
    decoder_path = '/home/vitoupro/code/image_captioning/notebook/bottomup_decoder.pth'
    vocab_path = '/home/vitoupro/code/image_captioning/data/processed/idx2word.json'

    # Load vocab
    idx2word, word2idx = load_vocabulary(vocab_path)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Init model
    decoder = BottomUpDecoder(
        embed_size=256,
        hidden_size=512,
        vocab_size=len(word2idx),
        region_feat_size=256,
        attention_dim=36
    ).to(device)

    # Load weights
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    # Predict
    caption = predict_caption_bottomup(region_feat_path, decoder, device, idx2word, word2idx)
    print("Predicted Caption:", caption)


Predicted Caption: ហាមស្ទ័រ
