In [13]:
from PIL import Image
from indicnlp.tokenize import indic_tokenize
import numpy as np
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
import pandas as pd

In [8]:
def load_telugu_captions(filepath):
    captions_dict = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            img_name, caption = parts[0].split("#")[0], parts[1]
            if img_name not in captions_dict:
                captions_dict[img_name] = []
            captions_dict[img_name].append(caption)
    return captions_dict

# Convert text to Unicode IDs (Simple Tokenizer Workaround)
def text_to_ids(text):
    tokens = tokenize_te(text)
    token_ids = [ord(char) for token in tokens for char in token]  # Unicode ID mapping
    return token_ids

In [9]:
class TeluguDataset:
    def __init__(self, df, tfms):
        self.df = df
        self.tfms = tfms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = self.df.iloc[idx, :]
        image_path = sample["image"]
        caption = sample["caption"]

        # Load and transform image
        image = Image.open(image_path).convert("RGB")
        image = np.array(image)
        image = self.tfms(image=image)["image"]

        # Process caption
        caption = f"{caption} <|endoftext|>"
        input_ids = text_to_ids(caption)
        labels = input_ids.copy()
        labels[:-1] = input_ids[1:]

        return image, input_ids, labels

In [10]:
def collate_fn(batch):
    images = [i[0] for i in batch]
    input_ids = [i[1] for i in batch]
    labels = [i[2] for i in batch]

    images = torch.stack(images, dim=0)
    max_len = max(len(seq) for seq in input_ids)

    # Padding
    input_ids_padded = torch.full((len(batch), max_len), fill_value=0, dtype=torch.long)
    labels_padded = torch.full((len(batch), max_len), fill_value=-100, dtype=torch.long)

    for i, (inp, lbl) in enumerate(zip(input_ids, labels)):
        input_ids_padded[i, : len(inp)] = torch.tensor(inp, dtype=torch.long)
        labels_padded[i, : len(lbl)] = torch.tensor(lbl, dtype=torch.long)

    return images, input_ids_padded, labels_padded


In [14]:
caption_path = "D:/ict/Data/fl8telugu.txt"
telugu_captions = load_telugu_captions(caption_path)

# Convert to DataFrame
df = pd.DataFrame(
    [{"image": f"D:/ict/Data/Images/{img}", "caption": caption} for img, captions in telugu_captions.items() for caption in captions]
)

# Define Transformations
train_tfms = A.Compose([
    A.HorizontalFlip(),
    A.RandomBrightnessContrast(),
    A.ColorJitter(),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.3, rotate_limit=45, p=0.5),
    A.HueSaturationValue(p=0.3),
    A.Resize(224, 224),
    A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], always_apply=True),
    ToTensorV2()
])


  original_init(self, **validated_kwargs)
  A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], always_apply=True),


In [15]:
telugu_captions = load_telugu_captions("D:/ict/Data/fl8telugu.txt")

# Convert to DataFrame
df = pd.DataFrame(
    [{"image": img, "caption": caption} for img, captions in telugu_captions.items() for caption in captions]
)

print("Loaded Telugu Captions:", df.head()) 

Loaded Telugu Captions:                        image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  గులాబీ రంగు దుస్తులు ధరించిన పిల్లవాడు ప్రవేశ ...  
1               ఒక చెక్క భవనంలోకి వెళుతున్న అమ్మాయి.  
2       ఒక చిన్న అమ్మాయి చెక్క ప్లేహౌస్ పైకి ఎక్కడం.  
3     ఒక చిన్న అమ్మాయి తన ప్లేహౌస్కు మెట్లు ఎక్కేది.  
4  గులాబీ రంగు దుస్తులు ధరించిన ఒక చిన్న అమ్మాయి ...  


In [16]:
dataset = TeluguDataset(df, train_tfms)

# Test Preprocessing
sample_image, sample_input_ids, sample_labels = dataset[0]

print("Sample Image Shape:", sample_image.shape)
print("Sample Token IDs:", sample_input_ids)
print("Sample Labels:", sample_labels)

FileNotFoundError: [Errno 2] No such file or directory: '1000268201_693b08cb0e.jpg'