In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import DataLoader, Dataset
import numpy as np
from PIL import Image
import os
from collections import Counter
from nltk.tokenize import word_tokenize
import random
import pandas as pd

In [2]:
# Step 1: Load Raw Data for Vocabulary
csv_file = '/kaggle/input/faceattdb/final_version.csv'
image_folder = '/kaggle/input/faceattdb/images/'

data = pd.read_csv(csv_file)
captions = data['description'].tolist()  # Extract captions from the CSV

In [3]:
# Dataset Preparation

class ImageCaptionDataset(Dataset):
    def __init__(self, csv_file, image_folder, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.transform = transform

        # Tokenize captions and build vocabulary
        self.vocab = self.build_vocab(self.data['description'].tolist())
        self.tokenized_captions = [self.caption_to_seq(caption) for caption in self.data['description']]

    def build_vocab(self, captions):
        tokens = []
        for caption in captions:
            tokens.extend(word_tokenize(caption.lower()))
        counter = Counter(tokens) #Count how many times a word appeaared in token list Ex: {'a': 10, 'man': 5, 'with': 4, 'dog': 3, ...}
        vocab = {word: idx + 2 for idx, (word, _) in enumerate(counter.items())} #Each word getting unique index st. from 2 (.items() will help in getting unique words)
        vocab["<PAD>"] = 0
        vocab["<SOS>"] = 1
        vocab["<EOS>"] = len(vocab)
        return vocab

    def caption_to_seq(self, caption):
        tokens = word_tokenize(caption.lower())
        seq = [self.vocab["<SOS>"]] + [self.vocab.get(token, self.vocab["<PAD>"]) for token in tokens] + [self.vocab["<EOS>"]]
        return seq

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_folder, self.data.iloc[idx,0])
        image = Image.open(img_name).convert('RGB')
        caption = self.tokenized_captions[idx]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(caption)


In [4]:
# Image Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [5]:
# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 3
LEARNING_RATE = 0.001
EPOCHS = 30
MAX_SEQ_LENGTH = 40

In [6]:
# Load Dataset
dataset =ImageCaptionDataset(csv_file="/kaggle/input/faceattdb/final_version.csv", image_folder="/kaggle/input/faceattdb/images", transform=transform)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda x: zip(*x))


In [7]:
# Model Definition
class CNNEncoder(nn.Module):
    def __init__(self, embed_size):
        super(CNNEncoder, self).__init__()
        resnet = models.resnet50(pretrained=True)
        self.cnn = nn.Sequential(*list(resnet.children())[:-1])  # Remove FC layer
        self.fc = nn.Linear(resnet.fc.in_features, embed_size)  # Access the in_features from the ResNet FC layer
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, images):
        features = self.cnn(images).squeeze(-1).squeeze(-1)  # Flatten AdaptiveAvgPool2d output
        features = self.fc(features)  # Map to the embedding size
        return self.relu(self.dropout(features))

In [8]:
class LSTMDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(LSTMDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, features, captions):
        embeddings = self.embed(captions[:, :-1])  # Skip <EOS>
        inputs = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        hiddens, _ = self.lstm(inputs)
        outputs = self.fc(hiddens)
        return outputs

In [9]:
#combined model
class ImageCaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        features = self.encoder(images)  # Extract image features
        outputs = self.decoder(features, captions)  # Generate captions
        return outputs


In [10]:
len(dataset.vocab)

537

In [11]:
# Initialize Model
vocab_size = len(dataset.vocab)
encoder = CNNEncoder(embed_size=EMBED_SIZE).to(device)
decoder = LSTMDecoder(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, vocab_size=vocab_size, num_layers=NUM_LAYERS).to(device)
model = ImageCaptioningModel(encoder, decoder).to(device)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 213MB/s]


In [12]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [13]:
# Training Loop
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0

    for images, captions in dataloader:
        images = torch.stack(images).to(device)
        captions = nn.utils.rnn.pad_sequence(captions, batch_first=True, padding_value=dataset.vocab["<PAD>"]).to(device)   #Padding for same length in tensor

        # Forward pass
        outputs = model(images, captions)
        targets = captions[:, 1:]  # Skip <SOS>
        outputs = outputs[:, :targets.shape[1], :]  # Align sequence lengths

        # Compute loss
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
        epoch_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{EPOCHS}], Loss: {epoch_loss / len(dataloader):.4f}")

Epoch [1/30], Loss: 4.3802
Epoch [2/30], Loss: 3.9387
Epoch [3/30], Loss: 3.8662
Epoch [4/30], Loss: 3.8060
Epoch [5/30], Loss: 3.5882
Epoch [6/30], Loss: 3.2559
Epoch [7/30], Loss: 2.8865
Epoch [8/30], Loss: 2.5667
Epoch [9/30], Loss: 2.3596
Epoch [10/30], Loss: 2.2294
Epoch [11/30], Loss: 2.1271
Epoch [12/30], Loss: 2.0412
Epoch [13/30], Loss: 1.9688
Epoch [14/30], Loss: 1.9033
Epoch [15/30], Loss: 1.8405
Epoch [16/30], Loss: 1.7763
Epoch [17/30], Loss: 1.7263
Epoch [18/30], Loss: 1.6736
Epoch [19/30], Loss: 1.6338
Epoch [20/30], Loss: 1.5610
Epoch [21/30], Loss: 1.5134
Epoch [22/30], Loss: 1.4683
Epoch [23/30], Loss: 1.4146
Epoch [24/30], Loss: 1.3665
Epoch [25/30], Loss: 1.3185
Epoch [26/30], Loss: 1.2719
Epoch [27/30], Loss: 1.2225
Epoch [28/30], Loss: 1.1826
Epoch [29/30], Loss: 1.1175
Epoch [30/30], Loss: 1.0708


In [14]:
def generate_caption(image, model, vocab, max_length=20):
    
    model.eval()
    with torch.no_grad():
        # Extract features using the encoder part of the model
        features = model.encoder(image.unsqueeze(0).to(device))
        
        # Initialize caption generation with the <SOS> token
        caption = [vocab["<SOS>"]]
        
        for _ in range(max_length):
            # Prepare input: current caption sequence
            inputs = torch.tensor(caption).unsqueeze(0).to(device)
            
            # Generate the next word using the decoder
            outputs = model.decoder(features, inputs)
            predicted = outputs.argmax(2)[:, -1].item()  # Get the index of the most probable word
            
            # Stop if the <EOS> token is generated
            if predicted == vocab["<EOS>"]:
                break
            
            # Append the predicted word to the caption sequence
            caption.append(predicted)
        
        # Convert token indices back to words
        caption_tokens = [k for k, v in vocab.items() if v in caption and v not in {vocab["<SOS>"], vocab["<EOS>"]}]
        return " ".join(caption_tokens)

# Test with an image
test_image = transform(Image.open("/kaggle/input/face-img/picture.jpeg").convert('RGB'))

# Generate a caption
generated_caption = generate_caption(test_image, model, dataset.vocab)
print("Generated Caption:", generated_caption)

Generated Caption: this an attractive with hair is straight and , young male oval


In [15]:
# Save the model's state dictionary
torch.save(model.state_dict(), "ImageCaptioning.pth")

## Now Matching with the closest celebrity Bollywood

In [16]:
from tqdm import tqdm

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
# 1. Feature Extraction Function
def extract_features(image_path, model, transform):

    model.eval()
    image = Image.open(image_path).convert("RGB")
    image_tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = model.encoder(image_tensor).squeeze(0)
        features = features / features.norm(p=2)  # Normalize the feature vector
    return features

In [18]:
# 2. Build Celebrity Database
def build_celebrity_database(dataset_path, model, transform):

    celebrity_features = {}
    # Traverse through all parent folders
    for parent_folder in os.listdir(dataset_path):
        parent_path = os.path.join(dataset_path, parent_folder)
        if not os.path.isdir(parent_path):
            continue

        # Traverse through celebrity folders in each parent folder
        for celebrity_name in os.listdir(parent_path):
            celebrity_folder = os.path.join(parent_path, celebrity_name)
            if not os.path.isdir(celebrity_folder):
                continue

            # Extract features for all images of this celebrity
            all_features = []
            for image_name in os.listdir(celebrity_folder):
                image_path = os.path.join(celebrity_folder, image_name)
                try:
                    features = extract_features(image_path, model, transform)
                    all_features.append(features)
                except Exception as e:
                    print(f"Skipping {image_path}: {e}")

            # Compute the average feature vector for the celebrity
            if all_features:
                average_features = torch.stack(all_features).mean(dim=0)
                celebrity_features[celebrity_name] = average_features

    return celebrity_features

In [19]:
# 3. Match Against Celebrity Database
def match_with_celebrity(test_features, celebrity_features):

    best_match = None
    highest_similarity = -1

    for name, features in celebrity_features.items():
        similarity = torch.dot(test_features, features).item()  # Cosine similarity
        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = name

    return best_match, highest_similarity

In [20]:
dataset_path = "/kaggle/input/bollywood-celeb-localized-face-dataset/Bollywood_celeb_face_localized"

In [21]:
 # Initialize and load the trained model
vocab_size = len(dataset.vocab)
encoder = CNNEncoder(EMBED_SIZE).to(device)
decoder = LSTMDecoder(EMBED_SIZE, HIDDEN_SIZE, vocab_size, NUM_LAYERS).to(device)
model = ImageCaptioningModel(encoder, decoder).to(device)
model.load_state_dict(torch.load("/kaggle/input/face-image-captioning/pytorch/default/1/ImageCaptioning .pth"))
model.eval()

  model.load_state_dict(torch.load("/kaggle/input/face-image-captioning/pytorch/default/1/ImageCaptioning .pth"))


ImageCaptioningModel(
  (encoder): CNNEncoder(
    (cnn): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True

In [22]:
# Build the celebrity database
print("Building celebrity database...")
celebrity_features = build_celebrity_database(dataset_path, model, transform)

Building celebrity database...


In [23]:
 # Test image path
test_image_path = "/kaggle/input/face-img/picture.jpeg"
print("Matching test image...")
test_features = extract_features(test_image_path, model, transform)
match_name, similarity = match_with_celebrity(test_features, celebrity_features)

print(f"Matched Celebrity: {match_name}")
print(f"Similarity Score: {similarity:.4f}")

Matching test image...
Matched Celebrity: Preity_Zinta
Similarity Score: 1.0000


In [24]:
import pickle

def save_celebrity_features(celebrity_features, save_path):
    with open(save_path, "wb") as f:
        pickle.dump(celebrity_features, f)
    print(f"Celebrity features saved to {save_path}")

In [25]:
def load_celebrity_features(load_path):
    with open(load_path, "rb") as f:
        celebrity_features = pickle.load(f)
    print(f"Celebrity features loaded from {load_path}")
    return celebrity_features


In [26]:
save_path = "/kaggle/working/celebrity_features.pkl"
save_celebrity_features(celebrity_features, save_path)

Celebrity features saved to /kaggle/working/celebrity_features.pkl
