In [1]:
!pip install tensorflow keras numpy pandas matplotlib tqdm gTTS opencv-python pillow

Collecting tensorflow
  Using cached tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Using cached tensorflow-2.18.0-cp311-cp311-win_amd64.whl (7.5 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
   ---------------------------------------- 0.0/39.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/39.5 MB ? eta -:--:--
    --------------------------------------- 0.5/39.5 MB 1.5 MB/s eta 0:00:26
    --------------------------------------- 0.8/39.5 MB 1.5 MB/s eta 0:00:27
   - -------------------------------------- 1.0/39.5 MB 1.4 MB/s eta 0:00:28
   - -------------------------------------- 1.3/39.5 MB 1.3 MB/s eta 0:00:31
   - -------------------------------------- 1.3/39.5 MB 1.3 MB/s eta 0:00:31
   - -------------------------------------- 1.6/39.5 MB 1.1 MB/s eta 0:00:34
   - -------------------------------------- 1.6/39.5 MB 1.1

In [6]:
DATASET_PATH = r"G:\coco-2017\coco2017\train2017"
ANNOTATIONS_FILE = r"G:\coco-2017\coco2017\annotations\captions_train2017.json"

# Check if paths exist
print("Train Images Folder Exists:", os.path.exists(DATASET_PATH))
print("Annotations File Exists:", os.path.exists(ANNOTATIONS_FILE))


Train Images Folder Exists: True
Annotations File Exists: True


In [7]:
with open(ANNOTATIONS_FILE, 'r') as f:
    annotations = json.load(f)

# Create a dictionary to map images to their captions
image_captions = {}
for annot in annotations['annotations']:
    image_id = annot['image_id']
    image_filename = f"{image_id:012d}.jpg"
    caption = annot['caption']

    if image_filename not in image_captions:
        image_captions[image_filename] = []
    image_captions[image_filename].append(caption)

print("Total Images with Captions:", len(image_captions))

Total Images with Captions: 118287


In [8]:
import os
import json

# Define dataset paths
DATASET_PATH = r"G:\coco-2017\coco2017\train2017"
ANNOTATIONS_FILE = r"G:\coco-2017\coco2017\annotations\captions_train2017.json"

# Check if paths exist
print("Train Images Folder Exists:", os.path.exists(DATASET_PATH))
print("Annotations File Exists:", os.path.exists(ANNOTATIONS_FILE))

# Load annotations
with open(ANNOTATIONS_FILE, 'r') as f:
    annotations = json.load(f)

# Check sample annotations
print("Total annotations:", len(annotations['annotations']))
print("Sample annotation:", annotations['annotations'][0])


Train Images Folder Exists: True
Annotations File Exists: True
Total annotations: 591753
Sample annotation: {'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}


In [10]:
import string
from pickle import dump

# Dictionary to store image-caption mappings
image_captions = {}

for annot in annotations['annotations']:
    image_id = annot['image_id']
    image_filename = f"{image_id:012d}.jpg"
    caption = annot['caption']

    # Clean text (lowercase, remove punctuation)
    caption = caption.lower().translate(str.maketrans("", "", string.punctuation))

    if image_filename not in image_captions:
        image_captions[image_filename] = []
    image_captions[image_filename].append(caption)

# Save cleaned captions
dump(image_captions, open("captions.pkl", "wb"))
print("Captions cleaned and saved!")


Captions cleaned and saved!


In [14]:
import torch.nn as nn

class ImageCaptioningModel(nn.Module):
    def __init__(self, vocab_size, embed_size=256, hidden_size=512):
        super(ImageCaptioningModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        captions = self.embedding(captions)
        lstm_input = torch.cat((features.unsqueeze(1), captions), dim=1)
        output, _ = self.lstm(lstm_input)
        output = self.fc(output)
        return output

# Example Model Initialization
vocab_size = 5000  # Adjust as needed
model = ImageCaptioningModel(vocab_size)
print(model)


ImageCaptioningModel(
  (embedding): Embedding(5000, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=5000, bias=True)
)


In [19]:
import torch.optim as optim

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example Training Loop
num_epochs = 5
for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Example inputs
    sample_features = torch.randn(32, 512)  # Replace with real feature vectors
    sample_captions = torch.randint(0, vocab_size, (32, 20))  # Random captions

    optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}]")


Epoch [1/5]
Epoch [2/5]
Epoch [3/5]
Epoch [4/5]
Epoch [5/5]


In [21]:
import torch
import torchvision

print("Torch Version:", torch.__version__)
print("Torchvision Version:", torchvision.__version__)


Torch Version: 2.6.0+cpu
Torchvision Version: 0.21.0+cpu


In [22]:
print("Is CUDA Available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


Is CUDA Available: False
Device: cpu


In [24]:
image_path = r"G:\coco-2017\coco2017\train2017\000000040204.jpg"  # Use an actual image file

from PIL import Image
image = Image.open(image_path)
image.show()


In [25]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms

# Load EfficientNet model (pretrained on ImageNet)
model = models.efficientnet_b0(pretrained=True)
model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove last classification layer
model.eval()  # Set to evaluation mode

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((299, 299)),  # Resize like Xception
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


In [26]:
from PIL import Image
import numpy as np

def extract_features(image_path):
    image = Image.open(image_path).convert("RGB")  # Ensure RGB format
    image = transform(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        features = model(image)
    return features.squeeze().numpy()  # Convert to NumPy array

# Extract features for your image
features = extract_features(image_path)
print("Feature Vector Shape:", features.shape)


Feature Vector Shape: (1280,)


In [27]:
import json

ANNOTATIONS_FILE = r"G:\coco-2017\coco2017\annotations\captions_train2017.json"

# Load captions
with open(ANNOTATIONS_FILE, 'r') as f:
    annotations = json.load(f)

# Print first annotation
print(annotations["annotations"][0])


{'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}


In [28]:
# Convert image filename to image_id
image_id = int(image_path.split("\\")[-1].split(".")[0])  # Extract image ID from filename

# Get all captions for this image
image_captions = [annot["caption"] for annot in annotations["annotations"] if annot["image_id"] == image_id]

print("Captions for this image:", image_captions)


Captions for this image: ['a red and white plane is taking off ', 'A airplane that is flying in the sky.', 'A passenger airplane flies low to the ground.', 'A large plane taking off from a runway.', 'An airplane just taking off from the airport']


In [29]:
import torch.nn as nn

class ImageCaptioningModel(nn.Module):
    def __init__(self, vocab_size, embed_size=256, hidden_size=512):
        super(ImageCaptioningModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        captions = self.embedding(captions)
        lstm_input = torch.cat((features.unsqueeze(1), captions), dim=1)
        output, _ = self.lstm(lstm_input)
        output = self.fc(output)
        return output

# Example Model Initialization
vocab_size = 5000  # Adjust based on tokenizer
model = ImageCaptioningModel(vocab_size)
print(model)


ImageCaptioningModel(
  (embedding): Embedding(5000, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=5000, bias=True)
)


In [51]:
import torch

# Set the correct model path
model_path = r"G:\image_generation_caption.pth"  # Use the correct filename

# Load the trained model
model = ImageCaptioningModel(vocab_size=5000)  # Ensure the vocab size matches your training
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

print("Model loaded successfully!")


Model loaded successfully!


In [53]:
from torchvision.models import EfficientNet_B0_Weights

# Load EfficientNet for feature extraction (Updated method)
cnn_model = models.efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
cnn_model = torch.nn.Sequential(*list(cnn_model.children())[:-1])  # Remove classification layer
cnn_model.eval()  # Set to evaluation mode

# Define image transformation pipeline
transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Function to extract image features
def extract_features(image_path):
    image = Image.open(image_path).convert("RGB")  # Ensure RGB format
    image = transform(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        features = cnn_model(image)
    return features.squeeze().detach()


In [54]:
import os

# Check if tokenizer.pkl exists
tokenizer_path = r"G:\tokenizer.pkl"
print("Tokenizer exists:", os.path.exists(tokenizer_path))

# List all files in G:\
print("Files in G:\\", os.listdir("G:\\"))


Tokenizer exists: True
Files in G:\ ['$RECYCLE.BIN', '.ipynb_checkpoints', 'captions.pkl', 'coco-2017', 'coco-2017.zip', 'image_caption.ipynb', 'image_generation_caption.pth', 'image_open.ipynb', 'image_voice.ipynb', 'ml_dataset.csv', 'System Volume Information', 'tokenizer.pkl', 'Untitled.ipynb', 'uploading_dataset.ipynb']


In [57]:
import nltk
from nltk.tokenize import word_tokenize
import pickle
import os

# Download required dataset for tokenization
nltk.download('punkt')

# Sample dataset
training_captions = [
    "a man is surfing on a big wave",
    "a dog is playing in the park",
    "a group of people are riding bicycles"
]

# Tokenize each caption
tokenized_captions = [word_tokenize(sentence.lower()) for sentence in training_captions]

# Build vocabulary
word_to_index = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}  # Special tokens
for caption in tokenized_captions:
    for word in caption:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)

# Save the tokenizer
tokenizer_path = r"G:\tokenizer.pkl"
with open(tokenizer_path, "wb") as f:
    pickle.dump(word_to_index, f)

print(f"✅ Tokenizer saved successfully at {tokenizer_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...


✅ Tokenizer saved successfully at G:\tokenizer.pkl


[nltk_data]   Package punkt is already up-to-date!


In [58]:
# Correct tokenizer path
tokenizer_path = r"G:\tokenizer.pkl"

# Load the tokenizer
with open(tokenizer_path, "rb") as f:
    tokenizer = pickle.load(f)

print("✅ Tokenizer loaded successfully!")


✅ Tokenizer loaded successfully!


In [59]:
def simple_tokenizer(text):
    return text.lower().split()  # Simple split by spaces

# Example
training_captions = [
    "A man is surfing on a big wave",
    "A dog is playing in the park",
    "A group of people are riding bicycles"
]

tokenized_captions = [simple_tokenizer(sentence) for sentence in training_captions]

print("Tokenized Captions:", tokenized_captions)


Tokenized Captions: [['a', 'man', 'is', 'surfing', 'on', 'a', 'big', 'wave'], ['a', 'dog', 'is', 'playing', 'in', 'the', 'park'], ['a', 'group', 'of', 'people', 'are', 'riding', 'bicycles']]
