<a href="https://colab.research.google.com/github/Vivekdesai25/Evoastra_Internship/blob/main/Deep_Learning_(vivek)_Major_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ojaswi's part

## Step 1: Download and Set Up the MS COCO Dataset

In [None]:
# Download annotations zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

# Unzip it
!unzip annotations_trainval2017.zip -d coco_annotations

# Confirm the file is there
import os
print("Files in annotations folder:", os.listdir("coco_annotations/annotations"))


--2025-08-02 12:13:12--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.116.65, 16.15.193.148, 3.5.12.238, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.116.65|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2025-08-02 12:13:15 (96.2 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: coco_annotations/annotations/instances_train2017.json  
  inflating: coco_annotations/annotations/instances_val2017.json  
  inflating: coco_annotations/annotations/captions_train2017.json  
  inflating: coco_annotations/annotations/captions_val2017.json  
  inflating: coco_annotations/annotations/person_keypoints_train2017.json  
  inflating: coco_annotations/annotations/person_keypoints_val2017.json  
Files in annotat

## Step 2: Load & Parse the Caption File

In [None]:
import json
from collections import defaultdict

# Path to JSON caption file
caption_file_path = "coco_annotations/annotations/captions_train2017.json"

# Load JSON data
with open(caption_file_path, 'r') as f:
    data = json.load(f)

# Create dictionary to hold image_id to list of captions
captions_dict = defaultdict(list)

# Loop through all caption annotations
for annot in data['annotations']:
    image_id = annot['image_id']
    caption = annot['caption']

    # Convert image_id (int) to filename format (str)
    image_filename = f"{image_id:012d}.jpg"

    # Append the caption to corresponding image key
    captions_dict[image_filename].append(caption)

# check number of images and a sample
print("Total unique images with captions:", len(captions_dict))
sample_key = list(captions_dict.keys())[0]
print(f"\nSample image filename: {sample_key}")
print(f"Captions for this image:\n{captions_dict[sample_key]}")


Total unique images with captions: 118287

Sample image filename: 000000203564.jpg
Captions for this image:
['A bicycle replica with a clock as the front wheel.', 'The bike has a clock as a tire.', 'A black metal bicycle with a clock inside the front wheel.', 'A bicycle figurine in which the front wheel is replaced with a clock\n', 'A clock with the appearance of the wheel of a bicycle ']


## Step 3: Clean and Preprocess Captions

In [None]:
import json
import string

# Load caption file again
with open("coco_annotations/annotations/captions_train2017.json", "r") as f:
    captions_data = json.load(f)

# Recreate image_id to captions mapping
image_captions = {}
for item in captions_data["annotations"]:
    img_id = f"{item['image_id']:012d}.jpg"
    caption = item["caption"]
    if img_id not in image_captions:
        image_captions[img_id] = []
    image_captions[img_id].append(caption)

# Clean each caption
def clean_caption(caption):
    caption = caption.lower()  # lowercase
    caption = caption.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    caption = ' '.join([word for word in caption.split() if word.isalpha()])  # remove non-alpha
    caption = '<start> ' + caption + ' <end>'  # add start/end
    return caption

# Clean all captions
cleaned_captions = {}
for img_id, caption_list in image_captions.items():
    cleaned_captions[img_id] = [clean_caption(c) for c in caption_list]

# Print
sample_id = list(cleaned_captions.keys())[0]
print("Image:", sample_id)
print("Cleaned Captions:")
for c in cleaned_captions[sample_id]:
    print(c)


Image: 000000203564.jpg
Cleaned Captions:
<start> a bicycle replica with a clock as the front wheel <end>
<start> the bike has a clock as a tire <end>
<start> a black metal bicycle with a clock inside the front wheel <end>
<start> a bicycle figurine in which the front wheel is replaced with a clock <end>
<start> a clock with the appearance of the wheel of a bicycle <end>


## Step 4: Tokenize and Convert Captions to Sequences

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Flatten all captions into single list
all_captions = []
for cap_list in cleaned_captions.values():
    all_captions.extend(cap_list)

# Tokenizer setup
tokenizer = Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(all_captions)

# Vocabulary size (+1 for padding)
vocab_size = len(tokenizer.word_index) + 1

# Convert captions to sequences
caption_seqs = {}
for img_id, cap_list in cleaned_captions.items():
    caption_seqs[img_id] = tokenizer.texts_to_sequences(cap_list)

# Check max length of any caption
max_length = max(len(seq) for cap_list in caption_seqs.values() for seq in cap_list)

# Show results
print(f"Total words in vocab: {vocab_size}")
print(f"Max caption length: {max_length}")

# Sample
sample_img = list(caption_seqs.keys())[0]
print(f"\nImage: {sample_img}")
print("Tokenized captions:")
for seq in caption_seqs[sample_img]:
    print(seq)


NameError: name 'cleaned_captions' is not defined

## Step 5: Pad the Sequences

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Store tokenized captions properly
tokenized_captions = {}

for img_id, caps in cleaned_captions.items():
    tokenized = tokenizer.texts_to_sequences(caps)
    tokenized_captions[img_id] = tokenized

# Define max_len
max_len = max(len(seq) for cap_list in tokenized_captions.values() for seq in cap_list)

# Pad sequences
padded_captions = {}

for img_id, cap_seqs in tokenized_captions.items():
    padded_captions[img_id] = pad_sequences(cap_seqs, maxlen=max_len, padding='post')


### **Print**

In [None]:
sample_img = next(iter(padded_captions))
print(f"Image: {sample_img}")
print("Padded Captions Shape:", padded_captions[sample_img].shape)
print("First Padded Caption:", padded_captions[sample_img][0])


Image: 000000203564.jpg
Padded Captions Shape: (5, 51)
First Padded Caption: [   4    2  353 3787    9    2   83  122    7   40 1083    3    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0]


## Step 6: Save Preprocessed Captions + Tokenizer

In [None]:
import pickle

# Save tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Save padded captions
with open('padded_captions.pkl', 'wb') as f:
    pickle.dump(padded_captions, f)

# Save cleaned captions
with open('cleaned_captions.pkl', 'wb') as f:
    pickle.dump(cleaned_captions, f)

# Save max caption length
with open('max_len.txt', 'w') as f:
    f.write(str(max_len))


### **Check if it's saved**

In [None]:
import os

print("Files saved in your working directory:")
print(os.listdir())


Files saved in your working directory:
['.config', 'tokenizer.pkl', 'coco_annotations', 'padded_captions.pkl', 'cleaned_captions.pkl', 'annotations_trainval2017.zip', 'max_len.txt', 'sample_data']


# Step 7:Feature Extraction(Inception V3)

In [1]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Required Libraries
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
import numpy as np
import os
import pickle
from tqdm import tqdm

# Step 3: Load Pretrained InceptionV3 (CNN) Model
base_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
model = Model(inputs=base_model.input, outputs=base_model.output)

# Step 4: Preprocessing Function
def load_preprocessed_img(img_path):
    img = image.load_img(img_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    return img_array

# Step 5: Feature Extraction Function
def extract_features_preprocessed(img_folder):
    features = {}
    # Get list of files in the folder
    image_files = [f for f in os.listdir(img_folder) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    for img_name in tqdm(image_files):
        img_path = os.path.join(img_folder, img_name)
        img_input = load_preprocessed_img(img_path)
        feature = model.predict(img_input, verbose=0)
        features[img_name] = feature.reshape(-1)
    return features

# Step 6: Set Folder Path on Google Drive (Update this path if needed)
img_folder ='/content/drive/MyDrive/resized_images'

# Step 7: Extract Features
features = extract_features_preprocessed(img_folder)

# Step 8: Save Extracted Features to Google Drive
output_path = '/content/drive/MyDrive/image_features.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(features, f)

print("CNN Feature extraction completed and saved to Drive!")

Mounted at /content/drive
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/resized_images'

### **Go to Google Drive in Browser**

- Open https://drive.google.com/drive/my-drive
- Look for the folder `resized_images` in your **"My Drive"** section.

#### If you don’t see it, then it means that you have not added it to your own Drive yet.
-----------------------------------------------------------------------------
### How to add it:

To make Collab see the folder:

1. **Open the Drive link** Aditya shared.
2. Click **"Add Shortcut to Drive"** (or "Add to Drive").
3. Choose: **My Drive** (not "Shared with me").
3. Now it’ll appear in your "My Drive".
4. Refresh your Google Drive tab to check it's really there.

-----------------------------------------------------------------------------

Run the below code and see in the output if you can see one of the file's name's is 'resized_images'

If you can see, it means that the folder Aditya shared has been added into your google drive.

In [None]:
import os
print(os.listdir("/content/drive/MyDrive"))


## **STEP:8 LSTM CAPTIONING MODEL.**



In [None]:
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model

# Parameters (adjust according to your tokenizer and dataset)
vocab_size = 5000         # Replace with actual tokenizer.num_words or len(tokenizer.word_index) + 1
max_length = 34           # Replace with actual max caption length
embedding_dim = 256       # Size of word embeddings

# Define the model
def define_captioning_model(vocab_size, max_length):
    # Feature extractor (image vector input)
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)

    # Sequence processor (caption input)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder (merge image + text)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_captioning_model(vocab_size, max_length)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

## **STEP:9 CAPTION GENERATION**

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_caption(model, tokenizer, photo, max_length):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += ' ' + word
        if word == '<end>':
            break
    final_caption = in_text.replace('<start>', '').replace('<end>', '').strip()
    return final_caption

## **STEP:10 USING WITH FEATURES.**

In [2]:
import pickle

# Load image features
with open('/content/drive/MyDrive/image_features.pkl', 'rb') as f:
    features = pickle.load(f)

# Choose one image
img_name = list(features.keys())[0]
photo_feature = features[img_name].reshape((1, 2048))  # Reshape for model input

# Generate caption
caption = generate_caption(model, tokenizer, photo_feature, max_length)
print(f"Caption for {img_name}: {caption}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/image_features.pkl'