In [1]:
!pip install pandas transformers scikit-learn matplotlib

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os




In [2]:
# Precomputed image features and their labels, read from .npy files on Drive.
# The original annotations give the expected shapes as (N, 2048) for the
# features and either class indices or one-hot rows for the labels.
image_features = np.load('/content/drive/MyDrive/Captions and img features/inception_features.npy')
labels = np.load('/content/drive/MyDrive/Captions and img features/inception_labels.npy')

# Anything with more than one axis is treated as one-hot and collapsed to
# the index of the largest entry in each row.
if labels.ndim > 1:
    labels = labels.argmax(axis=1)


In [5]:
# Sanity check: show the dimensions of the loaded image-feature array.
print(f"Image features shape: {image_features.shape}")


Image features shape: (4385, 2048)


In [6]:
# Trim captions to match features.
# `caption_texts` is only built in a later cell, so on a fresh kernel
# (Restart & Run All) the name does not exist yet when this cell runs.
# Skip the trim in that case instead of raising NameError.
# NOTE(review): truncation only equalizes lengths; it does not align captions
# with features by filename.
if "caption_texts" in globals():
    caption_texts = caption_texts[:image_features.shape[0]]


In [10]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorf

Load Labels

In [11]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np

# Walk the dataset directory (one subfolder per class) in a fixed, unshuffled
# order so the filename list can be used as a stable index.
datagen = ImageDataGenerator()
generator = datagen.flow_from_directory(
    directory='/content/drive/MyDrive/Dataset',
    target_size=(299, 299),
    batch_size=1,
    class_mode=None,
    shuffle=False,
)

# generator.filenames carries the class-folder prefix; keep only the final
# path component of each entry.
image_filenames = list(map(os.path.basename, generator.filenames))

# Persist the list so later cells can reload it without rescanning.
np.save('/content/drive/MyDrive/Dataset/image_filenames.npy', image_filenames)

print(f"Saved {len(image_filenames)} image filenames.")


Found 4385 images belonging to 5 classes.
Saved 4385 image filenames.


In [13]:
import os

# Step 1: build a lookup from normalized image name to caption.
# Keys are reduced to a stripped, lower-cased basename; when the same key
# appears more than once, the last row read wins.
# NOTE(review): `csv_files` is not defined in any cell shown here; it must
# already exist in the kernel.
caption_map = {}
for csv_path in csv_files:
    df = pd.read_csv(csv_path)
    caption_map.update(
        (os.path.basename(name).strip().lower(), text)
        for name, text in zip(df['image'], df['caption'])
    )

# Step 2: reload the saved filename list and normalize it the same way as
# the caption keys so the two can be compared directly.
image_filenames = [
    os.path.basename(f).strip().lower()
    for f in np.load('/content/drive/MyDrive/Dataset/image_filenames.npy', allow_pickle=True)
]

# Step 3: one caption per filename, in filename order; images without a
# caption get a placeholder string and are counted.
caption_texts = [caption_map.get(name, "No caption found") for name in image_filenames]
missing = sum(1 for name in image_filenames if name not in caption_map)

print(f"✅ Loaded {len(caption_texts)} captions.")
print(f"⚠️ Missing captions for {missing} images.")


✅ Loaded 4385 captions.
⚠️ Missing captions for 201 images.


In [14]:
# Keep only the samples whose normalized filename has a caption, so that
# features and captions stay index-aligned.

# `image_features` is overwritten at the end of this cell. Re-running the
# cell would therefore index the already-filtered array with positions from
# the unfiltered filename list; fail loudly instead of misaligning.
if len(image_features) != len(image_filenames):
    raise ValueError(
        f"image_features has {len(image_features)} rows but there are "
        f"{len(image_filenames)} filenames; reload the features before "
        "re-running this cell."
    )

# Positions (in filename order) of images that have a caption.
valid_idx = [i for i, img_name in enumerate(image_filenames) if img_name in caption_map]

valid_features = [image_features[i] for i in valid_idx]
valid_captions = [caption_map[image_filenames[i]] for i in valid_idx]

# Convert to arrays
image_features = np.array(valid_features)
caption_texts = valid_captions

print(f"After filtering: {len(image_features)} features and {len(caption_texts)} captions.")


After filtering: 4184 features and 4184 captions.


In [16]:
# Reload the classification labels from disk (the original comment describes
# them as already aligned).
# NOTE(review): this array is not filtered here; the rows without captions
# are dropped from it in the split cell further down.
labels = np.load('/content/drive/MyDrive/Captions and img features/inception_labels.npy')

# More than one axis is treated as one-hot: collapse to the index of the
# largest entry per row.
if labels.ndim > 1:
    labels = labels.argmax(axis=1)

print("Labels shape:", labels.shape)


Labels shape: (4385,)


Tokenize Captions (BERT)

In [17]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every caption in one call. Each sequence is padded or truncated
# to exactly 50 tokens and the result is returned as PyTorch tensors.
tokenized = tokenizer(
    caption_texts,
    max_length=50,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [30]:
# NOTE: intentionally a no-op.
# This cell used to run `filtered_labels = filtered_labels - 1`, but
# `filtered_labels` is only created further down (in the split cell), so the
# statement raised NameError on a top-to-bottom run. The split cell rebuilds
# `filtered_labels` from `labels` anyway, and MultimodalDataset subtracts 1
# from the labels it is given, so a shift here would be redundant at best
# and a double shift at worst.

 Build Custom Dataset

In [37]:
import torch
from torch.utils.data import Dataset

class MultimodalDataset(torch.utils.data.Dataset):
    """Index-aligned image features, tokenized captions and class labels.

    Args:
        image_feats: Array-like of per-sample image features; stored as a
            float32 tensor.
        text_tokens: Mapping with 'input_ids' and 'attention_mask' entries,
            each indexable by sample position.
        labels: Array-like of integer class ids (NumPy array, list, ...).
        label_offset: Value subtracted from every label before it is stored.
            The default of 1 keeps the historical behavior of this class
            (labels are shifted down by one); pass 0 for labels that are
            already zero-based.

    Raises:
        ValueError: If the four inputs do not hold the same number of samples.
    """

    def __init__(self, image_feats, text_tokens, labels, label_offset=1):
        self.img_feats = torch.tensor(image_feats, dtype=torch.float32)
        self.input_ids = text_tokens['input_ids']
        self.attn_mask = text_tokens['attention_mask']
        # np.asarray lets plain lists be shifted too; `labels - 1` on a list
        # would raise TypeError.
        self.labels = torch.tensor(np.asarray(labels) - label_offset, dtype=torch.long)

        # A length mismatch would otherwise surface later as an IndexError
        # or as silently mispaired samples.
        sizes = {
            'image_feats': len(self.img_feats),
            'input_ids': len(self.input_ids),
            'attention_mask': len(self.attn_mask),
            'labels': len(self.labels),
        }
        if len(set(sizes.values())) != 1:
            raise ValueError(f"Sample counts differ between inputs: {sizes}")

    def __len__(self):
        """Return the number of samples (taken from the label tensor)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict of tensors."""
        return {
            'image': self.img_feats[idx],
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_mask[idx],
            'label': self.labels[idx]
        }


Split Dataset and Create Dataloaders

In [38]:
# Restrict the labels to the rows that survived caption filtering.
labels = np.array(labels)  # guarantees NumPy fancy indexing below
keep_rows = [row for row, name in enumerate(image_filenames) if name in caption_map]
filtered_labels = labels[keep_rows]

# Features, captions and labels must now describe the same samples.
assert len(image_features) == len(caption_texts) == len(filtered_labels)

# 80/20 split over sample positions, with a fixed seed.
train_idx, test_idx = train_test_split(range(len(filtered_labels)), test_size=0.2, random_state=42)


def _make_split(indices):
    """Build a MultimodalDataset from the rows listed in `indices`."""
    tokens = {key: tensor[indices] for key, tensor in tokenized.items()}
    return MultimodalDataset(image_features[indices], tokens, filtered_labels[indices])


train_data = _make_split(train_idx)
test_data = _make_split(test_idx)


In [39]:
from torch.utils.data import DataLoader

# Both loaders use the same batch size; only the training loader shuffles.
BATCH_SIZE = 32

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)


Define Vision-Language Classifier

In [40]:
import torch.nn as nn
from transformers import BertModel

class FusionClassifier(nn.Module):
    """Classify from a caption and an image-feature vector.

    The caption goes through a pretrained 'bert-base-uncased' encoder and the
    image features through a linear projection. The two vectors are
    concatenated and passed to a small MLP head that emits `num_classes`
    logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        text_dim = 768     # width of the text vector and of the image projection
        image_dim = 2048   # width of the incoming image features
        hidden_dim = 512   # width of the head's hidden layer

        self.text_encoder = BertModel.from_pretrained('bert-base-uncased')
        self.img_proj = nn.Linear(image_dim, text_dim)
        self.classifier = nn.Sequential(
            nn.Linear(text_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, image, input_ids, attention_mask):
        """Return class logits for a batch of (image, caption) pairs."""
        encoded = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Text vector first, projected image vector second, joined on the
        # feature axis.
        fused = torch.cat([encoded.pooler_output, self.img_proj(image)], dim=1)
        return self.classifier(fused)

# Size the output layer from the number of distinct values in `labels`.
# NOTE(review): `labels` is the unfiltered array here, not `filtered_labels`;
# confirm that is the intended class count.
num_classes = np.unique(labels).size

# Use the GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = FusionClassifier(num_classes)
model = model.to(device)


Train the Model

In [42]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

# Single source of truth for the epoch count (previously hard-coded both in
# the loop header and in the progress-bar label).
NUM_EPOCHS = 5

optimizer = Adam(model.parameters(), lr=1e-4)
loss_fn = CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    # The label is constant for the whole epoch, so set it once here rather
    # than on every batch.
    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
    for batch in loop:
        # Move every tensor of the batch to the model's device.
        for k in batch:
            batch[k] = batch[k].to(device)

        optimizer.zero_grad()
        outputs = model(batch['image'], batch['input_ids'], batch['attention_mask'])
        loss = loss_fn(outputs, batch['label'])
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the sum and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # The running sum used to be accumulated and then discarded; report the
    # epoch mean, which is less noisy than the last-batch loss shown on the
    # bar. max(..., 1) avoids dividing by zero for an empty loader.
    mean_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - mean train loss: {mean_loss:.4f}")


Epoch [1/5]: 100%|██████████| 105/105 [19:05<00:00, 10.91s/it, loss=0.00358]
Epoch [2/5]: 100%|██████████| 105/105 [18:45<00:00, 10.72s/it, loss=0.125]
Epoch [3/5]: 100%|██████████| 105/105 [18:52<00:00, 10.78s/it, loss=0.000645]
Epoch [4/5]: 100%|██████████| 105/105 [18:44<00:00, 10.71s/it, loss=0.00389]
Epoch [5/5]: 100%|██████████| 105/105 [18:44<00:00, 10.71s/it, loss=0.0015]


Evaluate Model Accuracy

In [43]:
from sklearn.metrics import accuracy_score

# Evaluation pass over the held-out split: eval mode, no gradient tracking.
model.eval()
preds, targets = [], []

with torch.no_grad():
    for batch in test_loader:
        # Move every tensor of the batch to the model's device.
        batch = {key: value.to(device) for key, value in batch.items()}

        logits = model(batch['image'], batch['input_ids'], batch['attention_mask'])
        # The predicted class is the index of the largest logit per sample.
        preds.extend(logits.argmax(dim=1).cpu().numpy())
        targets.extend(batch['label'].cpu().numpy())

acc = accuracy_score(targets, preds)
print(f"✅ Test Accuracy: {acc:.4f}")


✅ Test Accuracy: 0.9928


In [44]:
# Persist only the model's state_dict (parameters and buffers, not the module
# object) under a relative path, i.e. in the current working directory.
torch.save(model.state_dict(), 'fusion_classifier.pth')


Predict Diagnosis from Image + Text

In [46]:
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D


InceptionV3 Feature Extractor (Keras)

In [47]:
# ImageNet-pretrained InceptionV3 without its classification head, followed
# by global average pooling so each 299x299 RGB image maps to one vector.
base_model = InceptionV3(
    include_top=False,
    weights='imagenet',
    input_shape=(299, 299, 3),
)
pooling_layer = GlobalAveragePooling2D()
output = pooling_layer(base_model.output)
inception_model = Model(inputs=base_model.input, outputs=output)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


BERT Tokenizer

In [72]:
# Load the pretrained 'bert-base-uncased' tokenizer; predict_diagnosis reads
# this module-level `tokenizer` when it encodes a caption.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


FusionClassifier (PyTorch)

In [73]:
# Rebuild the architecture and restore the saved weights for inference.
model = FusionClassifier(num_classes)
# map_location remaps the stored tensors onto the current device. Without it,
# a checkpoint saved from a GPU session cannot be loaded on a CPU-only
# runtime.
model.load_state_dict(torch.load('fusion_classifier.pth', map_location=device))
model.to(device)
# Eval mode turns dropout off; as the cell's last expression this also
# displays the module summary.
model.eval()


FusionClassifier(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

 Extract Image Feature

In [74]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input

def extract_image_feature(img_path):
    """Return the feature vector `inception_model` produces for one image file.

    The image is loaded at 299x299, turned into an array, given a leading
    batch axis and passed through InceptionV3's `preprocess_input` before
    prediction. The first (and only) row of the prediction is returned; the
    original annotation gives its shape as (2048,).
    """
    pil_img = image.load_img(img_path, target_size=(299, 299))
    # Add the batch axis, then apply the InceptionV3 input preprocessing.
    batch = preprocess_input(np.expand_dims(image.img_to_array(pil_img), axis=0))
    return inception_model.predict(batch)[0]

Prediction Function

In [75]:
import torch.nn.functional as F

def predict_diagnosis(img_path, caption, model):
    """Predict a class index and its softmax confidence for one image/caption pair.

    Args:
        img_path: Path of the image file; features come from
            `extract_image_feature`.
        caption: Caption text, tokenized with the module-level `tokenizer`
            using the same settings as training (padded or truncated to 50
            tokens).
        model: The fusion model (an `nn.Module`) to run. It is switched to
            eval mode for the forward pass and its previous train/eval mode
            is restored afterwards.

    Returns:
        Tuple `(pred_class, confidence)`: the index of the highest-probability
        class as an int, and that probability as a float.
    """
    # Extract image features
    img_feat = extract_image_feature(img_path)
    img_tensor = torch.tensor(img_feat, dtype=torch.float32).unsqueeze(0).to(device)  # add batch axis

    # Tokenize text
    tokens = tokenizer(caption, padding='max_length', truncation=True, max_length=50, return_tensors='pt')
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Dropout must be inactive for a deterministic prediction. Force eval
    # mode here instead of trusting the caller, then put the model back the
    # way it was.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(img_tensor, input_ids, attention_mask)
            probs = F.softmax(logits, dim=1)
            pred_class = torch.argmax(probs, dim=1).item()
            confidence = torch.max(probs).item()
    finally:
        model.train(was_training)

    return pred_class, confidence

In [76]:
# Sample inference on an image from the "normal" folder.
# NOTE(review): the same map-and-print pattern is repeated in the following
# cells; a small helper would remove the duplication.
# Presumably the indices follow the training label order; TODO confirm.
diagnosis_map = {
    0: "glucoma",
    1: "Cataract",
    2: "Retinal Disease",
    3: "Normal",
}

img_path = "/content/drive/MyDrive/Dataset/normal/2345_right.jpg"
caption = "I feel confident in my eyesight and have no concerns about vision loss"

pred_class, confidence = predict_diagnosis(img_path, caption, model)
print(f"🩺 Predicted Diagnosis: {diagnosis_map[pred_class]} (Confidence: {confidence:.2f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 401ms/step
🩺 Predicted Diagnosis: Normal (Confidence: 1.00)


In [77]:
# Sample inference on an image from the "Glaucoma" folder.
# Presumably the indices follow the training label order; TODO confirm.
diagnosis_map = {
    0: "glucoma",
    1: "Cataract",
    2: "Retinal Disease",
    3: "Normal",
}

img_path = "/content/drive/MyDrive/Dataset/Glaucoma/086.jpg"
# NOTE(review): the caption contains mis-decoded apostrophes; it is left
# untouched because the string is fed to the model as is.
caption = "My vision has become increasingly blurry and clouded, and I feel like Iâ€™m looking through a foggy window. Iâ€™m worried because bright lights cause halos and glare, making driving or walking at night dangerous."

pred_class, confidence = predict_diagnosis(img_path, caption, model)
print(f"🩺 Predicted Diagnosis: {diagnosis_map[pred_class]} (Confidence: {confidence:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 405ms/step
🩺 Predicted Diagnosis: glucoma (Confidence: 0.99)


In [78]:
# Sample inference on an image from the "cataract" folder.
# Presumably the indices follow the training label order; TODO confirm.
diagnosis_map = {
    0: "glucoma",
    1: "Cataract",
    2: "Retinal Disease",
    3: "Normal",
}

img_path = "/content/drive/MyDrive/Dataset/cataract/2154_left.jpg"
# NOTE(review): the caption contains a mis-decoded apostrophe; it is left
# untouched because the string is fed to the model as is.
caption = "When I step outside into bright sunlight, I have to shield my eyes because the glare is overwhelming. My vision often seems dull and muted, and I feel like Iâ€™m looking through a thick layer of fog."

pred_class, confidence = predict_diagnosis(img_path, caption, model)
print(f"🩺 Predicted Diagnosis: {diagnosis_map[pred_class]} (Confidence: {confidence:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 369ms/step
🩺 Predicted Diagnosis: Cataract (Confidence: 1.00)


In [79]:
# Sample inference on an image from the "diabetic_retinopathy" folder.
# Presumably the indices follow the training label order; TODO confirm.
diagnosis_map = {
    0: "glucoma",
    1: "Cataract",
    2: "Retinal Disease",
    3: "Normal",
}

img_path = "/content/drive/MyDrive/Dataset/diabetic_retinopathy/10031_right.jpeg"
caption = "I have trouble judging distances because my vision is blurry and inconsistent. This affects my balance and coordination."

pred_class, confidence = predict_diagnosis(img_path, caption, model)
print(f"🩺 Predicted Diagnosis: {diagnosis_map[pred_class]} (Confidence: {confidence:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387ms/step
🩺 Predicted Diagnosis: Retinal Disease (Confidence: 1.00)


In [80]:
# Sample inference on a second image from the "normal" folder.
# Presumably the indices follow the training label order; TODO confirm.
diagnosis_map = {
    0: "glucoma",
    1: "Cataract",
    2: "Retinal Disease",
    3: "Normal",
}

img_path = "/content/drive/MyDrive/Dataset/normal/2334_right.jpg"
caption = "I can comfortably engage in visually demanding tasks without strain."

pred_class, confidence = predict_diagnosis(img_path, caption, model)
print(f"🩺 Predicted Diagnosis: {diagnosis_map[pred_class]} (Confidence: {confidence:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 390ms/step
🩺 Predicted Diagnosis: Normal (Confidence: 1.00)
