<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/deep_learning_image_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
from torch import nn
from transformers import CLIPProcessor, CLIPModel
from sklearn.model_selection import train_test_split
import pandas as pd
from PIL import Image

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
directory = '/content/drive/My Drive/Capstone Data Collection/Image Datasets'

dataframes = {}

for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        df = pd.read_csv(os.path.join(directory, filename))
        dataframes[filename] = df

for key, df in dataframes.items():
    print(f"{key}: {df.shape}")

Images_to_Text_Results.csv: (150, 3)
df_upscale_photos.csv: (796, 18)
lower_rated.csv: (333, 21)
highly_rated.csv: (3760, 21)
middle_rated.csv: (918, 21)


In [4]:
df_captions = dataframes['Images_to_Text_Results.csv']

df_list = ['highly_rated.csv', 'middle_rated.csv', 'lower_rated.csv', 'df_upscale_photos.csv']

df_photos = []

for i in df_list:
  if i in dataframes:
    df = dataframes[i]
    df_photos.append(df)

df_concat = pd.concat(df_photos, ignore_index=True)
df_concat.shape

(5807, 21)

In [5]:
df_sliced = df_concat[['photo_id', 'label']]

df_captions['photo_id'] = df_captions['photo_id'].str.replace('.jpg', '', regex=False)

df_merged = df_captions.merge(df_sliced, how='left', on='photo_id')
df_merged.shape

(150, 4)

In [6]:
df_merged['photo_id'] = df_merged['photo_id'] + '.jpg'

df_merged['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
fast food,75
fine dining,75


In [7]:
image_dir = "/content/drive/My Drive/Capstone Data Collection/test photos"

image_paths = []

for subfolder in ['fast food', 'upscale']:
    subfolder_path = os.path.join(image_dir, subfolder)

    image_paths.extend([os.path.join(subfolder_path, img) for img in os.listdir(subfolder_path) if img.endswith(".jpg")])

print(len(image_paths))

50


In [8]:
df_merged['labels'] = df_merged['label'].map({"fast food": 0, "fine dining": 1})
df_merged.head()

Unnamed: 0,photo_id,model_name,caption,label,labels
0,Ax5PLwfU94uEXMafFdXrtw.jpg,nlpconnect/vit-gpt2-image-captioning,a sandwich with meat and cheese on a cutting b...,fast food,0
1,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-large,a close up of a sandwich on a piece of paper o...,fast food,0
2,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-base,"a hamburger with cheese, bacon and cheese on it",fast food,0
3,bFNqVruIW3AXjgSuLHq4kg.jpg,nlpconnect/vit-gpt2-image-captioning,a person holding a sandwich in their hand,fast food,0
4,bFNqVruIW3AXjgSuLHq4kg.jpg,Salesforce/blip-image-captioning-large,someone is grabbing a sausage patty out of a b...,fast food,0


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [11]:
image_features = []
text_features = []
labels = []


for idx, row in df_merged.iterrows():
  target_filename = row["photo_id"] if row["photo_id"].endswith(".jpg") else f"{row['photo_id']}.jpg"
  image_path = next((path for path in image_paths if os.path.basename(path) == target_filename), None)

  image = Image.open(image_path).convert("RGB")
  caption = row["caption"]
  label = row["labels"]

  inputs = clip_processor(text=[caption], images=image, return_tensors="pt", padding=True).to(device)
  with torch.no_grad():
    image_embedding = clip_model.get_image_features(pixel_values=inputs['pixel_values']).squeeze().cpu()
    text_embedding = clip_model.get_text_features(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).squeeze().cpu()

  image_features.append(image_embedding)
  text_features.append(text_embedding)
  labels.append(label)

image_features = torch.stack(image_features)
text_features = torch.stack(text_features)
labels = torch.tensor(labels)

In [12]:
class MultimodalClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(MultimodalClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, image_embedding, text_embedding):
        x = torch.cat((image_embedding, text_embedding), dim=1)
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

input_dim = image_features.shape[1]
hidden_dim = 256
model = MultimodalClassifier(input_dim, hidden_dim).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [17]:
train_image_feats, val_image_feats, train_text_feats, val_text_feats, train_labels, val_labels = train_test_split(
    image_features, text_features, labels, test_size=0.2, random_state=42)


epochs = 25
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    train_image_feats = train_image_feats.to(device)
    train_text_feats = train_text_feats.to(device)
    train_labels = train_labels.float().to(device)

    outputs = model(train_image_feats, train_text_feats).squeeze()
    loss = criterion(outputs, train_labels)

    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


model.eval()
with torch.no_grad():
    val_image_feats = val_image_feats.to(device)
    val_text_feats = val_text_feats.to(device)
    val_labels = val_labels.float().to(device)

    val_outputs = model(val_image_feats, val_text_feats).squeeze()
    predictions = (val_outputs > 0.5).float()
    accuracy = (predictions == val_labels).sum().item() / val_labels.size(0)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Epoch [1/25], Loss: 0.2493
Epoch [2/25], Loss: 0.2439
Epoch [3/25], Loss: 0.2385
Epoch [4/25], Loss: 0.2333
Epoch [5/25], Loss: 0.2282
Epoch [6/25], Loss: 0.2233
Epoch [7/25], Loss: 0.2184
Epoch [8/25], Loss: 0.2137
Epoch [9/25], Loss: 0.2091
Epoch [10/25], Loss: 0.2046
Epoch [11/25], Loss: 0.2002
Epoch [12/25], Loss: 0.1959
Epoch [13/25], Loss: 0.1917
Epoch [14/25], Loss: 0.1876
Epoch [15/25], Loss: 0.1836
Epoch [16/25], Loss: 0.1796
Epoch [17/25], Loss: 0.1758
Epoch [18/25], Loss: 0.1721
Epoch [19/25], Loss: 0.1685
Epoch [20/25], Loss: 0.1649
Epoch [21/25], Loss: 0.1615
Epoch [22/25], Loss: 0.1581
Epoch [23/25], Loss: 0.1548
Epoch [24/25], Loss: 0.1516
Epoch [25/25], Loss: 0.1485
Validation Accuracy: 100.00%
