In [3]:
pip install s3fs

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import io
import boto3
import sagemaker
import torch
from torch import nn
import numpy as np
import datetime as dt
from io import BytesIO
from sklearn.metrics import accuracy_score
from transformers import CLIPProcessor, CLIPModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from PIL import Image

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2024-11-26 04:00:19.805303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-26 04:00:19.825571: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-26 04:00:19.832088: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-26 04:00:19.847135: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
s3 = boto3.client('s3')

bucket_name = "sagemaker-studio-619071335465-8h7owh9eftx"
main_image_dir = 'training/image classification/'

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=main_image_dir)

csv_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]

csv_files

['training/image classification/df_upscale_photos.csv',
 'training/image classification/final_image_to_text_results.csv',
 'training/image classification/highly_rated.csv',
 'training/image classification/lower_rated.csv',
 'training/image classification/middle_rated.csv']

In [3]:
excluded_file = 'training/image classification/final_image_to_text_results.csv'
csv_files_to_append = [file for file in csv_files if file != excluded_file]

appended_dataframes = []

for file_key in csv_files_to_append:
    file_path = f"s3://{bucket_name}/{file_key}"
    df = pd.read_csv(file_path)
    appended_dataframes.append(df)

df_concat = pd.concat(appended_dataframes, ignore_index=True)

excluded_file_path = f"s3://{bucket_name}/{excluded_file}"
df_captions = pd.read_csv(excluded_file_path)


print(f"Appended DataFrame shape: {df_concat.shape}")
print(f"Excluded 'final_image_to_text_results.csv' shape: {df_captions.shape}")

Appended DataFrame shape: (5807, 21)
Excluded 'final_image_to_text_results.csv' shape: (17421, 3)


In [4]:
df_sliced = df_concat[['photo_id', 'label']]
df_sliced.head()

Unnamed: 0,photo_id,label
0,7sVrf-VF50HGES_h8OQ46A,fine dining
1,1Zh80DfJ5okYm2S8wxePUA,fine dining
2,jBEueCghl2S_bFDuVux1lA,fine dining
3,NO3puIMIwXjNbstSXXXh0A,fine dining
4,K5wG2QbekQxLd8VmE0Hu2A,fine dining


In [5]:
df_captions.head()

Unnamed: 0,photo_id,model_name,caption
0,-0CTxYw82SWnJfzPOBBIOQ.jpg,nlpconnect/vit-gpt2-image-captioning,a refrigerator with a picture of a pizza on it
1,-0CTxYw82SWnJfzPOBBIOQ.jpg,Salesforce/blip-image-captioning-large,there is a large salad bar with a bunch of veg...
2,-0CTxYw82SWnJfzPOBBIOQ.jpg,Salesforce/blip-image-captioning-base,a kitchen with a large sign above it
3,-0fa0mOVKrJW90MFFxVImg.jpg,nlpconnect/vit-gpt2-image-captioning,a hot dog with mustard and ketchup on a bun
4,-0fa0mOVKrJW90MFFxVImg.jpg,Salesforce/blip-image-captioning-large,araffe with a pickle and a side of french fries


In [6]:
df_captions['photo_id'] = df_captions['photo_id'].str.replace('.jpg', '', regex=False)

df_merged = df_captions.merge(df_sliced, how='left', on='photo_id')
df_merged.shape

(17421, 4)

In [7]:
df_merged['photo_id'] = df_merged['photo_id'] + '.jpg'

df_merged['label'].value_counts()

label
fast food      15033
fine dining     2388
Name: count, dtype: int64

In [8]:
image_dir = 'image datasets/'

def get_all_images(bucket, prefix):
    continuation_token = None
    image_keys = []

    while True:
        list_params = {
            'Bucket': bucket,
            'Prefix': prefix,
        }
        if continuation_token:
            list_params['ContinuationToken'] = continuation_token

        response = s3.list_objects_v2(**list_params)

        if 'Contents' in response:
            for obj in response['Contents']:
                key = obj['Key']
                if key.endswith('.jpg'):
                    image_keys.append(key)

        if not response.get('IsTruncated'):
            break

        continuation_token = response.get('NextContinuationToken')

    return image_keys


image_paths = get_all_images(bucket_name, image_dir)


if len(image_paths) > 0:
    print(f"Total images found: {len(image_paths)}")
    print("Example image paths:")
    for path in image_paths[:5]:
        print(path)

Total images found: 5807
Example image paths:
image datasets/fastfood images/Fastfood Images/Highly Rated/-0CTxYw82SWnJfzPOBBIOQ.jpg
image datasets/fastfood images/Fastfood Images/Highly Rated/-0fa0mOVKrJW90MFFxVImg.jpg
image datasets/fastfood images/Fastfood Images/Highly Rated/-4PTjFxdyR-tkxDhVeuAfQ.jpg
image datasets/fastfood images/Fastfood Images/Highly Rated/-6wM47iMcw_wjW3gZYaz-g.jpg
image datasets/fastfood images/Fastfood Images/Highly Rated/-7Z1mIroHNK6IJKHMLfnJg.jpg


In [9]:
df_merged['labels'] = df_merged['label'].map({"fast food": 0, "fine dining": 1})
df_merged.head()

Unnamed: 0,photo_id,model_name,caption,label,labels
0,-0CTxYw82SWnJfzPOBBIOQ.jpg,nlpconnect/vit-gpt2-image-captioning,a refrigerator with a picture of a pizza on it,fast food,0
1,-0CTxYw82SWnJfzPOBBIOQ.jpg,Salesforce/blip-image-captioning-large,there is a large salad bar with a bunch of veg...,fast food,0
2,-0CTxYw82SWnJfzPOBBIOQ.jpg,Salesforce/blip-image-captioning-base,a kitchen with a large sign above it,fast food,0
3,-0fa0mOVKrJW90MFFxVImg.jpg,nlpconnect/vit-gpt2-image-captioning,a hot dog with mustard and ketchup on a bun,fast food,0
4,-0fa0mOVKrJW90MFFxVImg.jpg,Salesforce/blip-image-captioning-large,araffe with a pickle and a side of french fries,fast food,0


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [11]:
# df_sample = df_merged.sample(frac=0.0575, random_state=42)
# df_sample.shape

df_merged.shape

(17421, 5)

In [12]:

image_features = []
text_features = []
labels = []


def fetch_image_from_s3(bucket, key):
    response = s3.get_object(Bucket=bucket, Key=key)
    image_bytes = response['Body'].read()
    return Image.open(BytesIO(image_bytes)).convert("RGB")


for idx, row in df_merged.iterrows():
  target_filename = row["photo_id"] if row["photo_id"].endswith(".jpg") else f"{row['photo_id']}.jpg"
  image_key = next((path for path in image_paths if os.path.basename(path) == target_filename), None)

  if image_key:
    image = fetch_image_from_s3(bucket_name, image_key)
    caption = row["caption"]
    label = row["labels"]

    inputs = clip_processor(text=[caption], images=image, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
      image_embedding = clip_model.get_image_features(pixel_values=inputs['pixel_values']).squeeze().cpu()
      text_embedding = clip_model.get_text_features(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).squeeze().cpu()

    image_features.append(image_embedding)
    text_features.append(text_embedding)
    labels.append(label)

  else:
    print(f"Image {target_filename} not found in S3.")


image_features = torch.stack(image_features)
text_features = torch.stack(text_features)
labels = torch.tensor(labels)

print(f"Processed {len(image_features)} images and text features.")

Processed 17421 images and text features.


In [13]:
class MultimodalClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout_rate):
        super(MultimodalClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, image_embedding, text_embedding):
        x = torch.cat((image_embedding, text_embedding), dim=1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.sigmoid(self.fc2(x))
        return x


In [14]:
best_params = {'learning_rate': 0.000122235845833759, 'batch_size': 8, 'epochs': 15, 'weight_decay': 0.064616307498029, 'dropout_rate': 0.45584099758281393}

In [15]:
for k, v in best_params.items():
    print(f"{k}: {v}")

learning_rate = best_params['learning_rate']
batch_size = best_params['batch_size']
epochs = best_params['epochs']
weight_decay = best_params['weight_decay']
dropout_rate = best_params['dropout_rate']

learning_rate: 0.000122235845833759
batch_size: 8
epochs: 15
weight_decay: 0.064616307498029
dropout_rate: 0.45584099758281393


In [16]:
X_train, X_val, y_train, y_val = train_test_split(image_features, labels, test_size=0.2, random_state=42)
text_train, text_val = train_test_split(text_features, test_size=0.2, random_state=42)

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(text_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(text_val), torch.tensor(y_val))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

input_dim = image_features.shape[1]
hidden_dim = 256
model = MultimodalClassifier(input_dim=input_dim, hidden_dim=hidden_dim, dropout_rate=dropout_rate).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.BCELoss()


for epoch in range(epochs):
    print(f"Epoch {epoch}")

    model.train()
    for i, (image_batch, text_batch, label_batch) in enumerate(train_loader, start=1):
        image_batch = image_batch.to(device)
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device)

        optimizer.zero_grad()
        output = model(image_batch, text_batch).squeeze(-1)
        loss = criterion(output, label_batch.float())
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f"Images Processed {i}: Loss = {loss.item():.4f}")
        

model.eval()
val_predictions = []
val_labels = []

with torch.no_grad():
    for image_batch, text_batch, label_batch in val_loader:
        image_batch = image_batch.to(device)
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device)

        output = model(image_batch, text_batch)
        val_predictions.extend(output.cpu().numpy())
        val_labels.extend(label_batch.cpu().numpy())

val_predictions = np.array(val_predictions) > 0.5
val_acc = accuracy_score(val_labels, val_predictions)

print(f"Validation Accuracy: {val_acc}")

  train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(text_train), torch.tensor(y_train))
  val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(text_val), torch.tensor(y_val))


Epoch 0
Images Processed 100: Loss = 0.3494
Images Processed 200: Loss = 0.1392
Images Processed 300: Loss = 0.1646
Images Processed 400: Loss = 0.2167
Images Processed 500: Loss = 0.1347
Images Processed 600: Loss = 0.1806
Images Processed 700: Loss = 0.2010
Images Processed 800: Loss = 0.1603
Images Processed 900: Loss = 0.0959
Images Processed 1000: Loss = 0.1482
Images Processed 1100: Loss = 0.1151
Images Processed 1200: Loss = 0.2528
Images Processed 1300: Loss = 0.1382
Images Processed 1400: Loss = 0.0884
Images Processed 1500: Loss = 0.2391
Images Processed 1600: Loss = 0.0771
Images Processed 1700: Loss = 0.2816
Epoch 1
Images Processed 100: Loss = 0.1236
Images Processed 200: Loss = 0.1021
Images Processed 300: Loss = 0.3126
Images Processed 400: Loss = 0.1000
Images Processed 500: Loss = 0.3029
Images Processed 600: Loss = 0.0307
Images Processed 700: Loss = 0.2066
Images Processed 800: Loss = 0.1394
Images Processed 900: Loss = 0.1208
Images Processed 1000: Loss = 0.0819
Ima

In [22]:
model_save_path = "multimodal_classifier_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model state saved to {model_save_path}")


Model state saved to multimodal_classifier_model.pth


In [23]:

s3_bucket = "sagemaker-studio-619071335465-8h7owh9eftx"
s3_key = "training/models/multimodal_classifier_model.pth"
s3 = boto3.client("s3")

s3.upload_file(model_save_path, s3_bucket, s3_key)
print(f"Model uploaded to s3://{s3_bucket}/{s3_key}")


Model uploaded to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/models/multimodal_classifier_model.pth


In [24]:
model_artifact = f"s3://{s3_bucket}/{s3_key}"
print(model_artifact)

s3://sagemaker-studio-619071335465-8h7owh9eftx/training/models/multimodal_classifier_model.pth


In [25]:
model_path = 'multimodal_classifier_model.pth'
state_dict = torch.load(model_path)
print(state_dict)

OrderedDict([('fc1.weight', tensor([[ 4.1057e-03,  8.4955e-04, -3.3628e-03,  ..., -2.6028e-03,
          7.6136e-04, -2.7815e-04],
        [-2.7187e-03, -1.8087e-03,  2.2379e-03,  ...,  4.7140e-03,
          1.0096e-03,  1.6289e-04],
        [ 3.5197e-03,  5.2734e-03, -6.2515e-03,  ..., -2.2891e-03,
         -1.6100e-03, -2.6531e-03],
        ...,
        [ 5.7256e-03,  3.6784e-03, -5.2710e-03,  ..., -3.3767e-03,
         -1.5594e-04, -2.4984e-03],
        [-3.9317e-03, -5.1555e-05,  4.2796e-03,  ...,  2.0974e-03,
         -7.9344e-04,  6.9403e-04],
        [ 9.2486e-44, -8.5479e-44, -7.9874e-44,  ..., -4.6243e-44,
          8.8282e-44,  9.6690e-44]], device='cuda:0')), ('fc1.bias', tensor([ 6.6227e-03, -5.1327e-04,  6.2926e-03,  2.0024e-20,  4.5280e-03,
         5.4196e-03,  6.8448e-03, -2.5870e-17,  6.0964e-03,  3.3327e-03,
         5.8031e-03, -1.2752e-03,  4.5492e-03, -1.6060e-03,  1.1419e-05,
         5.7791e-03,  3.4429e-03,  4.7352e-03, -1.9316e-20, -3.1827e-04,
         5.1584e