<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/baseline_image_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
import os
import torch
import numpy as np
import pandas as pd
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
directory = '/content/drive/My Drive/Capstone Data Collection/Image Datasets'

dataframes = {}

for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        df = pd.read_csv(os.path.join(directory, filename))
        dataframes[filename] = df

for key, df in dataframes.items():
    print(f"{key}: {df.shape}")

Images_to_Text_Results.csv: (150, 3)
highly_rated.csv: (3760, 21)
middle_rated.csv: (918, 21)
df_upscale_photos.csv: (796, 18)
lower_rated.csv: (333, 21)


In [8]:
df_captions = dataframes['Images_to_Text_Results.csv']
df_captions.head()

Unnamed: 0,photo_id,model_name,caption
0,Ax5PLwfU94uEXMafFdXrtw.jpg,nlpconnect/vit-gpt2-image-captioning,a sandwich with meat and cheese on a cutting b...
1,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-large,a close up of a sandwich on a piece of paper o...
2,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-base,"a hamburger with cheese, bacon and cheese on it"
3,bFNqVruIW3AXjgSuLHq4kg.jpg,nlpconnect/vit-gpt2-image-captioning,a person holding a sandwich in their hand
4,bFNqVruIW3AXjgSuLHq4kg.jpg,Salesforce/blip-image-captioning-large,someone is grabbing a sausage patty out of a b...


In [9]:
df_list = ['highly_rated.csv', 'middle_rated.csv', 'lower_rated.csv', 'df_upscale_photos.csv']

df_photos = []

for i in df_list:
  if i in dataframes:
    df = dataframes[i]
    df_photos.append(df)

df_concat = pd.concat(df_photos, ignore_index=True)
df_concat.shape

(5807, 21)

In [23]:
df_sliced = df_concat[['photo_id', 'label']]
df_sliced.head()

Unnamed: 0,photo_id,label
0,n6pTHg6JLgnJYHuLKwhLfw,fast food
1,UFi0lSnl8ebMtV29e3CcLg,fast food
2,4hvq-NAWZi6P1shkrzwEmQ,fast food
3,E0z6fJvqIrSR7mWSpG0opA,fast food
4,2QeWFyZjy9B5PSEVbzDkqg,fast food


In [24]:
df_captions['photo_id'] = df_captions['photo_id'].str.replace('.jpg', '', regex=False)

In [25]:
df_merged = df_captions.merge(df_sliced, how='left', on='photo_id')
df_merged.shape

(150, 4)

In [26]:
df_merged.head()

Unnamed: 0,photo_id,model_name,caption,label
0,Ax5PLwfU94uEXMafFdXrtw,nlpconnect/vit-gpt2-image-captioning,a sandwich with meat and cheese on a cutting b...,fast food
1,Ax5PLwfU94uEXMafFdXrtw,Salesforce/blip-image-captioning-large,a close up of a sandwich on a piece of paper o...,fast food
2,Ax5PLwfU94uEXMafFdXrtw,Salesforce/blip-image-captioning-base,"a hamburger with cheese, bacon and cheese on it",fast food
3,bFNqVruIW3AXjgSuLHq4kg,nlpconnect/vit-gpt2-image-captioning,a person holding a sandwich in their hand,fast food
4,bFNqVruIW3AXjgSuLHq4kg,Salesforce/blip-image-captioning-large,someone is grabbing a sausage patty out of a b...,fast food


In [27]:
df_merged['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
fast food,75
fine dining,75


In [28]:
df_merged['photo_id'] = df_merged['photo_id'] + '.jpg'
df_merged.head()

Unnamed: 0,photo_id,model_name,caption,label
0,Ax5PLwfU94uEXMafFdXrtw.jpg,nlpconnect/vit-gpt2-image-captioning,a sandwich with meat and cheese on a cutting b...,fast food
1,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-large,a close up of a sandwich on a piece of paper o...,fast food
2,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-base,"a hamburger with cheese, bacon and cheese on it",fast food
3,bFNqVruIW3AXjgSuLHq4kg.jpg,nlpconnect/vit-gpt2-image-captioning,a person holding a sandwich in their hand,fast food
4,bFNqVruIW3AXjgSuLHq4kg.jpg,Salesforce/blip-image-captioning-large,someone is grabbing a sausage patty out of a b...,fast food


In [30]:
image_dir = "/content/drive/My Drive/Capstone Data Collection/test photos"

image_paths = []

for subfolder in ['fast food', 'upscale']:
    subfolder_path = os.path.join(image_dir, subfolder)

    image_paths.extend([os.path.join(subfolder_path, img) for img in os.listdir(subfolder_path) if img.endswith(".jpg")])

print(len(image_paths))

50


In [35]:
df_merged.head()

Unnamed: 0,photo_id,model_name,caption,label
0,Ax5PLwfU94uEXMafFdXrtw.jpg,nlpconnect/vit-gpt2-image-captioning,a sandwich with meat and cheese on a cutting b...,fast food
1,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-large,a close up of a sandwich on a piece of paper o...,fast food
2,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-base,"a hamburger with cheese, bacon and cheese on it",fast food
3,bFNqVruIW3AXjgSuLHq4kg.jpg,nlpconnect/vit-gpt2-image-captioning,a person holding a sandwich in their hand,fast food
4,bFNqVruIW3AXjgSuLHq4kg.jpg,Salesforce/blip-image-captioning-large,someone is grabbing a sausage patty out of a b...,fast food


In [37]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

features = []
labels = []

for idx, row in df_merged.iterrows():
    target_filename = row["photo_id"] if row["photo_id"].endswith(".jpg") else f"{row['photo_id']}.jpg"

    image_path = next((path for path in image_paths if os.path.basename(path) == target_filename), None)

    if image_path:
        try:
            image = Image.open(image_path).convert("RGB")
            caption = row["caption"]

            inputs = clip_processor(text=[caption], images=image, return_tensors="pt", padding=True).to(device)
            outputs = clip_model(**inputs)
            text_features = outputs.text_embeds
            image_features = outputs.image_embeds

            features.append((text_features.cpu().detach().numpy().flatten(), image_features.cpu().detach().numpy().flatten()))
            labels.append(1 if row["label"] == "fine dining" else 0)

        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
    else:
        print(f"Warning: Image {row['photo_id']}.jpg not found in any provided directories.")

X = pd.DataFrame(features)
y = pd.Series(labels)



In [42]:
flattened_features = [np.concatenate((text_feat, image_feat)) for text_feat, image_feat in features]
X = np.array(flattened_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=["Fast Food", "Fine Dining"]))


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

   Fast Food       1.00      1.00      1.00        16
 Fine Dining       1.00      1.00      1.00        14

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

