In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used later
# in this notebook all live under that mount point (Colab-only cell).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
from pathlib import Path
import random
from collections import defaultdict
import matplotlib.pyplot as plt
from torchvision.transforms.functional import to_pil_image
import os

class Flickr8kDataset(Dataset):
    """Image/caption dataset backed by a ``name,caption`` text file.

    Every usable line of ``captions_file`` has the form
    ``<image file name>,<caption>``.  Only the first comma separates the two
    fields, so a caption may itself contain commas.  Captions that share an
    image name are grouped together; one dataset item is one image plus the
    list of all its captions.

    Args:
        captions_file: Path of the text file holding the ``name,caption`` lines.
        img_dir: Directory containing the image files named in that file.
        transform: Optional callable applied to the RGB ``PIL.Image`` before it
            is returned from ``__getitem__``.

    Attributes are documented inline in ``__init__``.
    """

    # Column header found at the top of some copies of the captions file.  It
    # must not be mistaken for an image called "image".
    _HEADER = ('image', 'caption')

    def __init__(self, captions_file, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # image file name -> list of captions, in file order
        self.img_captions = defaultdict(list)

        with open(captions_file, 'r') as file:
            for line_no, line in enumerate(file):
                parts = line.strip().split(',', 1)  # split on the first comma only
                if len(parts) != 2:
                    # Blank or malformed line: nothing to record.
                    continue
                img_name, caption = parts
                if line_no == 0 and (img_name, caption) == self._HEADER:
                    # Skip the header row instead of registering a bogus image.
                    continue
                self.img_captions[img_name].append(caption)

        # Stable, index-addressable list of image names (first-seen order).
        self.img_names = list(self.img_captions)

    def __len__(self):
        """Return the number of distinct images."""
        return len(self.img_names)

    def __getitem__(self, idx):
        """Return ``(image, captions)`` for the ``idx``-th image.

        The image is opened from ``img_dir``, converted to RGB and passed
        through ``transform`` when one was supplied; ``captions`` is the list
        of every caption recorded for that image.
        """
        img_name = self.img_names[idx]
        captions = self.img_captions[img_name]
        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, captions


In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
import json


# Hugging Face Hub checkpoints for the two captioners loaded below.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# BLIP: the processor and the generation model come from the same checkpoint.
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# ViT-GPT2 captioner, wrapped in a ready-made image-to-text pipeline.
gpt2_caption_pipeline = pipeline(task="image-to-text", model=VIT_GPT2_CHECKPOINT)


def generate_caption_with_blip(image_pil, processor, model):
    """Caption a single image with a BLIP processor/model pair.

    Args:
        image_pil: The image to describe, handed to ``processor`` unchanged.
        processor: Callable that turns the image into model inputs
            (requested as PyTorch tensors) and that can ``decode`` token ids.
        model: Object whose ``generate`` method produces the token sequences.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    model_inputs = processor(images=image_pil, return_tensors="pt")
    generation = model.generate(
        **model_inputs,
        max_length=128,
        num_beams=4,
        return_dict_in_generate=True,
    )
    # Only the first returned sequence is turned into text.
    first_sequence = generation.sequences[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

def generate_caption_with_gpt2(image_pil, caption_pipeline=None):
    """Caption a single image with an image-to-text pipeline.

    Args:
        image_pil: The image to describe, passed to the pipeline unchanged.
        caption_pipeline: Optional callable taking the image and returning a
            list of dicts with a ``'generated_text'`` entry.  When omitted,
            the module-level ``gpt2_caption_pipeline`` is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        The ``'generated_text'`` of the first result, or the literal string
        ``"Caption generation failed"`` when the pipeline returns nothing.
    """
    if caption_pipeline is None:
        # Fall back to the notebook-wide pipeline created at load time.
        caption_pipeline = gpt2_caption_pipeline

    result = caption_pipeline(image_pil)
    if not result:
        return "Caption generation failed"
    return result[0]['generated_text']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



In [None]:
import torch
import numpy as np
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torch.utils.data import DataLoader
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Pretrained ResNet-50 used as a frozen image encoder (eval mode, no gradients).
# NOTE(review): `model(images)` below is the output of the full network, final
# classification layer included, not the pooled penultimate activations --
# confirm this is the representation the downstream classifiers should see.
model = resnet50(pretrained=True)
model.eval()

# Preprocessing applied to every image before it reaches the network: resize,
# centre-crop to 224x224, convert to a tensor and normalise per channel
# (presumably the ImageNet mean/std expected by the pretrained weights).
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Inputs and limits for this run, gathered in one place so they are easy to tune.
CAPTIONS_FILE = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/captions.txt'
IMAGES_DIR = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/Images'
MAX_IMAGES = 1100       # only the first MAX_IMAGES images are encoded
PROGRESS_EVERY = 100    # print a progress line every this many images

# shuffle=False keeps the images in dataset order, one image per batch.
dataset = Flickr8kDataset(CAPTIONS_FILE, IMAGES_DIR, transform)
data_loader = DataLoader(dataset, batch_size=1, shuffle=False)

# One flattened network output per image, collected in iteration order.
feature_rows = []
with torch.no_grad():
    for count, (images, _captions) in enumerate(data_loader, start=1):
        feature_rows.append(model(images).numpy().flatten())
        if count % PROGRESS_EVERY == 0:
            # Periodic progress instead of one printed line per image.
            print(f"Encoded {count} images")
        if count >= MAX_IMAGES:
            break

# Stack the per-image vectors into a single 2-D array (one row per image).
features = np.array(feature_rows)


Output hidden; open in https://colab.research.google.com to view.

In [None]:
import json

# Binary targets, one per entry of the JSON file below; filled by the loop
# further down in this cell and converted to a NumPy array at the end.
labels = []

# Path of the JSON file that holds, per image, the generated captions and the
# reference captions used to derive the labels.
file_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/combined_captions_data_flickr_final.json'

def calculate_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two texts, so the
    vocabulary and IDF weights are specific to this pair.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or ``0.0`` when
        neither text contributes a single token to the vocabulary (for
        example two empty strings).
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError as err:
        # scikit-learn raises "empty vocabulary; ..." when no token survives
        # tokenisation.  Two texts without any terms share nothing, so report
        # zero similarity; any other ValueError is a real problem.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def calculate_average_cosine_similarity(generated_caption, original_captions):
    """Average the similarity of one caption against several references.

    Args:
        generated_caption: The caption to score.
        original_captions: Iterable of reference captions it is compared with,
            one pairwise ``calculate_cosine_similarity`` call per reference.

    Returns:
        The arithmetic mean of the pairwise similarities, or ``0.0`` when
        there are no references (instead of failing with a division by zero).
    """
    similarities = [
        calculate_cosine_similarity(generated_caption, reference)
        for reference in original_captions
    ]
    if not similarities:
        # Nothing to compare against: treat as "no similarity".
        return 0.0
    return sum(similarities) / len(similarities)

# Read the per-image caption records from disk.
with open(file_path, 'r') as json_file:
    data = json.load(json_file)

# One label per record, in the file's own order: 1 when the BLIP caption is, on
# average, strictly closer to the reference captions than the GPT-2 caption,
# otherwise 0 (ties included).
for entry in data.values():
    generated = entry['generated_captions']

    blip_avg_cos_sim = calculate_average_cosine_similarity(generated['blip'], entry['original_coco_captions'])
    gpt2_avg_cos_sim = calculate_average_cosine_similarity(generated['gpt2'], entry['original_coco_captions'])

    labels.append(int(blip_avg_cos_sim > gpt2_avg_cos_sim))

labels = np.array(labels)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fully connected network: three ReLU hidden layers that narrow towards a
# single sigmoid unit for the binary decision.
n_inputs = X_train.shape[1]
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_inputs,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Binary cross-entropy loss, Adam optimiser, accuracy reported during training.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training portion only.
history = model.fit(X_train, y_train, epochs=20, batch_size=32)

# Score on the held-out portion.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.6045


In [None]:
# Persist whatever `model` is currently bound to (in run order, the Keras
# Sequential network trained in the previous cell) to Google Drive as an .h5 file.
# NOTE(review): this path spells the mount as `MyDrive`, while the dataset paths
# elsewhere in the notebook use `My Drive` -- confirm both resolve to the same place.
model.save('/content/drive/MyDrive/MasterThesis/model_ann_flickr.h5')

  saving_api.save_model(


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Relies on `features` and `labels` from the earlier cells.
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it in one step (`fit` returns the estimator).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = rf_classifier.predict(X_test)

# Overall accuracy followed by the per-class precision/recall breakdown.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.6591
              precision    recall  f1-score   support

           0       0.30      0.04      0.07        71
           1       0.68      0.95      0.79       149

    accuracy                           0.66       220
   macro avg       0.49      0.50      0.43       220
weighted avg       0.55      0.66      0.56       220



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler learns its statistics from the training
# portion only and is then applied to both portions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVM; other kernels such as 'rbf' can be tried here.
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)
svm_classifier.fit(X_train_scaled, y_train)

# Predictions for the held-out samples.
y_pred = svm_classifier.predict(X_test_scaled)

# Overall accuracy followed by the per-class precision/recall breakdown.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))



Test Accuracy: 0.6227
              precision    recall  f1-score   support

           0       0.41      0.38      0.39        71
           1       0.71      0.74      0.73       149

    accuracy                           0.62       220
   macro avg       0.56      0.56      0.56       220
weighted avg       0.62      0.62      0.62       220



In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 values of C x 3 values of gamma x 3 kernels = 27 candidates.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[1, 0.1, 0.01],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# 5-fold cross-validated search on the scaled training data; with refit=True
# the best configuration is retrained on the whole training set afterwards.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=2,
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_svm_model = grid_search.best_estimator_

# Score the refitted best estimator on the held-out, scaled test data.
y_pred_best = best_svm_model.predict(X_test_scaled)
print(f"Test Accuracy (Best Model): {accuracy_score(y_test, y_pred_best):.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.2s
[CV] END .....................C=0.1, gamma=1, k