In [6]:
import numpy as np
import tensorflow as tf
import pickle
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
import os
import csv

### Step 1: Data Loading and Preprocessing Class
This class is designed to handle different dataset splits (train, validation, test), read data, extract image and text features, and prepare labels.

In [7]:
# Initialize VGG16 model for feature extraction
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model = tf.keras.Model(inputs=base_model.input, outputs=base_model.output)

# Function to extract image features using VGG16
def extract_image_features(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array_expanded = np.expand_dims(img_array, axis=0)
    img_preprocessed = preprocess_input(img_array_expanded)
    features = model.predict(img_preprocessed)
    return features.flatten()

# Load sentence embeddings
def load_sentence_embeddings(sentence_embeddings_file):
    sentence_embeddings = {}
    print("READING sentence embeddings...")
    with open(sentence_embeddings_file, 'rb') as f:
        data = pickle.load(f)
        for sentence, dense_vector in data.items():
            sentence_embeddings[sentence] = dense_vector
    print("Done reading sentence_embeddings!")
    return sentence_embeddings


# Prepare dataset
def prepare_dataset(data_files, sentence_embeddings, images_path):
    image_features = []
    text_features = []
    labels = []
    
    with open(data_files) as f:
        lines = f.readlines()
        for line in lines:
            img_name, text, raw_label = line.strip().split("\t")
            img_path = os.path.join(images_path, img_name)
            
            # Extract image features
            img_feat = extract_image_features(img_path)
            
            # Get text features (sentence embeddings)
            txt_feat = sentence_embeddings.get(text, np.zeros(384))  # Adjust size as per your embeddings
            
            # Combine image and text features immediately
            combined_feat = np.concatenate((img_feat, txt_feat))
            
            # Prepare labels
            label = 1 if raw_label == "match" else 0
            
            image_features.append(combined_feat)
            labels.append(label)
    
    # Since image_features now contains combined image and text features, 
    # you don't need a separate return for text_features
    return np.array(image_features), np.array(labels)


### Step 2: Model Training and Evaluation
Now, we can use the DatasetLoader class to load the train, validation, and test datasets, extract features, and prepare the data for training and evaluating a model. Here's an example of training an SVM classifier:

# Main execution starts here
data_path = "/home/rinzler/Github/Image-Text-Matching/data"
images_path = os.path.join(data_path, "images")
train_data_file = os.path.join(data_path, "flickr8k.TrainImages.txt")
test_data_file = os.path.join(data_path, "flickr8k.TestImages.txt")
dev_data_file = os.path.join(data_path, "flickr8k.DevImages.txt")
sentence_embeddings_file = os.path.join(data_path, "flickr8k.cmp9137.sentence_transformers.pkl")

sentence_embeddings = load_sentence_embeddings(sentence_embeddings_file)

# Load and prepare each dataset
X_train, y_train = prepare_dataset(train_data_file, sentence_embeddings, images_path)
X_test, y_test = prepare_dataset(test_data_file, sentence_embeddings, images_path)
X_val, y_val = prepare_dataset(dev_data_file, sentence_embeddings, images_path)

# Train SVM classifier on training data
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluate classifier on test data
predictions = svm_classifier.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, predictions))
print("Test Classification Report:\n", classification_report(y_test, predictions))

# Optionally, evaluate on validation data
val_predictions = svm_classifier.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
print("Validation Classification Report:\n", classification_report(y_val, val_predictions))


In [8]:
from sklearn.model_selection import train_test_split

# Function to sample a subset of the dataset
def sample_dataset(X, y, sample_size=0.01):
    """
    This function samples a subset of the dataset.
    
    Parameters:
    - X: Features
    - y: Labels
    - sample_size: The fraction of the dataset to sample (0.1 for 10%)
    
    Returns:
    - X_sample: Sampled features
    - y_sample: Sampled labels
    """
    X_sample, _, y_sample, _ = train_test_split(X, y, test_size=1-sample_size, random_state=42)
    return X_sample, y_sample

# Main execution starts here
data_path = "/home/rinzler/Github/Image-Text-Matching/data"
images_path = os.path.join(data_path, "images")
train_data_file = os.path.join(data_path, "flickr8k.TrainImages.txt")
test_data_file = os.path.join(data_path, "flickr8k.TestImages.txt")
dev_data_file = os.path.join(data_path, "flickr8k.DevImages.txt")
sentence_embeddings_file = os.path.join(data_path, "flickr8k.cmp9137.sentence_transformers.pkl")

sentence_embeddings = load_sentence_embeddings(sentence_embeddings_file)

# Load, prepare, and sample each dataset
X_train_full, y_train_full = prepare_dataset(train_data_file, sentence_embeddings, images_path)
X_train, y_train = sample_dataset(X_train_full, y_train_full, sample_size=0.01)

X_test_full, y_test_full = prepare_dataset(test_data_file, sentence_embeddings, images_path)
X_test, y_test = sample_dataset(X_test_full, y_test_full, sample_size=0.01)

X_val_full, y_val_full = prepare_dataset(dev_data_file, sentence_embeddings, images_path)
X_val, y_val = sample_dataset(X_val_full, y_val_full, sample_size=0.01)

# Continue with training and evaluation as before
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluate classifier on test data
predictions = svm_classifier.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, predictions))
print("Test Classification Report:\n", classification_report(y_test, predictions))

# Optionally, evaluate on validation data
val_predictions = svm_classifier.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
print("Validation Classification Report:\n", classification_report(y_val, val_predictions))


READING sentence embeddings...
Done reading sentence_embeddings!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

KeyboardInterrupt: 