<a href="https://colab.research.google.com/github/SumaiyaZohaRODELA/Inception_Train_Test/blob/main/inception_train_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-gpu

import os
from glob import glob
from PIL import Image
import numpy as np
import faiss
import torch
import torchvision.transforms as transforms
from torchvision.models import inception_v3
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split

# Download and unzip dataset
!wget -O data.zip "https://drive.google.com/uc?export=download&id=1q3dpti5aX4LdD3Mq7bZ4rjTeZbQEljiy"
!unzip "data.zip"

# Folder containing images
image_folder = '/content/image_dataset'

# Get a list of all image files in the folder
image_files = glob(os.path.join(image_folder, '*.jpg'))

# Split the dataset into train and test sets
train_files, test_files = train_test_split(image_files, test_size=0.2, random_state=42)
print(f"Train images: {len(train_files)}, Test images: {len(test_files)}")


# Function to generate embeddings using Inception model
def generate_inception_embeddings(image_paths, model, preprocess, device):
    """Run every image through ``model`` and stack the flattened outputs.

    Args:
        image_paths: Iterable of image file paths, processed in order.
        model: Network called on one preprocessed image at a time; it is
            switched to evaluation mode before the first forward pass.
        preprocess: Callable that turns an RGB PIL image into a tensor.
        device: Device the preprocessed tensor is moved to before inference.

    Returns:
        A float32 NumPy array with one flattened model output per input path.
    """
    model.eval()

    def _embed(path):
        # Load as RGB, add the batch axis, then flatten the model output
        rgb = Image.open(path).convert('RGB')
        batch = preprocess(rgb).unsqueeze(0).to(device)
        return model(batch).cpu().numpy().flatten()

    # Inference only: gradient tracking is switched off for the whole pass
    with torch.no_grad():
        vectors = [_embed(path) for path in image_paths]

    return np.array(vectors, dtype=np.float32)


# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained Inception v3 whose final fully connected layer is replaced by a
# pass-through, so a forward pass returns features instead of class scores
model = inception_v3(pretrained=True, transform_input=True)
model.fc = torch.nn.Identity()
model = model.to(device)

# Image preprocessing: fixed 299x299 resize, tensor conversion, then
# per-channel normalisation with the mean/std values given below
preprocess = transforms.Compose([
    transforms.Resize(size=(299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Embed the train split first, then the test split
train_embeddings, test_embeddings = (
    generate_inception_embeddings(split_files, model, preprocess, device)
    for split_files in (train_files, test_files)
)


# Function to create FAISS index
def create_faiss_index(embeddings, image_paths):
    """Build an exact inner-product FAISS index keyed by position in ``image_paths``.

    Each row ``i`` of ``embeddings`` is stored under id ``i``, so a search hit
    can be mapped back to ``image_paths[i]``.

    Args:
        embeddings: 2-D array-like with one row per image; it is converted to a
            C-contiguous float32 array before being added.
        image_paths: Sequence of paths aligned with the rows of ``embeddings``.

    Returns:
        A ``faiss.IndexIDMap`` wrapping a ``faiss.IndexFlatIP``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its row count differs from
            ``len(image_paths)``.

    NOTE(review): IndexFlatIP ranks by raw inner product; that matches cosine
    similarity only when the stored vectors are L2-normalised. Confirm whether
    unnormalised embeddings are intended here.
    """
    # FAISS expects contiguous float32 input; convert once up front.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
        )
    if vectors.shape[0] != len(image_paths):
        raise ValueError(
            f"got {vectors.shape[0]} embeddings for {len(image_paths)} image paths"
        )

    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIDMap(index)

    # Ids must be int64; np.array(range(...)) would use the platform default
    # integer type, which is not int64 everywhere.
    ids = np.arange(len(image_paths), dtype=np.int64)
    index.add_with_ids(vectors, ids)

    return index


# Index the train-set embeddings; ids follow the order of train_files
train_index = create_faiss_index(
    embeddings=train_embeddings,
    image_paths=train_files,
)


# Function to retrieve similar images
def retrieve_similar_images(query_path, preprocess, model, index, image_paths, top_k=3, device=None):
    """Return the paths of the indexed images most similar to a query image.

    Args:
        query_path: Path of the query image file.
        preprocess: Callable that turns an RGB PIL image into a tensor.
        model: Embedding network; it is switched to evaluation mode.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, ids)``, with ids that are positions in ``image_paths``.
        image_paths: Sequence of paths the index ids refer to.
        top_k: Number of neighbours requested from the index.
        device: Device for the query tensor. When ``None`` it is taken from the
            model's first parameter (CPU if the model has none), so the
            function no longer depends on a notebook-level ``device`` global.

    Returns:
        A list of at most ``top_k`` paths, best match first.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    # Close the file handle as soon as the pixel data has been copied
    with Image.open(query_path) as raw_image:
        query_image = raw_image.convert('RGB')
    query_tensor = preprocess(query_image).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        query_features = model(query_tensor).cpu().numpy().astype(np.float32).reshape(1, -1)

    distances, indices = index.search(query_features, top_k)

    # FAISS pads the result with id -1 when fewer than top_k vectors are
    # available; a negative id would otherwise index image_paths from the end
    # and return an unrelated image.
    retrieved_images = [image_paths[int(idx)] for idx in indices[0] if idx >= 0]

    return retrieved_images


# Function to calculate top-k accuracy
def calculate_accuracy(test_files, train_index, train_files, model, preprocess, top_k=3):
    """Compute and print top-k retrieval accuracy over the test images.

    A query counts as correct when at least one of its ``top_k`` retrieved
    train images has the same category. The category of a file is the part of
    its base name before the first ``'-'`` (e.g. ``"pexels-cat.jpg"`` ->
    ``"pexels"``); folders are not consulted.

    NOTE(review): if every file name shares one prefix, all images fall into
    one category and the score is trivially 1.0 — confirm the naming scheme.

    Args:
        test_files: Paths of the query images.
        train_index: Search index built over the train embeddings.
        train_files: Paths the index ids refer to.
        model: Embedding network passed through to the retrieval helper.
        preprocess: Image preprocessing callable passed through as well.
        top_k: Number of neighbours retrieved per query.

    Returns:
        The fraction of queries with at least one same-category hit, or ``0.0``
        when ``test_files`` is empty.
    """
    # Guard against dividing by zero on an empty test split
    if len(test_files) == 0:
        print(f"Top-{top_k} accuracy: no test files given, returning 0.0")
        return 0.0

    correct_matches = 0

    for test_path in test_files:
        true_category = os.path.basename(test_path).split('-')[0]

        retrieved_images = retrieve_similar_images(test_path, preprocess, model, train_index, train_files, top_k)

        # One same-category hit is enough; any() stops at the first match
        if any(os.path.basename(img).split('-')[0] == true_category for img in retrieved_images):
            correct_matches += 1

    accuracy = correct_matches / len(test_files)
    print(f"Top-{top_k} accuracy: {accuracy:.2f}")
    return accuracy


# Score the test queries against the train index (top-3)
calculate_accuracy(
    test_files=test_files,
    train_index=train_index,
    train_files=train_files,
    model=model,
    preprocess=preprocess,
    top_k=3,
)


[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

ModuleNotFoundError: No module named 'faiss'

In [None]:
# Report the size of each split, test first and then train
print(
    f"Test files: {len(test_files)}",
    f"Train files: {len(train_files)}",
    sep="\n",
)

In [None]:
from sklearn.model_selection import train_test_split

# Location of the unpacked dataset and every JPEG directly inside it
image_folder = '/content/image_dataset'
image_files = glob(os.path.join(image_folder, '*.jpg'))

# Stop here if the folder turned out to be empty
assert image_files, "No images found in the dataset folder!"

# 80/20 train/test partition with a fixed seed
train_files, test_files = train_test_split(image_files, random_state=42, test_size=0.2)

# Debugging: report the size of each split
print(f"Train images: {len(train_files)}, Test images: {len(test_files)}")


In [None]:
# Function to calculate recall per query
def calculate_recall(test_files, train_index, train_files, model, preprocess, top_k=3):
    """Compute per-query and mean recall of the top-k retrieved images.

    The category of a file is the part of its base name before the first
    ``'-'`` (e.g. ``"pexels-cat.jpg"`` -> ``"pexels"``). For each query, recall
    is the number of same-category images among the ``top_k`` results divided
    by the number of same-category images in ``train_files``; it is ``0`` when
    the train set has no image of that category. Each query's recall is
    printed, followed by the mean.

    Args:
        test_files: Paths of the query images.
        train_index: Search index built over the train embeddings.
        train_files: Paths the index ids refer to.
        model: Embedding network passed through to the retrieval helper.
        preprocess: Image preprocessing callable passed through as well.
        top_k: Number of neighbours retrieved per query.

    Returns:
        ``(recalls, mean_recall)``: the per-query recall list and its mean.
        For an empty ``test_files`` this is ``([], 0.0)``.
    """
    # Guard against dividing by zero on an empty test split
    if len(test_files) == 0:
        print("Mean Recall across all queries: no test files given, returning 0.0")
        return [], 0.0

    # Count train images per category once, instead of rescanning train_files
    # for every query.
    relevant_per_category = {}
    for img in train_files:
        category = os.path.basename(img).split('-')[0]
        relevant_per_category[category] = relevant_per_category.get(category, 0) + 1

    recalls = []  # Recall for each query, in test_files order

    for test_path in test_files:
        true_category = os.path.basename(test_path).split('-')[0]
        total_relevant = relevant_per_category.get(true_category, 0)

        # Retrieve top-k similar images
        retrieved_images = retrieve_similar_images(test_path, preprocess, model, train_index, train_files, top_k)

        # Count relevant images in the top-k retrieved results
        retrieved_relevant_count = sum(1 for img in retrieved_images if os.path.basename(img).split('-')[0] == true_category)

        # Recall is defined as 0 when the train set holds no relevant image
        recall = retrieved_relevant_count / total_relevant if total_relevant > 0 else 0
        recalls.append(recall)

        print(f"Query: {os.path.basename(test_path)} | Recall: {recall:.2f}")

    # Average recall across all queries
    mean_recall = sum(recalls) / len(recalls)
    print(f"Mean Recall across all queries: {mean_recall:.2f}")
    return recalls, mean_recall


# Example usage: per-query and mean recall for the top-3 results
recalls, mean_recall = calculate_recall(
    test_files=test_files,
    train_index=train_index,
    train_files=train_files,
    model=model,
    preprocess=preprocess,
    top_k=3,
)


In [None]:
import os
from glob import glob
from sklearn.model_selection import train_test_split

# Dataset location and the JPEGs found directly inside it
image_folder = '/content/image_dataset'
image_files = glob(os.path.join(image_folder, '*.jpg'))

# Refuse to continue with an empty dataset
assert image_files, "No images found in the dataset folder!"

# Fixed-seed 80/20 train/test split
train_files, test_files = train_test_split(image_files, random_state=42, test_size=0.2)

# Debugging: split sizes plus the first path from each side
print(f"Train images: {len(train_files)}, Test images: {len(test_files)}")
print(f"Example train file: {train_files[0]}")
print(f"Example test file: {test_files[0]}")

# Rest of the recall and accuracy calculation code goes here...
