In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from torchvision import transforms, models
from torchvision.models import ConvNeXt_Tiny_Weights
import torch.nn as nn

# 1. Data Loading Functions
def load_labeled_data(file_path):
    """Read a saved dictionary with ``torch.load`` and return its samples and labels.

    Args:
        file_path: Path of the file to read (labeled data, e.g. D1).

    Returns:
        A ``(data, targets)`` tuple taken from the ``'data'`` and ``'targets'``
        entries of the loaded dictionary.
    """
    checkpoint = torch.load(file_path)
    samples = checkpoint['data']
    labels = checkpoint['targets']
    return samples, labels

def load_unlabeled_data(file_path):
    """Read a saved dictionary with ``torch.load`` and return only its samples.

    Args:
        file_path: Path of the file to read (unlabeled data, e.g. D2 to D10).

    Returns:
        The ``'data'`` entry of the loaded dictionary; any other entries are ignored.
    """
    checkpoint = torch.load(file_path)
    samples = checkpoint['data']
    return samples

# 2. Image Preprocessing
# Per-channel statistics used for normalization (ImageNet stats).
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Resize every image to 224x224, then normalize each channel.
transform_pipeline = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

def preprocess_images(images):
    """Apply ``transform_pipeline`` to every image and stack the results.

    Each image is divided by 255.0 before being passed through the pipeline.

    Args:
        images: Iterable of image tensors.
            # NOTE(review): presumably channel-first with 0-255 pixel values — confirm.

    Returns:
        A single tensor produced by ``torch.stack`` over the transformed images.
    """
    transformed = []
    for image in images:
        scaled = image / 255.0
        transformed.append(transform_pipeline(scaled))
    return torch.stack(transformed)

# 3. Feature Extraction Model
class FeatureExtractor(nn.Module):
    """ConvNeXt-Tiny backbone (ImageNet-1K weights) used as an embedding model.

    The last two child modules of the torchvision model are dropped and the
    remaining ones are chained in an ``nn.Sequential``.
    """

    def __init__(self):
        super().__init__()
        backbone = models.convnext_tiny(weights=ConvNeXt_Tiny_Weights.IMAGENET1K_V1)
        # Keep every top-level child except the final two.
        kept_children = list(backbone.children())[:-2]
        self.feature_extractor = nn.Sequential(*kept_children)

    def forward(self, x):
        """Run the truncated backbone and flatten everything after the batch dimension."""
        feature_map = self.feature_extractor(x)
        return feature_map.flatten(start_dim=1)

# Module-level feature extractor instance; .eval() switches it to evaluation mode.
embedder_model = FeatureExtractor().eval()

# 4. Embedding Extraction
def get_embeddings(dataset, embedder_model, batch_size=32):
    """Extract embeddings from a dataset using the given feature extractor.

    The model is moved to CUDA when available (CPU otherwise). The dataset is
    processed in slices of ``batch_size`` and only the current slice is moved
    to the device, so device memory use is bounded by the batch size rather
    than by the size of the whole dataset.

    Args:
        dataset: Tensor of preprocessed inputs, indexed along its first dimension.
        embedder_model: ``nn.Module`` mapping a batch of inputs to a batch of embeddings.
        batch_size: Number of samples per forward pass.

    Returns:
        NumPy array with the embeddings of all batches stacked vertically,
        in dataset order.

    Raises:
        ValueError: If ``dataset`` is empty (``np.vstack`` receives no arrays).
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embedder_model = embedder_model.to(device)

    embeddings = []
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        for start in tqdm(range(0, len(dataset), batch_size), desc="Extracting Embeddings"):
            # Transfer one batch at a time instead of the whole dataset up front.
            batch = dataset[start:start + batch_size].to(device)
            embeddings.append(embedder_model(batch).cpu().numpy())
    return np.vstack(embeddings)

# 5. Prototypes-based Classification Model
class PrototypeClassifier:
    """Nearest-prototype classifier.

    ``fit`` stores the mean feature vector of every class; ``predict`` assigns
    each instance the label of the prototype with the smallest Euclidean distance.
    """

    def __init__(self):
        # Maps class label -> mean feature vector (prototype) of that class.
        self.class_prototypes = {}

    def fit(self, X, y):
        """Compute one prototype per class.

        Prototypes are rebuilt from scratch on every call, so classes from a
        previous fit that do not occur in ``y`` are not carried over.

        Args:
            X: Array-like of feature vectors, one row per sample.
            y: Array-like of class labels aligned with the rows of ``X``.
        """
        # Coerce so that list inputs get element-wise comparison / boolean indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        self.class_prototypes = {
            label: np.mean(X[y == label], axis=0) for label in np.unique(y)
        }

    def predict(self, X):
        """Predict the class of each instance by finding the closest prototype.

        Args:
            X: Array-like of feature vectors, one row per instance.

        Returns:
            NumPy array with one predicted label per instance (empty for empty ``X``).

        Raises:
            ValueError: If ``X`` is non-empty and no prototypes have been fitted.
        """
        X = np.asarray(X)
        if len(X) == 0:
            return np.array([])
        if not self.class_prototypes:
            raise ValueError("predict() called before fit(): no class prototypes available")

        # Build the label list and prototype matrix once instead of per instance.
        labels = list(self.class_prototypes)
        prototype_matrix = np.stack(
            [np.ravel(self.class_prototypes[label]) for label in labels]
        )

        predictions = []
        for instance in X:
            distances = np.linalg.norm(prototype_matrix - np.ravel(instance), axis=1)
            # argmin returns the first minimum, i.e. ties go to the earliest label.
            predictions.append(labels[int(np.argmin(distances))])
        return np.array(predictions)

# 6. Function to Update Model with Pseudo-Labels
def update_classifier_with_pseudo_labels(current_classifier, new_data, batch_size=32):
    """
    Builds a new prototype classifier from pseudo-labeled data.

    The new data is embedded with the module-level ``embedder_model``, labeled
    by ``current_classifier``, and a fresh ``PrototypeClassifier`` is fitted on
    those embeddings and pseudo-labels.

    Args:
        current_classifier: Classifier used to produce the pseudo-labels.
        new_data: New unlabeled data.
        batch_size: Batch size for embedding extraction.

    Returns:
        The newly fitted classifier (the pseudo-labels are not returned).
    """
    # Embed once; the same embeddings serve both labeling and fitting.
    embeddings = get_embeddings(new_data, embedder_model, batch_size=batch_size)
    pseudo_labels = current_classifier.predict(embeddings)

    refreshed_classifier = PrototypeClassifier()
    refreshed_classifier.fit(embeddings, pseudo_labels)
    return refreshed_classifier

# 7. Sequential Training with Memory Efficiency
train_datasets_paths = [f"/kaggle/input/cs771-mp2/dataset/part_one_dataset/train_data/{i}_train_data.tar.pth" for i in range(1, 11)]
eval_datasets_paths = [f"/kaggle/input/cs771-mp2/dataset/part_one_dataset/eval_data/{i}_eval_data.tar.pth" for i in range(1, 11)]

# One entry per trained model (f1, f2, ...); row i / column j of the table
# holds the accuracy of model f(i+1) on held-out set D̂(j+1).
trained_models = []
accuracy_table = np.zeros((20, 20))

# Train f1 on labeled D1, then f2..f10 on pseudo-labeled D2..D10.
# After each model is trained it is evaluated on held-out sets D̂1..D̂(i+1).
for i in range(10):
    print(f"Training model f{i+1}...")

    if i == 0:
        # f1: the only model fitted on ground-truth labels.
        train_data, train_labels = load_labeled_data(train_datasets_paths[i])
        train_data = preprocess_images(torch.tensor(train_data).permute(0, 3, 1, 2))
        train_labels = torch.tensor(train_labels)

        train_embeddings = get_embeddings(train_data, embedder_model)
        model_f1 = PrototypeClassifier()
        model_f1.fit(train_embeddings, train_labels.numpy())
        trained_models.append(model_f1)
    else:
        # f2..f10: pseudo-label the new data with the previous model.
        train_data = load_unlabeled_data(train_datasets_paths[i])
        train_data = preprocess_images(torch.tensor(train_data).permute(0, 3, 1, 2))

        current_model = trained_models[-1]
        updated_model = update_classifier_with_pseudo_labels(current_model, train_data)
        trained_models.append(updated_model)

    # Evaluate the model that was just added.
    latest_model = trained_models[-1]
    for j, eval_file in enumerate(eval_datasets_paths[:i + 1]):
        eval_data, eval_labels = load_labeled_data(eval_file)
        eval_data = preprocess_images(torch.tensor(eval_data).permute(0, 3, 1, 2))
        eval_embeddings = get_embeddings(eval_data, embedder_model)
        eval_labels = torch.tensor(eval_labels)

        predictions = latest_model.predict(eval_embeddings)
        accuracy = accuracy_score(eval_labels.numpy(), predictions)
        accuracy_table[i, j] = accuracy
        print(f"Accuracy of f{i+1} on D̂{j+1}: {accuracy:.4f}")


# Task 2 - Updated Code for f11 to f20
# File names for the part-two train / eval datasets (indices 1..10).
# NOTE(review): unlike the part-one lists, these are bare relative file names
# with no directory prefix, so they resolve against the current working
# directory — confirm that this is where the part-two files live.
train_datasets_part_two = [f"{i}_train_data.tar.pth" for i in range(1, 11)]
eval_datasets_part_two = [f"{i}_eval_data.tar.pth" for i in range(1, 11)]

# Load data for part two
def load_part_two_data(file_path):
    """Read a part-two dataset file (D11 to D20) and return only its samples.

    Args:
        file_path: Path of the file to read with ``torch.load``.

    Returns:
        The ``'data'`` entry of the loaded dictionary; labels are not returned.
    """
    checkpoint = torch.load(file_path)
    samples = checkpoint['data']
    return samples

# Combine part one and part two datasets
combined_train_datasets = train_datasets_paths + train_datasets_part_two
combined_eval_datasets = eval_datasets_paths + eval_datasets_part_two

# Row k / column j holds the accuracy of model f(11+k) on held-out set D̂(j+1).
accuracy_table_part_two = np.zeros((10, 20))

# Train models f11 to f20
for i in range(10, 20):
    print(f"Training model f{i+1}...")

    # Load unlabeled data D11 to D20
    train_data = load_unlabeled_data(combined_train_datasets[i])
    train_data = preprocess_images(torch.tensor(train_data).permute(0, 3, 1, 2))

    # Embed the new data once; the same embeddings are used both to obtain
    # pseudo-labels and to fit the new model (avoids a second full forward pass).
    part_two_embeddings = get_embeddings(train_data, embedder_model)

    # Get pseudo-labels for the new data from the most recent model
    current_model = trained_models[-1]
    pseudo_labels = current_model.predict(part_two_embeddings)

    # Train the new model using pseudo-labeled data
    updated_model = PrototypeClassifier()
    updated_model.fit(part_two_embeddings, pseudo_labels)
    trained_models.append(updated_model)

    # Evaluate the model on all held-out datasets seen so far (D̂1..D̂(i+1))
    for j, eval_file in enumerate(combined_eval_datasets[:i + 1]):
        eval_data, eval_labels = load_labeled_data(eval_file)
        eval_data = preprocess_images(torch.tensor(eval_data).permute(0, 3, 1, 2))
        eval_embeddings = get_embeddings(eval_data, embedder_model)
        eval_labels = torch.tensor(eval_labels)

        predictions = updated_model.predict(eval_embeddings)
        accuracy = accuracy_score(eval_labels.numpy(), predictions)
        accuracy_table_part_two[i-10, j] = accuracy
        print(f"Accuracy of f{i+1} on D̂{j+1}: {accuracy:.4f}")

# Print final accuracy table for models f11 to f20
print("Final Accuracy Table (Models vs Held-out Datasets for Part Two):")
print(accuracy_table_part_two)


#TASK 2
Transfer learning: Using the pre-trained ConvNeXt model for feature extraction is a form of transfer learning. The knowledge the model learned on its initial dataset (ImageNet) is reused, through its feature representations, for the specific task at hand. This reuse improves the classifier's ability to generalize to new, unlabeled data (represented here by datasets D2 to D10).