In [10]:
import os
import torch
import sys
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.transforms import functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.io import read_image
import time
import json
from tqdm import tqdm
from torch.utils.data import random_split
import pandas as pd
import matplotlib.pyplot as plt
from torchvision.models.resnet import ResNet50_Weights
import shutil
from sklearn.model_selection import train_test_split
from ups_utils import train_initial, train_regular, pseudo_labeling

In [11]:
# Params
training_date = time.strftime("%Y%m%d-%H%M%S")
#training_date = "20240408-190635"
BACTH_SIZE = 64
MODEL_NAME = f'MDM_IaA_ups_01_dataset_01_labeled_{training_date}'
DATASET_FILE_PATH = 'C:/Users/janny/Aalto_project_2/data/full_dataset_IaA.txt'
MODEL_DIR = os.path.join("C:/Users/janny/Aalto_project_2/models", MODEL_NAME)
TAU_P = 0.90 # Confidence threshold for positive pseudo-labels, default is 0.70
TAU_N = 0.1 # Confidence threshold for negative pseudo-labels, default is 0.05
KAPPA_P = 0.05 # Uncertainty threshold for positive pseudo-labels, default is 0.05
KAPPA_N = 0.005 # Uncertainty threshold for negative pseudo-labels, default is 0.005
TEMP_NL = 2.0 # Temperature for generating negative pseudo-labels, default is 2.0
P_LABELED = 0.1 # % of data that is pre labeled
ITERATIONS = 5
MAX_EPOCH = 20
SEED = 42

In [12]:
os.makedirs(MODEL_DIR, exist_ok=True)
destination_dir = os.path.join(MODEL_DIR, 'data')
os.makedirs(destination_dir, exist_ok=True)
destination_file_path = os.path.join(destination_dir, os.path.basename(DATASET_FILE_PATH))
shutil.copy(DATASET_FILE_PATH, destination_file_path)
DATASET_FILE_PATH = destination_file_path

all_data = pd.read_csv(DATASET_FILE_PATH, sep=', ', header=None)
class_counts = all_data.iloc[:, 12].value_counts()
print(class_counts)
sampled_data = pd.DataFrame()
for label, count in class_counts.items():
    # Sample 10% of the data for the current class
    sampled_df = all_data[all_data.iloc[:, 12] == label].sample(frac=0.1, random_state=SEED)
    sampled_data = pd.concat([sampled_data, sampled_df])

SAMPLED_DATASET_FILE_PATH = os.path.join(MODEL_DIR, 'data', 'dataset_01.csv')
sampled_data.to_csv(SAMPLED_DATASET_FILE_PATH, index=False, header=None)

DATASET_FILE_PATH = SAMPLED_DATASET_FILE_PATH


  all_data = pd.read_csv(DATASET_FILE_PATH, sep=', ', header=None)


12
0    13281
1     7821
Name: count, dtype: int64


In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda:0


In [14]:
class ResizeToWidth:
    def __init__(self, target_width):
        self.target_width = target_width

    def __call__(self, img_tensor):
        # Make sure it's a float for division to work properly in Python 2
        original_width = img_tensor.shape[2]
        original_height = img_tensor.shape[1]
        aspect_ratio = float(original_height) / float(original_width)
        target_height = int(self.target_width * aspect_ratio)

        # Resize the tensor
        img_tensor = F.resize(img_tensor, [target_height, self.target_width])
        return img_tensor
    
class Normalize3Channel:
    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
        self.mean = mean
        self.std = std

    def __call__(self, img_tensor):
        # Check if the image has 3 channels
        img_tensor = img_tensor.float() / 255.0
        if img_tensor.shape[0] == 3:
            img_tensor = F.normalize(img_tensor, mean=self.mean, std=self.std)
        return img_tensor

In [15]:
class LabeledDataset(Dataset):
    def __init__(self, annotations, transform=None):
        self.annotations = annotations
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        row = self.annotations.iloc[idx]
        img_paths = row.iloc[:5].values  # Paths to the image files.
        spectrogram_paths = row.iloc[5:12].values  # Paths to the audio spectrogram files.
        label = row.iloc[12]
        
        # Load images and spectrograms, apply the same transform if available
        images = [read_image(path) for path in img_paths]
        spectrograms = [read_image(path) for path in spectrogram_paths]
        
        if self.transform:
            images = [self.transform(image) for image in images]
            spectrograms = [self.transform(spectrogram) for spectrogram in spectrograms]

        return images, spectrograms, label
    
class UnlabeledDataset(Dataset):
    def __init__(self, annotations, transform=None):
        self.annotations = annotations
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        row = self.annotations.iloc[idx]
        img_paths = row.iloc[:5].values  # Paths to the image files.
        spectrogram_paths = row.iloc[5:12].values  # Paths to the audio spectrogram files.
        label = row.iloc[12]
        
        # Load images and spectrograms, apply the same transform if available
        images = [read_image(path) for path in img_paths]
        spectrograms = [read_image(path) for path in spectrogram_paths]
        
        if self.transform:
            images = [self.transform(image) for image in images]
            spectrograms = [self.transform(spectrogram) for spectrogram in spectrograms]

        return images, spectrograms, label, idx
    
class PseudoLabeledDataset(Dataset):
    def __init__(self, annotations, pseudo_labels, transform=None):
        """
        annotations: DataFrame containing the data paths and original labels.
        pseudo_labels: List or array containing the pseudo labels for the data.
        negative_label_mask: List or array indicating if a label is a negative pseudo-label.
        transform: A function/transform that takes in an PIL image and returns a transformed version.
        """
        self.annotations = annotations
        self.pseudo_labels = pseudo_labels
        # self.negative_label_mask = negative_label_mask
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        row = self.annotations.iloc[idx]
        img_paths = row.iloc[:5].values  # Paths to the image files.
        spectrogram_paths = row.iloc[5:12].values  # Paths to the audio spectrogram files.
        pseudo_label = self.pseudo_labels[idx]
        # is_negative = self.negative_label_mask[idx]
        
        # Assume `read_image` function is defined elsewhere
        images = [read_image(path) for path in img_paths]
        spectrograms = [read_image(path) for path in spectrogram_paths]
        
        if self.transform:
            images = [self.transform(image) for image in images]
            spectrograms = [self.transform(spectrogram) for spectrogram in spectrograms]

        return images, spectrograms, pseudo_label

In [16]:
def getDatasets(pseudo_labels_dict=None):
    transform = transforms.Compose([
        ResizeToWidth(512),  # Resize width to 512 pixels while maintaining aspect ratio
        Normalize3Channel(),
    ])

    all_data = pd.read_csv(DATASET_FILE_PATH, sep=',')
    train_data, test_data = train_test_split(all_data, test_size=0.1, random_state=42)
    labeled, unlabeled = train_test_split(train_data, test_size=(1 - P_LABELED), random_state=42)

    labeled_dataset = LabeledDataset(labeled, transform)
    unlabeled_dataset = UnlabeledDataset(unlabeled, transform)
    pseudo_labeled_dataset = unlabeled_dataset
    test_dataset = LabeledDataset(test_data, transform)

    if pseudo_labels_dict is not None:
        psuedo_labeled_indexes = pseudo_labels_dict['psuedo_labeled_indexes']
        psuedo_labeled_targets = pseudo_labels_dict['psuedo_labeled_targets']
        # negative_label_mask = pseudo_labels_dict['negative_label_mask']

        pseudo_labeled_data = unlabeled.iloc[psuedo_labeled_indexes]
        pseudo_labeled_dataset = PseudoLabeledDataset(
            pseudo_labeled_data, psuedo_labeled_targets, transform
        )

    return labeled_dataset, pseudo_labeled_dataset, unlabeled_dataset, test_dataset

In [17]:
class ImageFeatureExtractor(nn.Module):
    def __init__(self, output_features):
        super(ImageFeatureExtractor, self).__init__()
        resnet50 = models.resnet50(weights=ResNet50_Weights.DEFAULT)
        self.features = nn.Sequential(*list(resnet50.children())[:-1])  # Remove the last layer

        # Freeze the parameters in the feature extraction layers
        for param in self.features.parameters():
            param.requires_grad = False

        self.fc = nn.Linear(resnet50.fc.in_features, output_features)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc(x)
        return x
    
class AudioFeatureExtractor(nn.Module):
    def __init__(self, output_features):
        super(AudioFeatureExtractor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.fc = nn.Linear(64 * 23 * 64, output_features)  # Adjust the size based on your input dimensions

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc(x)
        return x
    
class CombinedModel(nn.Module):
    def __init__(self, image_output_features, audio_output_features, num_classes):
        super(CombinedModel, self).__init__()
        self.image_extractor = ImageFeatureExtractor(image_output_features)
        self.audio_extractor = AudioFeatureExtractor(audio_output_features)

        # Combine features from both extractors
        total_features = image_output_features * 5 + audio_output_features * 7
        self.classifier = nn.Linear(total_features, num_classes)

    def forward(self, image_0, image_1, image_2, image_3, image_4,
        audio_0, audio_1, audio_2, audio_3, audio_4, audio_5, audio_6
    ):
        image_features_0 = self.image_extractor(image_0)
        image_features_1 = self.image_extractor(image_1)
        image_features_2 = self.image_extractor(image_2)
        image_features_3 = self.image_extractor(image_3)
        image_features_4 = self.image_extractor(image_4)

        audio_features_0 = self.audio_extractor(audio_0)
        audio_features_1 = self.audio_extractor(audio_1)
        audio_features_2 = self.audio_extractor(audio_2)
        audio_features_3 = self.audio_extractor(audio_3)
        audio_features_4 = self.audio_extractor(audio_4)
        audio_features_5 = self.audio_extractor(audio_5)
        audio_features_6 = self.audio_extractor(audio_6)
        
        combined_features = torch.cat((
            image_features_0, image_features_1, image_features_2, image_features_3, image_features_4,
            audio_features_0, audio_features_1, audio_features_2, audio_features_3, audio_features_4,
            audio_features_5, audio_features_6
        ), dim=1)
        output = self.classifier(combined_features)
        return output

In [18]:
def run_ups():
    pseudo_labels_dict = None
    # pseudo_labels_dict = {
    #     "psuedo_labeled_indexes": [0, 2, 4],
    #     "psuedo_labeled_targets": [0, 0, 1],
    #     "negative_label_mask": [1, 0, 1]
    # }

    for itr in range(ITERATIONS):
        # get labeled_dataset, negative_label_dataset, unlabeled_dataset, test_dataset
        labeled_dataset, pseudo_labeled_dataset, unlabeled_dataset, test_dataset = getDatasets(pseudo_labels_dict)

        labeled_dataloader = DataLoader(labeled_dataset, batch_size=BACTH_SIZE, shuffle=False)
        pseudo_labeled_dataloader = DataLoader(pseudo_labeled_dataset, batch_size=BACTH_SIZE, shuffle=False)
        unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=BACTH_SIZE, shuffle=False)
        test_dataloader = DataLoader(test_dataset, batch_size=BACTH_SIZE, shuffle=False)

        # Initialize new model
        image_output_features = 512 # Adjust as needed
        audio_output_features = 256  # Adjust as needed
        num_classes = 2  # Adjust based on your dataset

        model = CombinedModel(image_output_features, audio_output_features, num_classes)
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Learning rate can be adjusted
        save_dir = f'{MODEL_DIR}/itr_{itr}'
        os.makedirs(save_dir, exist_ok=True)
        if itr == 0:
            train_initial(MAX_EPOCH, model, labeled_dataloader, test_dataloader, optimizer, device, save_dir)
        else:
            train_regular(MAX_EPOCH, model, labeled_dataloader, pseudo_labeled_dataloader, test_dataloader, optimizer, device, save_dir)

        model = torch.load(f'{save_dir}/best_model.pth').to(device)

        pseudo_labels_dict = pseudo_labeling(unlabeled_dataloader, model, device, TEMP_NL, KAPPA_N, TAU_N,
                        TAU_P, KAPPA_P, no_uncertainty=True)
        

try:
    with open(f'{MODEL_DIR}/output.txt', 'w') as f:
        # Saving the original stdout 
        original_stdout = sys.stdout 
        sys.stdout = f  # Change the standard output to the file we created.
        
        run_ups()
        
except Exception as e:
    print(f"An error occurred: {e}")  # This prints the error to the console.
    
finally:
    sys.stdout = original_stdout  # Reset the standard output to its original value,
      

100%|██████████| 3/3 [00:14<00:00,  4.74s/it]
100%|██████████| 4/4 [00:14<00:00,  3.67s/it]
100%|██████████| 3/3 [00:14<00:00,  4.74s/it]
100%|██████████| 4/4 [00:15<00:00,  3.80s/it]
100%|██████████| 3/3 [00:13<00:00,  4.59s/it]
100%|██████████| 4/4 [00:14<00:00,  3.70s/it]
100%|██████████| 3/3 [00:13<00:00,  4.52s/it]
100%|██████████| 4/4 [00:14<00:00,  3.72s/it]
100%|██████████| 3/3 [00:13<00:00,  4.52s/it]
100%|██████████| 4/4 [00:14<00:00,  3.68s/it]
100%|██████████| 3/3 [00:13<00:00,  4.57s/it]
100%|██████████| 4/4 [00:14<00:00,  3.72s/it]
100%|██████████| 3/3 [00:13<00:00,  4.47s/it]
100%|██████████| 4/4 [00:14<00:00,  3.65s/it]
100%|██████████| 3/3 [00:13<00:00,  4.53s/it]
100%|██████████| 4/4 [00:14<00:00,  3.66s/it]
100%|██████████| 3/3 [00:13<00:00,  4.64s/it]
100%|██████████| 4/4 [00:14<00:00,  3.73s/it]
100%|██████████| 3/3 [00:13<00:00,  4.66s/it]
100%|██████████| 4/4 [00:14<00:00,  3.68s/it]
100%|██████████| 3/3 [00:13<00:00,  4.63s/it]
100%|██████████| 4/4 [00:14<00:00,