In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/train-data/train_folds.csv
/kaggle/input/emo-map-challenge/sample_submission.csv
/kaggle/input/emo-map-challenge/train_dataset.csv
/kaggle/input/emo-map-challenge/test_dataset.csv


In [2]:
!pip install wtfml #Library used early stoping

Collecting wtfml
  Downloading wtfml-0.0.3-py3-none-any.whl.metadata (808 bytes)
Downloading wtfml-0.0.3-py3-none-any.whl (10 kB)
Installing collected packages: wtfml
Successfully installed wtfml-0.0.3


In [3]:
import os
import torch
import albumentations
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn import metrics
from sklearn import model_selection
from torch.nn import functional as F
from wtfml.utils import EarlyStopping

  check_for_updates()


In [4]:
# AS the data is imbalanced and relativley small, dataset divided into 5 folds,maintaining 
# the class distribution same as training data set
df = pd.read_csv("/kaggle/input/emo-map-challenge/train_dataset.csv")
df["kfold"] = -1    
df = df.sample(frac=1).reset_index(drop=True)
y = df.emotion.values
kf = model_selection.StratifiedKFold(n_splits=5)

for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

df.to_csv("train_folds.csv", index=False)
# this file directly uploaded in the input to use directly

In [5]:
# upon experimenting on all the pretrained versions, I found that RESNET Familiy is doing the best,
# so i have started with pretrained versions of those.
import torch
import torch.nn as nn
import torchvision.models as models

class ResNetForMultiClass(nn.Module):
    def __init__(self, num_classes=7, model_type='resnet152', pretrained=True):
        super(ResNetForMultiClass, self).__init__()
        
        # Select the base model based on the model_type parameter
        if model_type == 'resnet101':
            self.base_model = models.resnet101(pretrained=pretrained)
        elif model_type == 'resnet152':
            self.base_model = models.resnet152(pretrained=pretrained)
        else:
            raise ValueError(f"Unsupported model type: {model_type}")
        
        # Modify the fully connected layer to match the number of classes
        self.base_model.fc = nn.Linear(
            in_features=self.base_model.fc.in_features,
            out_features=num_classes
        )
        
    def forward(self, image, targets=None):
        # Forward pass through the base model
        out = self.base_model(image)
        
        # If targets are provided, calculate the loss
        if targets is not None:
            loss = nn.CrossEntropyLoss()(out, targets)
            return out, loss
        
        return out

In [6]:
# Building Custom Dataset Loader
import torch
from torch.utils.data import Dataset
import numpy as np
import cv2
import albumentations as A

class CustomImageDataset(Dataset):
    def __init__(self, pixel_arrays, targets, resize=None, augmentations=None):
        self.pixel_arrays = pixel_arrays
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations

    def __len__(self):
        return len(self.pixel_arrays)

    def __getitem__(self, idx):
        # Convert the 1D array to a 2D grayscale image
        image = np.array(self.pixel_arrays[idx], dtype=np.float32).reshape(48, 48)
        
        # Resize image if needed
        if self.resize:
            image = cv2.resize(image, self.resize)

        # Expand dimensions to (48, 48, 1) and then convert to (new_size, new_size, 3) for RGB
        image = np.expand_dims(image, axis=-1)
        image = np.repeat(image, 3, axis=-1)  # Convert to RGB

        # Apply augmentations if specified
        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # Convert image to PyTorch tensor and permute to (3, new_size, new_size) format
        image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1)  # For RGB, shape becomes (3, new_size, new_size)
        target = torch.tensor(self.targets[idx], dtype=torch.long)

        return image, target

In [7]:
from sklearn import metrics
from tqdm import tqdm

# Methods are responsible for training, validation, and predicting the emotions for the data 
class Engine:
    @staticmethod
    def train(data_loader, model, optimizer, device,scheduler=None, accumulation_steps=1, fp16=False):
        model.train()
        losses = AverageMeter()
        scaler = torch.cuda.amp.GradScaler() if fp16 else None
        
        if accumulation_steps > 1:
            optimizer.zero_grad()
        
        tk0 = tqdm(data_loader, total=len(data_loader))
        for batch_idx, (images, targets) in enumerate(tk0):
            images = images.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                if fp16:
                    with torch.cuda.amp.autocast():
                        outputs = model(images)
                        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
                    scaler.scale(loss).backward()
                else:
                    outputs = model(images)
                    loss = torch.nn.CrossEntropyLoss()(outputs,targets)
                    loss.backward()

                if (batch_idx + 1) % accumulation_steps == 0:
                    scaler.step(optimizer) if fp16 else optimizer.step()
                    if scheduler:
                        scheduler.step()
                    optimizer.zero_grad()

            losses.update(loss.item(), data_loader.batch_size)
            tk0.set_postfix(loss=losses.avg)
        
        return losses.avg

    @staticmethod
    def evaluate(data_loader, model, device, use_tpu=False):
        losses = AverageMeter()
        final_predictions = []
        model.eval()
        with torch.no_grad():
            tk0 = tqdm(data_loader, total=len(data_loader), disable=use_tpu)
            for b_idx, data in enumerate(tk0):
                images, targets = data  # Adjust if your data format is different

                # Move tensors to device
                images = images.to(device)
                targets = targets.to(device)

                # Forward pass
                predictions, loss = model(images, targets)
                predictions = predictions.cpu()
                losses.update(loss.item(), images.size(0))
                final_predictions.append(predictions)
                tk0.set_postfix(loss=losses.avg)

        # Concatenate all predictions and convert to NumPy array
        final_predictions = torch.cat(final_predictions).numpy()
        return final_predictions, losses.avg
    def predict(data_loader, model, device, use_tpu=False):
        model.eval()
        final_predictions = []
        with torch.no_grad():
            tk0 = tqdm(data_loader, total=len(data_loader), disable=use_tpu)
            for b_idx, data in enumerate(tk0):
                inputs, _ = data  # Unpack data
                inputs = inputs.to(device)
                predictions = model(inputs)  # Assume model returns only predictions
                final_predictions.append(predictions.cpu())
                tk0.set_postfix()
        return torch.cat(final_predictions).numpy()
        # Concatenate all predictions and convert to numpy array
        return torch.cat(final_predictions).numpy()
    
#The AverageMeter class is a utility for tracking and calculating the 
#running average of a metric (such as loss) over multiple updates. 
#It maintains cumulative statistics, including the current value, sum of values, 
#count of values, and the average value.prefered to handle multiple batchoutputs in training and validation

class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

FP16 (16-bit Floating Point Precision)(However it was set to false)

	•	Description: FP16, or half-precision floating point, uses 16-bit numbers instead of the standard 32-bit (FP32). This reduces memory usage and can speed up computations on compatible hardware, improving training efficiency without significantly sacrificing precision.
	•	Usage: In mixed-precision training, torch.cuda.amp.GradScaler and torch.cuda.amp.autocast are used to manage gradient scaling and automatic casting of operations between FP16 and FP32 for better performance and numerical stability.

Scheduler (ReduceLROnPlateau)

	•	Description: The ReduceLROnPlateau scheduler adjusts the learning rate based on the performance of the model. It reduces the learning rate when a specified metric (e.g., validation loss) stops improving, helping to fine-tune the model and prevent overfitting.
	•	Usage: scheduler.step(metrics) is called with the monitored metric to check if the learning rate needs to be adjusted, typically in response to a lack of improvement in the specified metric.

In [8]:
import albumentations as A
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import torch
from sklearn import metrics

def train(fold,model_type,model_name):
    # Load the dataset with fold information
    df = pd.read_csv("/kaggle/input/train-data/train_folds.csv")
    device = "cuda"  # Set the device to CUDA for GPU training
    epochs = 50  # Number of training epochs
    train_bs = 32  # Batch size for training
    valid_bs = 16  # Batch size for validation

    # Split data into training and validation based on the fold
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Convert pixel values to numpy arrays
    train_pixels = df_train['pixels'].apply(lambda x: np.fromstring(x, sep=' ', dtype=np.float32))
    train_targets = df_train['emotion'].values
    valid_pixels = df_valid['pixels'].apply(lambda x: np.fromstring(x, sep=' ', dtype=np.float32))
    valid_targets = df_valid['emotion'].values

    # Define data augmentation and normalization for training
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    train_aug = A.Compose([
        A.Resize(height=224, width=224),  # Resize images to 224x224
        A.Normalize(mean=mean, std=std, max_pixel_value=255.0, always_apply=True),
        A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15),
        A.HorizontalFlip(p=0.5)  # Randomly flip images horizontally
    ])

    # Define data augmentation and normalization for validation
    valid_aug = A.Compose([
        A.Resize(height=224, width=224),  # Resize images to 224x224
        A.Normalize(mean=mean, std=std, max_pixel_value=255.0, always_apply=True)
    ])

    # Create custom datasets with augmentations
    train_dataset = CustomImageDataset(
        pixel_arrays=train_pixels,
        targets=train_targets,
        resize=(224, 224),  # New size
        augmentations=train_aug,
    )
    valid_dataset = CustomImageDataset(
        pixel_arrays=valid_pixels,
        targets=valid_targets,
        resize=(224, 224),  # New size
        augmentations=valid_aug,
    )

    # Create data loaders for training and validation
    train_loader = DataLoader(
        train_dataset, batch_size=32, shuffle=True, num_workers=4
    )
    valid_loader = DataLoader(
        valid_dataset, batch_size=16, shuffle=False, num_workers=4
    )

    # Initialize the model, optimizer, and learning rate scheduler
    model = model_type
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=5,  # Number of epochs with no improvement to wait before reducing the learning rate
        threshold=0.001,  # Minimum change to qualify as an improvement
        mode="max"  # Mode for monitoring metric improvement
    )

    es = EarlyStopping(patience=5, mode="max")  # Initialize early stopping

    for epoch in range(epochs):
        # Train the model and get the average training loss
        train_loss = Engine.train(train_loader, model, optimizer, device, scheduler=None, accumulation_steps=1, fp16=False)

        # Evaluate the model and get predictions and validation loss
        predictions, valid_loss = Engine.evaluate(valid_loader, model, device=device)
        final_predictions = np.argmax(np.vstack(predictions), axis=1)
        valid_targets = np.array(valid_targets)  # Ensure targets are in the right format
    
        # Calculate and print accuracy
        accuracy = metrics.accuracy_score(valid_targets, final_predictions)
        print(f"Epoch = {epoch}, Accuracy = {accuracy:.4f}")

        # Update the learning rate based on validation accuracy
        scheduler.step(accuracy)
    
        # Apply early stopping and saves the model 
        es(accuracy, model, model_path=(f"model_fold_{model_name}_{fold}.bin"))
        if es.early_stop:
            print("Early stopping")
            break

    # Create a DataFrame with out-of-fold predictions
    oof_data = {
        'id': df_valid.index,  # Use the index to map back to the original data
        'true_emotion': valid_targets,
        'pred_emotion': final_predictions,
    }
    
    return pd.DataFrame(oof_data)

In [9]:
import pandas as pd
import numpy as np
import torch
import albumentations
from torch.utils.data import DataLoader

def predict(fold,model_type,model_name):
    # Load the test dataset
    df = pd.read_csv("/kaggle/input/emo-map-challenge/test_dataset.csv")
    device = "cuda"  # Set device to GPU
    model_path = f"model_fold_{model_name}_{fold}.bin"  # Path to the trained model

    # Convert pixel data from string to numpy arrays
    test_pixels = df['pixels'].apply(lambda x: np.fromstring(x, sep=' ', dtype=np.float32))
    
    # Define the augmentation pipeline for test data
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    aug = albumentations.Compose([
        albumentations.Resize(height=224, width=224),  # Resize images to 224x224
        albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True)  # Normalize images
    ])
    
    # Initialize dummy targets as placeholders
    targets = np.zeros(len(df))
    
    # Create the test dataset with augmentations
    test_dataset = CustomImageDataset(
        pixel_arrays=test_pixels,
        targets=targets,
        resize=None,  # Resize is handled by augmentations
        augmentations=aug,
    )
    
    # Create a DataLoader for the test dataset
    test_loader = DataLoader(
        test_dataset, batch_size=16, shuffle=False, num_workers=4
    )

    # Initialize and load the model
    model = model_type # Function to initialize the model (make sure it matches the one used during training)
    model.load_state_dict(torch.load(model_path))  # Load the trained model weights
    model.to(device)  # Move the model to GPU

    # Generate predictions using the test data
    predictions = Engine.predict(test_loader, model, device=device)
    predictions = np.vstack((predictions))  # Stack predictions into a single array

    return predictions

In [10]:
model_101 = ResNetForMultiClass(num_classes=7, model_type='resnet101', pretrained=True)
model_150 = ResNetForMultiClass(num_classes=7, model_type='resnet152', pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:01<00:00, 163MB/s]
Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /root/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth
100%|██████████| 230M/230M [00:04<00:00, 50.8MB/s]


In [11]:
#Discriminative Feature Attention Network (applied after major blocks)
#Model 3 is also a Resnet34, But modified with the Discrminative feature selection attenation Mechanism, 
#So I'll the that model from scratch
class DiscriminativeFeatureAttentionNetwork(nn.Module):
    def __init__(self, input_channels, reduction_ratio=16):
        super(DiscriminativeFeatureAttentionNetwork, self).__init__()
        
        # Global average pooling layer
        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
        
        # Fully connected layers to compute attention weights
        self.fc1 = nn.Linear(input_channels, input_channels // reduction_ratio)
        self.fc2 = nn.Linear(input_channels // reduction_ratio, input_channels)
        
        # Activation functions
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Input x has shape: (batch_size, channels, height, width)
        
        # Step 1: Global average pooling to get (batch_size, channels)
        b, c, _, _ = x.size()
        avg_pooled = self.global_avg_pool(x).view(b, c)
        
        # Step 2: Pass through the fully connected layers
        fc1_out = self.relu(self.fc1(avg_pooled))
        fc2_out = self.sigmoid(self.fc2(fc1_out))
        
        # Step 3: Reshape the attention weights to (batch_size, channels, 1, 1)
        attention_weights = fc2_out.view(b, c, 1, 1)
        
        # Step 4: Apply attention weights to input feature maps
        out = x * attention_weights  # Element-wise multiplication to focus on discriminative features
        
        return out

# ResNet101 with integrated Discriminative Feature Attention for multi-class classification
class ResNet34ForMultiClassWithAttention(nn.Module):
    def __init__(self, num_classes=7, pretrained=True):
        super(ResNet34ForMultiClassWithAttention, self).__init__()
        
        # Load the pre-trained ResNet101 model
        self.base_model = models.resnet34(pretrained=pretrained)
        
        # Modify the fully connected layer to match the number of classes
        self.base_model.fc = nn.Linear(
            in_features=self.base_model.fc.in_features,
            out_features=num_classes
        )
        
        # Add the Discriminative Feature Attention after layer3 of ResNet101
        self.discriminative_attention = DiscriminativeFeatureAttentionNetwork(input_channels=256)  # 1024 channels in layer3 of ResNet101

    def forward(self, image, targets=None):
        batch_size = image.size(0)
        
        # Forward pass through the base model's initial layers
        x = self.base_model.conv1(image)
        x = self.base_model.bn1(x)
        x = self.base_model.relu(x)
        x = self.base_model.maxpool(x)

        # Forward through ResNet's layer1, layer2, and layer3
        x = self.base_model.layer1(x)
        x = self.base_model.layer2(x)
        x = self.base_model.layer3(x)
        
        # Apply Discriminative Feature Attention after layer3
        x = self.discriminative_attention(x)
        
        # Forward through ResNet's layer4
        x = self.base_model.layer4(x)
        
        # Global average pooling before the final classification layer
        x = self.base_model.avgpool(x)
        x = torch.flatten(x, 1)
        
        # Get the raw logits from the fully connected layer
        out = self.base_model.fc(x)
        
        # If targets are provided, calculate the loss
        if targets is not None:
            loss = nn.CrossEntropyLoss()(out, targets)
            return out, loss
        
        return out


In [12]:
model_1 = ResNetForMultiClass(num_classes=7, model_type='resnet152', pretrained=True)
model_2 = ResNetForMultiClass(num_classes=7, model_type='resnet101', pretrained=True)
model_3 = ResNet34ForMultiClassWithAttention(num_classes = 7,pretrained = True)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:00<00:00, 172MB/s]


In [13]:
import warnings
from warnings import filterwarnings
model_types = [model_1, model_2, model_3]
model_names = ['RES152','RES101','RES32D']
for model_type, model_name in zip(model_types, model_names):
    X = np.empty((0, 3))
    for fold in range(5):
        # Train the model for the current fold and model type
        df = train(fold, model_type,model_name)
        df.to_csv(f"oof{model_name}-{fold}.csv", index=False) 
        X = np.concatenate((X, df.values))  
    df_combined = pd.DataFrame(X, columns=['id', 'true_emotion', 'pred_emotion']) 
    df_combined.to_csv(f'{model_name}.csv', index=False)

100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=1.44]
100%|██████████| 63/63 [00:07<00:00,  8.04it/s, loss=1.34]


Epoch = 0, Accuracy = 0.4790
Validation score improved (-inf --> 0.479). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=1.11]
100%|██████████| 63/63 [00:07<00:00,  8.26it/s, loss=1.19]


Epoch = 1, Accuracy = 0.5350
Validation score improved (0.479 --> 0.535). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.975]
100%|██████████| 63/63 [00:07<00:00,  8.21it/s, loss=1.24]


Epoch = 2, Accuracy = 0.5380
Validation score improved (0.535 --> 0.538). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.816]
100%|██████████| 63/63 [00:07<00:00,  8.19it/s, loss=1.25]


Epoch = 3, Accuracy = 0.5640
Validation score improved (0.538 --> 0.564). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.695]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=1.24]


Epoch = 4, Accuracy = 0.5810
Validation score improved (0.564 --> 0.581). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.581]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=1.4]


Epoch = 5, Accuracy = 0.5530
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.465]
100%|██████████| 63/63 [00:07<00:00,  8.21it/s, loss=1.57]


Epoch = 6, Accuracy = 0.5380
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.394]
100%|██████████| 63/63 [00:07<00:00,  8.20it/s, loss=1.64]


Epoch = 7, Accuracy = 0.5570
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.345]
100%|██████████| 63/63 [00:07<00:00,  8.24it/s, loss=1.46]


Epoch = 8, Accuracy = 0.5680
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.287]
100%|██████████| 63/63 [00:07<00:00,  8.19it/s, loss=1.59]


Epoch = 9, Accuracy = 0.5420
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.626]
100%|██████████| 63/63 [00:07<00:00,  8.21it/s, loss=0.229]


Epoch = 0, Accuracy = 0.9460
Validation score improved (-inf --> 0.946). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.389]
100%|██████████| 63/63 [00:07<00:00,  8.21it/s, loss=0.261]


Epoch = 1, Accuracy = 0.9180
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.311]
100%|██████████| 63/63 [00:07<00:00,  8.16it/s, loss=0.295]


Epoch = 2, Accuracy = 0.8970
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.235]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=0.3]


Epoch = 3, Accuracy = 0.8940
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.242]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=0.405]


Epoch = 4, Accuracy = 0.8690
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.224]
100%|██████████| 63/63 [00:07<00:00,  8.18it/s, loss=0.407]


Epoch = 5, Accuracy = 0.8440
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.325]
100%|██████████| 63/63 [00:07<00:00,  8.24it/s, loss=0.121]


Epoch = 0, Accuracy = 0.9650
Validation score improved (-inf --> 0.965). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.237]
100%|██████████| 63/63 [00:07<00:00,  8.23it/s, loss=0.131]


Epoch = 1, Accuracy = 0.9630
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.182]
100%|██████████| 63/63 [00:07<00:00,  8.20it/s, loss=0.136]


Epoch = 2, Accuracy = 0.9590
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.203]
100%|██████████| 63/63 [00:07<00:00,  8.19it/s, loss=0.16]


Epoch = 3, Accuracy = 0.9480
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.16]
100%|██████████| 63/63 [00:07<00:00,  8.21it/s, loss=0.187]


Epoch = 4, Accuracy = 0.9350
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.171]
100%|██████████| 63/63 [00:07<00:00,  8.20it/s, loss=0.24]


Epoch = 5, Accuracy = 0.9130
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.233]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=0.0767]


Epoch = 0, Accuracy = 0.9710
Validation score improved (-inf --> 0.971). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.172]
100%|██████████| 63/63 [00:07<00:00,  8.23it/s, loss=0.0595]


Epoch = 1, Accuracy = 0.9840
Validation score improved (0.971 --> 0.984). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.125]
100%|██████████| 63/63 [00:07<00:00,  8.25it/s, loss=0.0915]


Epoch = 2, Accuracy = 0.9690
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.162]
100%|██████████| 63/63 [00:07<00:00,  8.18it/s, loss=0.204]


Epoch = 3, Accuracy = 0.9230
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.17]
100%|██████████| 63/63 [00:07<00:00,  8.21it/s, loss=0.153]


Epoch = 4, Accuracy = 0.9460
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.118]
100%|██████████| 63/63 [00:07<00:00,  8.23it/s, loss=0.0991]


Epoch = 5, Accuracy = 0.9610
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.114]
100%|██████████| 63/63 [00:07<00:00,  8.20it/s, loss=0.181]


Epoch = 6, Accuracy = 0.9330
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.201]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=0.054]


Epoch = 0, Accuracy = 0.9850
Validation score improved (-inf --> 0.985). Saving model!


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.105]
100%|██████████| 63/63 [00:07<00:00,  8.23it/s, loss=0.0686]


Epoch = 1, Accuracy = 0.9800
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.124]
100%|██████████| 63/63 [00:07<00:00,  8.20it/s, loss=0.0749]


Epoch = 2, Accuracy = 0.9760
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.132]
100%|██████████| 63/63 [00:07<00:00,  8.23it/s, loss=0.11]


Epoch = 3, Accuracy = 0.9640
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.122]
100%|██████████| 63/63 [00:07<00:00,  8.23it/s, loss=0.0901]


Epoch = 4, Accuracy = 0.9670
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:29<00:00,  1.40it/s, loss=0.108]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s, loss=0.107]


Epoch = 5, Accuracy = 0.9620
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:03<00:00,  1.95it/s, loss=1.42]
100%|██████████| 63/63 [00:05<00:00, 11.45it/s, loss=1.21]


Epoch = 0, Accuracy = 0.5430
Validation score improved (-inf --> 0.543). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=1.12]
100%|██████████| 63/63 [00:05<00:00, 11.47it/s, loss=1.17]


Epoch = 1, Accuracy = 0.5410
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.948]
100%|██████████| 63/63 [00:05<00:00, 11.40it/s, loss=1.19]


Epoch = 2, Accuracy = 0.5810
Validation score improved (0.543 --> 0.581). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.818]
100%|██████████| 63/63 [00:05<00:00, 11.44it/s, loss=1.27]


Epoch = 3, Accuracy = 0.5350
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.678]
100%|██████████| 63/63 [00:05<00:00, 11.35it/s, loss=1.32]


Epoch = 4, Accuracy = 0.5460
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.578]
100%|██████████| 63/63 [00:05<00:00, 11.27it/s, loss=1.35]


Epoch = 5, Accuracy = 0.5370
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.506]
100%|██████████| 63/63 [00:05<00:00, 11.45it/s, loss=1.4]


Epoch = 6, Accuracy = 0.5820
Validation score improved (0.581 --> 0.582). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.426]
100%|██████████| 63/63 [00:05<00:00, 11.42it/s, loss=1.47]


Epoch = 7, Accuracy = 0.5670
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.352]
100%|██████████| 63/63 [00:05<00:00, 11.37it/s, loss=1.5]


Epoch = 8, Accuracy = 0.5620
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.312]
100%|██████████| 63/63 [00:05<00:00, 11.45it/s, loss=1.6]


Epoch = 9, Accuracy = 0.5690
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.267]
100%|██████████| 63/63 [00:05<00:00, 11.38it/s, loss=1.72]


Epoch = 10, Accuracy = 0.5510
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.281]
100%|██████████| 63/63 [00:05<00:00, 11.44it/s, loss=1.72]


Epoch = 11, Accuracy = 0.5720
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.59]
100%|██████████| 63/63 [00:05<00:00, 11.43it/s, loss=0.221]


Epoch = 0, Accuracy = 0.9320
Validation score improved (-inf --> 0.932). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.376]
100%|██████████| 63/63 [00:05<00:00, 11.45it/s, loss=0.24]


Epoch = 1, Accuracy = 0.9230
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.313]
100%|██████████| 63/63 [00:05<00:00, 11.33it/s, loss=0.296]


Epoch = 2, Accuracy = 0.8880
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.259]
100%|██████████| 63/63 [00:05<00:00, 11.35it/s, loss=0.312]


Epoch = 3, Accuracy = 0.8990
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.249]
100%|██████████| 63/63 [00:05<00:00, 11.39it/s, loss=0.353]


Epoch = 4, Accuracy = 0.8730
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.206]
100%|██████████| 63/63 [00:05<00:00, 11.34it/s, loss=0.549]


Epoch = 5, Accuracy = 0.8170
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.29]
100%|██████████| 63/63 [00:05<00:00, 11.41it/s, loss=0.0704]


Epoch = 0, Accuracy = 0.9780
Validation score improved (-inf --> 0.978). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.195]
100%|██████████| 63/63 [00:05<00:00, 11.31it/s, loss=0.0765]


Epoch = 1, Accuracy = 0.9810
Validation score improved (0.978 --> 0.981). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.206]
100%|██████████| 63/63 [00:05<00:00, 11.43it/s, loss=0.105]


Epoch = 2, Accuracy = 0.9670
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.173]
100%|██████████| 63/63 [00:05<00:00, 11.46it/s, loss=0.167]


Epoch = 3, Accuracy = 0.9450
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.192]
100%|██████████| 63/63 [00:05<00:00, 11.46it/s, loss=0.166]


Epoch = 4, Accuracy = 0.9370
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.162]
100%|██████████| 63/63 [00:05<00:00, 11.46it/s, loss=0.126]


Epoch = 5, Accuracy = 0.9600
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.149]
100%|██████████| 63/63 [00:05<00:00, 11.47it/s, loss=0.205]


Epoch = 6, Accuracy = 0.9310
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.228]
100%|██████████| 63/63 [00:05<00:00, 11.32it/s, loss=0.0891]


Epoch = 0, Accuracy = 0.9680
Validation score improved (-inf --> 0.968). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.164]
100%|██████████| 63/63 [00:05<00:00, 11.41it/s, loss=0.0934]


Epoch = 1, Accuracy = 0.9730
Validation score improved (0.968 --> 0.973). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.149]
100%|██████████| 63/63 [00:05<00:00, 11.44it/s, loss=0.124]


Epoch = 2, Accuracy = 0.9530
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.97it/s, loss=0.154]
100%|██████████| 63/63 [00:05<00:00, 11.49it/s, loss=0.131]


Epoch = 3, Accuracy = 0.9590
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.142]
100%|██████████| 63/63 [00:05<00:00, 11.47it/s, loss=0.143]


Epoch = 4, Accuracy = 0.9510
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.141]
100%|██████████| 63/63 [00:05<00:00, 11.46it/s, loss=0.147]


Epoch = 5, Accuracy = 0.9500
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.97it/s, loss=0.15]
100%|██████████| 63/63 [00:05<00:00, 11.46it/s, loss=0.177]


Epoch = 6, Accuracy = 0.9430
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.198]
100%|██████████| 63/63 [00:05<00:00, 11.43it/s, loss=0.05]


Epoch = 0, Accuracy = 0.9870
Validation score improved (-inf --> 0.987). Saving model!


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.102]
100%|██████████| 63/63 [00:05<00:00, 11.38it/s, loss=0.0525]


Epoch = 1, Accuracy = 0.9850
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.97it/s, loss=0.122]
100%|██████████| 63/63 [00:05<00:00, 11.41it/s, loss=0.113]


Epoch = 2, Accuracy = 0.9670
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.123]
100%|██████████| 63/63 [00:05<00:00, 11.30it/s, loss=0.114]


Epoch = 3, Accuracy = 0.9650
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.119]
100%|██████████| 63/63 [00:05<00:00, 11.47it/s, loss=0.119]


Epoch = 4, Accuracy = 0.9550
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [01:03<00:00,  1.96it/s, loss=0.104]
100%|██████████| 63/63 [00:05<00:00, 11.42it/s, loss=0.118]


Epoch = 5, Accuracy = 0.9610
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [00:21<00:00,  5.81it/s, loss=1.45]
100%|██████████| 63/63 [00:02<00:00, 26.32it/s, loss=1.36]


Epoch = 0, Accuracy = 0.4880
Validation score improved (-inf --> 0.488). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=1.11]
100%|██████████| 63/63 [00:02<00:00, 26.48it/s, loss=1.28]


Epoch = 1, Accuracy = 0.5280
Validation score improved (0.488 --> 0.528). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.88it/s, loss=0.9]
100%|██████████| 63/63 [00:02<00:00, 26.40it/s, loss=1.34]


Epoch = 2, Accuracy = 0.5260
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.751]
100%|██████████| 63/63 [00:02<00:00, 26.16it/s, loss=1.44]


Epoch = 3, Accuracy = 0.5160
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.621]
100%|██████████| 63/63 [00:02<00:00, 26.22it/s, loss=1.39]


Epoch = 4, Accuracy = 0.5420
Validation score improved (0.528 --> 0.542). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.88it/s, loss=0.516]
100%|██████████| 63/63 [00:02<00:00, 23.56it/s, loss=1.48]


Epoch = 5, Accuracy = 0.5580
Validation score improved (0.542 --> 0.558). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.436]
100%|██████████| 63/63 [00:02<00:00, 26.33it/s, loss=1.52]


Epoch = 6, Accuracy = 0.5560
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.35]
100%|██████████| 63/63 [00:02<00:00, 26.35it/s, loss=1.53]


Epoch = 7, Accuracy = 0.5720
Validation score improved (0.558 --> 0.572). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.29]
100%|██████████| 63/63 [00:02<00:00, 25.19it/s, loss=1.61]


Epoch = 8, Accuracy = 0.5560
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.25]
100%|██████████| 63/63 [00:02<00:00, 25.17it/s, loss=1.73]


Epoch = 9, Accuracy = 0.5570
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.229]
100%|██████████| 63/63 [00:02<00:00, 26.28it/s, loss=1.71]


Epoch = 10, Accuracy = 0.5540
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.218]
100%|██████████| 63/63 [00:02<00:00, 26.05it/s, loss=1.75]


Epoch = 11, Accuracy = 0.5800
Validation score improved (0.572 --> 0.58). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.171]
100%|██████████| 63/63 [00:02<00:00, 25.91it/s, loss=1.81]


Epoch = 12, Accuracy = 0.5570
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.82it/s, loss=0.177]
100%|██████████| 63/63 [00:02<00:00, 25.78it/s, loss=1.73]


Epoch = 13, Accuracy = 0.5710
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.83it/s, loss=0.164]
100%|██████████| 63/63 [00:02<00:00, 26.23it/s, loss=1.92]


Epoch = 14, Accuracy = 0.5410
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.171]
100%|██████████| 63/63 [00:02<00:00, 25.25it/s, loss=1.79]


Epoch = 15, Accuracy = 0.5870
Validation score improved (0.58 --> 0.587). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.14]
100%|██████████| 63/63 [00:02<00:00, 25.92it/s, loss=1.84]


Epoch = 16, Accuracy = 0.5870
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.133]
100%|██████████| 63/63 [00:02<00:00, 26.12it/s, loss=1.83]


Epoch = 17, Accuracy = 0.5730
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.82it/s, loss=0.122]
100%|██████████| 63/63 [00:02<00:00, 26.19it/s, loss=1.92]


Epoch = 18, Accuracy = 0.5740
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.87it/s, loss=0.131]
100%|██████████| 63/63 [00:02<00:00, 25.86it/s, loss=2.03]


Epoch = 19, Accuracy = 0.5700
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.115]
100%|██████████| 63/63 [00:02<00:00, 26.35it/s, loss=1.92]


Epoch = 20, Accuracy = 0.5650
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [00:21<00:00,  5.83it/s, loss=0.56]
100%|██████████| 63/63 [00:02<00:00, 24.89it/s, loss=0.142]


Epoch = 0, Accuracy = 0.9580
Validation score improved (-inf --> 0.958). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.3]
100%|██████████| 63/63 [00:02<00:00, 26.42it/s, loss=0.11]


Epoch = 1, Accuracy = 0.9690
Validation score improved (0.958 --> 0.969). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.236]
100%|██████████| 63/63 [00:02<00:00, 25.97it/s, loss=0.14]


Epoch = 2, Accuracy = 0.9540
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.174]
100%|██████████| 63/63 [00:02<00:00, 26.30it/s, loss=0.141]


Epoch = 3, Accuracy = 0.9600
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.147]
100%|██████████| 63/63 [00:02<00:00, 23.80it/s, loss=0.183]


Epoch = 4, Accuracy = 0.9470
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.165]
100%|██████████| 63/63 [00:02<00:00, 25.92it/s, loss=0.265]


Epoch = 5, Accuracy = 0.9140
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.80it/s, loss=0.133]
100%|██████████| 63/63 [00:02<00:00, 26.13it/s, loss=0.192]


Epoch = 6, Accuracy = 0.9340
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.208]
100%|██████████| 63/63 [00:02<00:00, 26.42it/s, loss=0.0365]


Epoch = 0, Accuracy = 0.9880
Validation score improved (-inf --> 0.988). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.83it/s, loss=0.157]
100%|██████████| 63/63 [00:02<00:00, 26.36it/s, loss=0.0454]


Epoch = 1, Accuracy = 0.9890
Validation score improved (0.988 --> 0.989). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.87it/s, loss=0.134]
100%|██████████| 63/63 [00:02<00:00, 26.37it/s, loss=0.036]


Epoch = 2, Accuracy = 0.9890
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.126]
100%|██████████| 63/63 [00:02<00:00, 26.10it/s, loss=0.0585]


Epoch = 3, Accuracy = 0.9850
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.118]
100%|██████████| 63/63 [00:02<00:00, 26.06it/s, loss=0.0715]


Epoch = 4, Accuracy = 0.9750
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.112]
100%|██████████| 63/63 [00:02<00:00, 26.08it/s, loss=0.0708]


Epoch = 5, Accuracy = 0.9780
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.136]
100%|██████████| 63/63 [00:02<00:00, 26.12it/s, loss=0.151]


Epoch = 6, Accuracy = 0.9480
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [00:21<00:00,  5.83it/s, loss=0.152]
100%|██████████| 63/63 [00:02<00:00, 26.25it/s, loss=0.0565]


Epoch = 0, Accuracy = 0.9830
Validation score improved (-inf --> 0.983). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.13]
100%|██████████| 63/63 [00:02<00:00, 25.08it/s, loss=0.0359]


Epoch = 1, Accuracy = 0.9900
Validation score improved (0.983 --> 0.99). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.117]
100%|██████████| 63/63 [00:02<00:00, 26.24it/s, loss=0.0797]


Epoch = 2, Accuracy = 0.9710
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.0969]
100%|██████████| 63/63 [00:02<00:00, 26.43it/s, loss=0.0725]


Epoch = 3, Accuracy = 0.9740
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.85it/s, loss=0.0815]
100%|██████████| 63/63 [00:02<00:00, 25.95it/s, loss=0.057]


Epoch = 4, Accuracy = 0.9780
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.107]
100%|██████████| 63/63 [00:02<00:00, 26.34it/s, loss=0.0932]


Epoch = 5, Accuracy = 0.9700
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.108]
100%|██████████| 63/63 [00:02<00:00, 26.18it/s, loss=0.125]


Epoch = 6, Accuracy = 0.9500
EarlyStopping counter: 5 out of 5
Early stopping


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.131]
100%|██████████| 63/63 [00:02<00:00, 24.84it/s, loss=0.0478]


Epoch = 0, Accuracy = 0.9870
Validation score improved (-inf --> 0.987). Saving model!


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.12]
100%|██████████| 63/63 [00:02<00:00, 24.58it/s, loss=0.0775]


Epoch = 1, Accuracy = 0.9710
EarlyStopping counter: 1 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.83it/s, loss=0.0917]
100%|██████████| 63/63 [00:02<00:00, 26.19it/s, loss=0.0427]


Epoch = 2, Accuracy = 0.9870
EarlyStopping counter: 2 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.84it/s, loss=0.101]
100%|██████████| 63/63 [00:02<00:00, 26.25it/s, loss=0.0674]


Epoch = 3, Accuracy = 0.9790
EarlyStopping counter: 3 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.86it/s, loss=0.076]
100%|██████████| 63/63 [00:02<00:00, 23.39it/s, loss=0.0569]


Epoch = 4, Accuracy = 0.9780
EarlyStopping counter: 4 out of 5


100%|██████████| 125/125 [00:21<00:00,  5.74it/s, loss=0.0887]
100%|██████████| 63/63 [00:02<00:00, 26.26it/s, loss=0.093]


Epoch = 5, Accuracy = 0.9670
EarlyStopping counter: 5 out of 5
Early stopping


In [14]:
import warnings
from warnings import filterwarnings
dfpred = pd.DataFrame()
for model_type, model_name in zip(model_types, model_names):
    folds = range(5)
    predictions_list = []
    for fold in folds:
        predictions = predict(fold,model_type,model_name)  # Get predictions for the current fold
        predictions_list.append(predictions)  # Store predictions
    p = np.mean(predictions_list, axis=0)
    pred = np.argmax(p, axis=1)
    dfpred[model_name] = pred

  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:18<00:00,  8.38it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:18<00:00,  8.28it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:18<00:00,  8.36it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:18<00:00,  8.40it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:18<00:00,  8.40it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:13<00:00, 11.74it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model weights
100%|██████████| 157/157 [00:13<00:00, 11.70it/s]
  model.load_state_dict(torch.load(model_path))  # Load the trained model we

In [15]:
#  3 Models Ensembled by taking mode
from scipy import stats
def calculate_mode_for_rows(df):
    def row_mode(row):
        # Check if col1 is 1, then mode should be 1
        if row['RES152'] == 1:
            return 1
        
        # Create a list of values to calculate mode
        values = [row['RES152'], row['RES101'], row['RES32D']]
        
        # Calculate mode
        mode = pd.Series(values).mode()
        
        # Return mode if it exists
        if not mode.empty:
            return mode.iloc[0]
        else:
            return None

    # Apply the row_mode function to each row
    df['mode'] = df.apply(row_mode, axis=1)
    return df

In [16]:
dfpred = calculate_mode_for_rows(dfpred)
dfsy = pd.read_csv('/kaggle/input/emo-map-challenge/sample_submission.csv')
dfsy['emotion']=dfpred['mode']
dfsy.to_csv('finalresult.csv',index = False)

# section 2
Class imbalance is the problem that need attention, so I have tried various loss functions,but they gave almost same accuracy, here are the loss functions tried

## Dynamic attention loss

In [17]:
def compute_confusion_matrix(predictions, targets, num_classes, device):
    confusion_matrix = torch.zeros(num_classes, num_classes, device=device)
    pred_labels = predictions.argmax(dim=1)
    for t, p in zip(targets, pred_labels):
        confusion_matrix[t, p] += 1
    confusion_matrix = confusion_matrix / confusion_matrix.sum(dim=1, keepdim=True)
    confusion_matrix[confusion_matrix != confusion_matrix] = 0 
    return confusion_matrix
class DynamicAttentionLoss(nn.Module):
    def __init__(self, alpha=0.5, num_classes=10):
        super(DynamicAttentionLoss, self).__init__()
        self.alpha = alpha 
        self.num_classes = num_classes 
    def forward(self, predictions, targets, confusion_matrix):
        one_hot_labels = F.one_hot(targets, num_classes=self.num_classes).float()
        soft_labels = (1 - self.alpha) * one_hot_labels + self.alpha * confusion_matrix[targets]
        log_probs = F.log_softmax(predictions, dim=1)
        loss = -torch.sum(soft_labels * log_probs, dim=1)
        return loss.mean()

## Focal Loss

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = 1e-12  # prevent training from NaN-loss errors

    def forward(self, logits, target):
        """
        logits: [batch_size, num_classes]
        target: [batch_size] (true class indices, not one-hot)
        """
        # Compute softmax over logits to get the class probabilities
        probs = F.softmax(logits, dim=1)
        
        # Gather the probabilities of the true class (index-based selection)
        probs_target_class = probs.gather(1, target.unsqueeze(1)).squeeze(1) + self.epsilon
        
        # Compute the log of the selected probabilities
        log_pt = torch.log(probs_target_class)
        
        # Calculate the focal loss (focuses more on hard examples)
        focal_loss = -1 * self.alpha * (1 - probs_target_class) ** self.gamma * log_pt
        
        return torch.mean(focal_loss)

## Centre Loss

This Loss function mainly focuses on discriminative features where the data is less for imbalanced classes 

In [19]:
class CenterLoss(nn.Module):
    """Center loss.
    
    Reference:
    Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
    
    Args:
        num_classes (int): number of classes.
        feat_dim (int): feature dimension.
    """
    def __init__(self, num_classes=7, feat_dim=2048, use_gpu=True):
        super(CenterLoss, self).__init__()
        self.num_classes = num_classes
        self.feat_dim = feat_dim
        self.use_gpu = use_gpu

        if self.use_gpu:
            self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim).cuda())
        else:
            self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim))

    def forward(self, x, labels):
        """
        Args:
            x: feature matrix with shape (batch_size, feat_dim).
            labels: ground truth labels with shape (batch_size).
        """
        batch_size = x.size(0)
        distmat = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(batch_size, self.num_classes) + \
                  torch.pow(self.centers, 2).sum(dim=1, keepdim=True).expand(self.num_classes, batch_size).t()
        distmat.addmm_(1, -2, x, self.centers.t())

        classes = torch.arange(self.num_classes).long()
        if self.use_gpu: classes = classes.cuda()
        labels = labels.unsqueeze(1).expand(batch_size, self.num_classes)
        mask = labels.eq(classes.expand(batch_size, self.num_classes))

        dist = distmat * mask.float()
        loss = dist.clamp(min=1e-12, max=1e+12).sum() / batch_size

        return loss

In order to capitalise on Centre loss one should extract the deep features which are useful to descriminate the features, here is the typical fextraction of deep features to train the model,


In [20]:

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x,targets = None):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        
        # To save pre-final layer's features

        feat = self.avgpool(self.layer4(x))
        feat = feat.view(feat.size(0), -1)
        x = self.fc(feat)
        if targets is not None:
            loss = self.loss_fn(x,targets)
            return x, loss
        
        return feat, x    #Extracted deep-features represented as 'feat'


def resnet101(pretrained=False, **kwargs):
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model