In [1]:
from datasets import load_dataset

ds = load_dataset("clip-benchmark/wds_fer2013")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train/0.tar:   0%|          | 0.00/39.2M [00:00<?, ?B/s]

train/1.tar:   0%|          | 0.00/39.2M [00:00<?, ?B/s]

train/2.tar:   0%|          | 0.00/38.9M [00:00<?, ?B/s]

train/3.tar:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

test/0.tar:   0%|          | 0.00/9.83M [00:00<?, ?B/s]

test/1.tar:   0%|          | 0.00/9.79M [00:00<?, ?B/s]

test/2.tar:   0%|          | 0.00/9.76M [00:00<?, ?B/s]

test/3.tar:   0%|          | 0.00/9.67M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28709 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7178 [00:00<?, ? examples/s]

In [2]:
print(ds)
# Recheck strcture of datasets

DatasetDict({
    train: Dataset({
        features: ['__key__', '__url__', 'cls', 'jpg'],
        num_rows: 28709
    })
    test: Dataset({
        features: ['__key__', '__url__', 'cls', 'jpg'],
        num_rows: 7178
    })
})


In [3]:
print(ds['train'][0])

{'__key__': 's0000000', '__url__': '/root/.cache/huggingface/hub/datasets--clip-benchmark--wds_fer2013/snapshots/9399b94167523fe5c40b3a857e24ef931ee4395b/train/0.tar', 'cls': 0, 'jpg': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=48x48 at 0x7841CA754740>}


In [4]:
list_emotion = dict({
    0: 'anger',
    1: 'disgust',
    2: 'fear',
    3: 'happiness',
    4: 'neutral',
    5: 'sad',
    6: 'surprised'
})

In [5]:
import torch
from torchvision import transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

In [15]:
# create composition list that each image have to do before training
train_transform = transforms.Compose([
    transforms.Resize((48,48)), # resize each image to 48*48
    transforms.Grayscale(num_output_channels=1), # make image to grayscale and use channel only 1 channel to reduce using memory
    transforms.RandomHorizontalFlip(p=0.5), # Add augmentation
    # (1,48,48) is data for 1 image but if we change 1 to 3 data will be (3,48,48) it make the size is increase 3 times
    # 3 come from number of channel for RGB but we use 1 because gray scale can use only 1 channel
    transforms.ToTensor(), # change image to number tensor type for making computer understand
    transforms.Normalize((0.5,), (0.5,)) # normalize value to make sure preventing value fluctuations allows the AI ​​to learn faster.
])
test_transform = transforms.Compose([
    transforms.Resize((48,48)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5,),(0.5,))
])

In [7]:
def apply_transformer(batch):
  pixel_val = [train_transform(img.convert("L")) for img in batch['jpg']]
  labels = batch['cls']
  return {
      'pixel_values': pixel_val,
      'labels': labels
  }

In [17]:
def apply_test_transform(batch):
  pixel_val = [test_transform(img.convert("L")) for img in batch['jpg']]
  labels = batch['cls']
  return {
      'pixel_values': pixel_val,
      'labels': labels
  }

In [8]:
encode_dataset = ds.with_transform(apply_transformer)

In [9]:
print(encode_dataset['train'][0])

{'pixel_values': tensor([[[-0.6078, -0.7490, -0.8824,  ...,  0.0431,  0.1843, -0.3255],
         [-0.5529, -0.7333, -0.8275,  ...,  0.0824,  0.1843, -0.3020],
         [-0.5216, -0.7647, -0.8118,  ...,  0.1137,  0.1686, -0.3020],
         ...,
         [-0.1922, -0.2157, -0.2157,  ...,  0.1686, -0.1843, -0.3333],
         [-0.1608, -0.1294, -0.1137,  ...,  0.1843, -0.0588, -0.3255],
         [-0.1843, -0.1843, -0.1216,  ...,  0.1216,  0.0667, -0.3490]]]), 'labels': 0}


In [10]:
train_loader = DataLoader(encode_dataset['train'], batch_size=64, shuffle=True)
test_loader = DataLoader(encode_dataset['test'], batch_size=64)
# batch_size = 64 is value that ai will read number of image equal that value and then read answer for checking
# shuffle to ensure training data not be a cluster

In [11]:
loader_iter = iter(train_loader)
for i in range(5):
    try:
        batch = next(loader_iter)
        print(f"Batch {i+1} Labels shape:", batch['labels'].shape)
        print(f"Batch {i+1} Labels:", batch['labels'][:10]) # see on first ten value
        # each round will gave 64 image and labels 64 labels
    except StopIteration:
        break

Batch 1 Labels shape: torch.Size([64])
Batch 1 Labels: tensor([5, 4, 2, 4, 6, 3, 3, 6, 5, 0])
Batch 2 Labels shape: torch.Size([64])
Batch 2 Labels: tensor([3, 5, 0, 3, 5, 0, 5, 2, 5, 0])
Batch 3 Labels shape: torch.Size([64])
Batch 3 Labels: tensor([3, 3, 0, 3, 6, 0, 3, 4, 5, 3])
Batch 4 Labels shape: torch.Size([64])
Batch 4 Labels: tensor([3, 0, 4, 2, 3, 0, 5, 5, 0, 2])
Batch 5 Labels shape: torch.Size([64])
Batch 5 Labels: tensor([5, 2, 0, 2, 5, 0, 2, 3, 6, 2])


In [12]:
import torch.nn as nn
import torch.nn.functional as F

In [13]:
class EmotionCNN(nn.Module):
  def __init__(self, num_class=7):
    super(EmotionCNN, self).__init__()
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
    # if data are a text or plain one way data such as text or Audio(signal) have to use Conv1d, if data is image have to use Conv2d because data can be readed x and y at the same time
    # if data contain components of width, high, time such as video have to use Conv3d
    # kernel size is frame that ai use to move 3x3 pixel on the image and move it one by one through layer to creating output from layer
    # The less attention one pays to small details, the more interested one becomes.
    # padding is adding value 0 around the image to prevent value close the edge
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    # pool is used to decompose image from 48x48 to 24x24 by using parameter kernel_size = 2 for the size of small pictures and stride = 2 to move two by two and compose these picture to
    # one picture that size is 24x24 so
    self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
    self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
    self.bn1 = nn.BatchNorm2d(32)
    self.bn2 = nn.BatchNorm2d(64)
    self.bn3 = nn.BatchNorm2d(128)
    # make output from each layer equaly distributed (std = 1) and mean = 0
    self.fc1 = nn.Linear(in_features=128*6*6, out_features=512)
    # the reason using 128*6*6 because input's shape is (128,6,6), 6 come from after through each layer have to decompose by using pool 48->24->12->6
    self.fc2 = nn.Linear(in_features=512, out_features=num_class)
    # use fc for connect all input
    # use Linear instead other because Linear can prevent exploading value and make ai easy for doing backpropagation

  def forward(self, x):
    x = self.pool(F.relu(self.bn1(self.conv1(x))))
    x = self.pool(F.relu(self.bn2(self.conv2(x))))
    x = self.pool(F.relu(self.bn3(self.conv3(x))))
    # use relu to be activation function because it is standard for deep learning. and activation function make ai can learn non-linear relation
    x = x.view(-1, 128*6*6)
    # view using for making reshape data for each image, each row size is 128*6*6 and because we don't know number of row to make total batch we will use -1 to let pytorch find value for this parameter
    x = F.relu(self.fc1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    # dropout is function to choose which node have to remove from calculation
    # x is input that pass throug fucntion dropout, p is "probability" (chance) for zeroing out an element. (e.g., p=0.5 means 50% chance to be set to 0).
    # training using to check that this round have to use dropout function or not when pass to this function, if model.training() it will use dropout otherwise it not use
    x = self.fc2(x)
    return x

In [14]:
# set up Phase
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device)
model = EmotionCNN(num_class=7).to(device)

criteria = nn.CrossEntropyLoss()
# use crosEntroypee to be loss function for this model and it's approprate to using in ai that have to classififer

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Adam is upgraded version of gradient descent, ai will create parameter every learning and let Adam responsesible to change value for each parameter
# lr is learning rate, high value make faster learning but make loss value expload, low value make slow learning but more accuracy


cuda


In [21]:
# traning phase
num_of_training = 20
for round in range(num_of_training):
  model.train()
  runing_loss = 0.0
  loop = tqdm(train_loader)
  for batch in loop:
    images = batch['pixel_values'].to(device)
    labels = batch['labels'].to(device)
    optimizer.zero_grad()
    # remove slope(gradient) value from previous training to prevent expload value when do backpropagation
    output = model(images)
    loss = criteria(output,labels)
    loss.backward()
    optimizer.step()
    runing_loss += loss.item()

    loop.set_description(f"Epoch [{round+1}/{num_of_training}]")

    # set_postfix: ใส่ข้อมูลด้านหลัง (เช่น ค่า Loss ล่าสุด)
    # มันจะอัปเดตตัวเลข real-time โดยไม่ต้อง print บรรทัดใหม่ให้รกจอ
    loop.set_postfix(loss=loss.item())

Epoch [1/20]: 100%|██████████| 449/449 [00:15<00:00, 29.26it/s, loss=0.764]
Epoch [2/20]: 100%|██████████| 449/449 [00:16<00:00, 27.59it/s, loss=0.498]
Epoch [3/20]: 100%|██████████| 449/449 [00:15<00:00, 28.38it/s, loss=0.543]
Epoch [4/20]: 100%|██████████| 449/449 [00:17<00:00, 25.62it/s, loss=0.407]
Epoch [5/20]: 100%|██████████| 449/449 [00:15<00:00, 28.90it/s, loss=0.749]
Epoch [6/20]: 100%|██████████| 449/449 [00:15<00:00, 28.66it/s, loss=0.744]
Epoch [7/20]: 100%|██████████| 449/449 [00:15<00:00, 29.18it/s, loss=0.743]
Epoch [8/20]: 100%|██████████| 449/449 [00:15<00:00, 29.13it/s, loss=1.02]
Epoch [9/20]: 100%|██████████| 449/449 [00:15<00:00, 28.84it/s, loss=0.378]
Epoch [10/20]: 100%|██████████| 449/449 [00:16<00:00, 26.85it/s, loss=0.429]
Epoch [11/20]: 100%|██████████| 449/449 [00:15<00:00, 29.09it/s, loss=0.535]
Epoch [12/20]: 100%|██████████| 449/449 [00:15<00:00, 28.50it/s, loss=0.808]
Epoch [13/20]: 100%|██████████| 449/449 [00:16<00:00, 27.46it/s, loss=0.728]
Epoch [14

In [22]:
encode_testset = ds.with_transform(apply_test_transform)
test_loader = DataLoader(encode_testset['test'], batch_size=64)

In [23]:
def evaluate(model, loader):
  model.eval()
  correct = 0
  total = 0
  loop = tqdm(loader)
  with torch.no_grad():
    for batch in loop:
      images = batch['pixel_values'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(images)
      # such as outputs = [0.1,0.8,0.2]
      max_val, predicted = torch.max(outputs.data,dim=1)
      # dim is dimension for let pytorch find max value from column (0) or row (1)
      # torch.max return two value, first is max value, second is index. index is a label for each image
      total += labels.size(0)
      # return length of first row (have only one row)
      correct += (predicted == labels).sum().item()

      loop.set_postfix(correct=correct, total=total)
  acc = 100 * correct / total
  print(f'Accuracy on test set: {acc:.4f}%')
  # when evaluate don't have to calculate grediant (change gradient is using for adjust model)

In [24]:
evaluate(model,test_loader)

100%|██████████| 113/113 [00:02<00:00, 45.93it/s, correct=4247, total=7178]

Accuracy on test set: 59.1669%





**Overfitting model**  
Because loss rate of training is only 0.165 but accuracy rate is only 56%, I fix it by using dropout function and batch normalization and adding augmentation in transform

In [25]:
torch.save(model.state_dict(), 'emotion_model.pth')
print("Save successfully")

Save successfully
