In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
!unzip "/content/drive/My Drive/internship_data_cleaned.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/internship_data_cleaned/internship_data/male/181000.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181003.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181004.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181012.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181015.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181016.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181018.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181019.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181020.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181023.jpg  
  inflating: /content/internship_data_cleaned/internship_data/male/181028.jpg  
  inflating: /content/internship_data_cleaned/internshi

Analysis of the dataset revealed the following problems:
- presence of duplicates
- class mismatch
- lack of informative signs
- impossibility of visual identification

The FSLint program was used to remove duplicates.
The rest of the problems were partially solved in manual mode.
To fully solve these problems, it is necessary to use machine learning methods.

In [1]:
from glob import glob

data_path = 'internship_data_cleaned/'
total_count = len(glob(data_path + '*/*'))
batch = 16 # Experiments have shown that a small batch is more preferable
workers = 4

# Divide the dataset in ration 0.9/0.05/0.05. 
# We need a validation dataset to evaluate overfitting
train_count = int(0.9 * total_count)
valid_count = int(0.05 * total_count)
test_count = total_count - train_count - valid_count
print('''train_count: {} 
valid_count: {}
test_count:  {}'''.format(train_count, valid_count, test_count))

train_count: 89679 
valid_count: 4982
test_count:  4983


For validation and testing we will resize and normalize dataset. Large image size is not required therefore we use a 128 by 128 image. For training we will use additional transformations. Random rotation and horizontal flip will diversify training data.

In [2]:
from torch.utils.data import random_split, DataLoader
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch import cuda

data_transforms = {'train': transforms.Compose([
                            transforms.Resize([128, 128]),
                            transforms.RandomHorizontalFlip(),
                            transforms.RandomRotation(degrees=15),
                            transforms.ToTensor(),
                            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])]),
                   'test':  transforms.Compose([
                            transforms.Resize([128, 128]),
                            transforms.ToTensor(),
                            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                                 std=[0.229, 0.224, 0.225])])}

# Apply different transformations in two stages
train_dataset = ImageFolder(data_path, transform=data_transforms['train'])
train_dataset, _ = random_split(train_dataset, (train_count, 
                                                valid_count + \
                                                test_count))

test_dataset = ImageFolder(data_path, transform=data_transforms['test'])
_, valid_dataset, test_dataset = random_split(test_dataset, (train_count, 
                                                             valid_count, 
                                                             test_count))

# Create three dataloaders. We don't need to shuffle the test dataset
train_dataset_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True, 
                                  num_workers=workers)  
valid_dataset_loader = DataLoader(valid_dataset, batch_size=batch, shuffle=True, 
                                  num_workers=workers) 
test_dataset_loader  = DataLoader(test_dataset, batch_size=batch, shuffle=False,
                                  num_workers=workers)

dataloaders = {'train': train_dataset_loader, 
               'valid': valid_dataset_loader, 
               'test': test_dataset_loader}

# We will use GPU for fast learning
use_cuda = cuda.is_available()
print('\nuse_cuda: ', use_cuda)


use_cuda:  True


The assigned task is a classification task. We will use CNN architecture. Since it is necessary to define only two classes, max poolling layers will reduce the number of parameters without unnecessary loss of information. Using Dropout layers will reduce the impact of overfitting during long training. At the output, we use the log_softmax function to apply the NLLLoss loss function.

In [3]:
from torch.nn import Module, Conv2d, MaxPool2d, Linear, Dropout
from torch.nn.functional import relu, log_softmax

class Net(Module):
    def __init__(self):
        super(Net, self).__init__()
        # 128x128x3
        self.conv1 = Conv2d(3, 32, kernel_size=3, stride=1, padding=0)
        # 126x126x32
        self.conv2 = Conv2d(32, 32, kernel_size=3, stride=1, padding=0)
        # 124x124x32
        self.max_pool1 = MaxPool2d(2, 2)
        
        # 62x62x32
        self.conv3 = Conv2d(32, 32, kernel_size=3, stride=1, padding=0)
        # 60x60x32
        self.conv4 = Conv2d(32, 32, kernel_size=3, stride=1, padding=0)
        # 58x58x32
        self.max_pool2 = MaxPool2d(2, 2)
         
        # 29x29x32
        self.conv5 = Conv2d(32, 64, kernel_size=3, stride=1, padding=0)
        # 27x27x64
        self.conv6 = Conv2d(64, 64, kernel_size=3, stride=1, padding=0)
        # 25x25x64
        self.max_pool3 = MaxPool2d(2, 2)

        # 13x13x64
        self.conv7 = Conv2d(64, 64, kernel_size=3, stride=1, padding=0)
        # 11x11x64
        self.conv8 = Conv2d(64, 64, kernel_size=3, stride=1, padding=0)
        # 9x9x64
        self.max_pool4 = MaxPool2d(2, 2)

        # 4x4x64
        self.fc1 = Linear(4*4*64, 512)        
        self.drop = Dropout(0.2)
        self.fc2 = Linear(512, 2)
        
    def forward(self, x):
        ## Define forward behavior
        x = relu(self.conv1(x))
        x = relu(self.conv2(x))
        x = self.max_pool1(x)
            
        x = relu(self.conv3(x))
        x = relu(self.conv4(x))
        x = self.max_pool2(x)

        x = relu(self.conv5(x))
        x = relu(self.conv6(x))
        x = self.max_pool3(x)

        x = relu(self.conv7(x))
        x = relu(self.conv8(x))
        x = self.max_pool4(x)
             
        x = x.view(x.size(0), -1)
        x = self.drop(x)
        x = self.fc1(x)
        x = self.drop(x)        
        x = self.fc2(x)
        x = log_softmax(x, -1)
            
        return x

In [4]:
model = Net()
print(model)

if use_cuda:
    model.cuda()

Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (max_pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (max_pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (max_pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv8): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (max_pool4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (drop): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=512, out_features

After several experiments it was found that NLLLoss is more suitable than CrossEntropy function for this problem. It gives a more accuracy. Adam was chosen as the learning algorithm. it's stable and does a fairly good job of finding a more or less optimal solution. experiments have shown that the learning rate 2e-4 is optimal for getting out of local minima.

In [6]:
from torch.optim import Adam
from torch.nn import NLLLoss

criterion = NLLLoss()
optimizer = Adam(model.parameters(), lr=2e-4)

In [7]:
import numpy as np
from torch import save

def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path):
    """returns trained model"""
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf 
    
    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        
        # train the model
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            
            # zero the parameter gradients
            optimizer.zero_grad()
            # calculate the loss
            loss = criterion(model(data), target)
            # compute gradient of the loss
            loss.backward()
            # parameters update
            optimizer.step()
            # update running training loss
            train_loss += (loss.data - train_loss) / (batch_idx + 1)
            if batch_idx % 100 == 0:
                print('train_loss: {:.6f}'.format(train_loss))
        
        # average training loss
        train_loss /= len(loaders['train'].dataset) 
          

        # validate the model
        model.eval()
        for batch_idx, (data, target) in enumerate(loaders['valid']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            # calculate the loss
            loss = criterion(model(data), target)
            
            valid_loss += (loss.data - valid_loss) / (batch_idx + 1)
            
        # average validation loss
        valid_loss /= len(loaders['valid'].dataset)
        
        # print training/validation statistics 
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'
              .format(epoch, train_loss, valid_loss))
        
        # save the model if it's better
        if valid_loss_min > valid_loss:
            save(model.state_dict(), save_path)
            print('Model saved')
            valid_loss_min = valid_loss

    return model

In [None]:
epochs = 100 # We will track overfitting during training
model_path = 'model.pt'
model = train(epochs, dataloaders, model, optimizer, criterion, use_cuda, 
              model_path)

train_loss: 0.687736
train_loss: 0.665375
train_loss: 0.632104
train_loss: 0.596885
train_loss: 0.572926
train_loss: 0.543777
train_loss: 0.521163
train_loss: 0.502668
train_loss: 0.485537
train_loss: 0.468432
train_loss: 0.452945
train_loss: 0.440768
train_loss: 0.428779
train_loss: 0.419689
train_loss: 0.411320
train_loss: 0.400857
train_loss: 0.392831
train_loss: 0.384791
train_loss: 0.376574
train_loss: 0.370265
train_loss: 0.364493
train_loss: 0.357418
train_loss: 0.352599
train_loss: 0.346620
train_loss: 0.340983
train_loss: 0.337823
train_loss: 0.332227
train_loss: 0.327543
train_loss: 0.322873
train_loss: 0.318266
train_loss: 0.313781
train_loss: 0.310404
train_loss: 0.306967
train_loss: 0.303773
train_loss: 0.300309
train_loss: 0.297245
train_loss: 0.294307
train_loss: 0.291013
train_loss: 0.288692
train_loss: 0.285780
train_loss: 0.283879
train_loss: 0.281450
train_loss: 0.279273
train_loss: 0.277214
train_loss: 0.274964
train_loss: 0.272810
train_loss: 0.270367
train_loss: 0

Now let's test the model

In [10]:
from torch import load, device

# Check location for model
if use_cuda:
    location = lambda storage, loc: storage.cuda()
else:
    location = 'cpu'

# Load the model to the device used
model.load_state_dict(load(model_path, map_location=location))

<All keys matched successfully>

In [21]:
def test(loaders, model, criterion, use_cuda):
    # monitor test loss and accuracy
    test_loss = 0.
    correct = 0.
    total = 0.

    model.eval()
    for batch_idx, (data, target) in enumerate(loaders['test']):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update average test loss
        test_loss += ((1 / (batch_idx + 1)) * (loss.data - test_loss))
        # convert output probabilities to predicted class
        pred = output.data.max(1, keepdim=True)[1]
        # compare predictions to true label
        correct += \
            np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
        total += data.size(0)
            
    print('Test Loss: {:.6f}\n'.format(test_loss))

    print('\nTest Accuracy: %2d%% (%2d/%2d)' % (
        100. * correct / total, correct, total))

In [12]:
test(dataloaders, model, criterion, use_cuda)

Test Loss: 0.038978


Test Accuracy: 98% (4928/5001)
