### Preparation

In [1]:
# !pip install facenet_pytorch
!pip install einops

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
root_dir = "/content/drive/MyDrive" # Set appropriate directory
os.chdir(root_dir)

Mounted at /content/drive


In [3]:
!unzip /content/drive/MyDrive/data.zip -d /content

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data/train/F0815/MID3/P08616_face2.jpg  
  inflating: /content/__MACOSX/data/train/F0815/MID3/._P08616_face2.jpg  
  inflating: /content/data/train/F0029/MID6/P00295_face4.jpg  
  inflating: /content/__MACOSX/data/train/F0029/MID6/._P00295_face4.jpg  
  inflating: /content/data/train/F0029/MID6/P00293_face4.jpg  
  inflating: /content/__MACOSX/data/train/F0029/MID6/._P00293_face4.jpg  
  inflating: /content/data/train/F0029/MID6/P00294_face4.jpg  
  inflating: /content/__MACOSX/data/train/F0029/MID6/._P00294_face4.jpg  
  inflating: /content/data/train/F0029/MID6/P00296_face1.jpg  
  inflating: /content/__MACOSX/data/train/F0029/MID6/._P00296_face1.jpg  
  inflating: /content/data/train/F0029/MID1/P00294_face5.jpg  
  inflating: /content/__MACOSX/data/train/F0029/MID1/._P00294_face5.jpg  
  inflating: /content/data/train/F0029/MID1/P00290_face2.jpg  
  inflating: /content/__MACOSX/data/train/F0029/MI

In [4]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from glob import glob
from collections import defaultdict
import torchvision
from random import choice
from torch.utils.data import Dataset
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.nn.functional as F
from einops import rearrange, repeat
from torch.optim.lr_scheduler import ReduceLROnPlateau
# from torch.nn import Parameter
# from facenet_pytorch import InceptionResnetV1

### Config

In [5]:
class Config:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_file_path = "./data/train-relationships/train_relationships.csv"
    train_folders_path = "/content/data/train/"
    val_famillies = "F09"
    test_relationship_file = "/content/data/submissions/sample_submission.csv"
    batch_size = 64
    number_of_epochs = 100

    learning_rate = 0.0005

    MIN_NUM_PATCHES = 16
    pretrained_vits = '/content/drive/MyDrive/face_transformer/Backbone_VITs_Epoch_2_Batch_12000_Time_2021-03-17-04-05_checkpoint.pth'
    pretrained_vit = '/content/drive/MyDrive/face_transformer/Backbone_VIT_Epoch_2_Batch_20000_Time_2021-01-12-16-48_checkpoint.pth'

### Dataset

In [6]:
class KinDataset(Dataset):
    def __init__(self, relations, person_to_images_map, transform=None):
        self.relations = relations
        self.transform = transform
        self.person_to_images_map = person_to_images_map
        self.ppl = list(person_to_images_map.keys())

    def __len__(self):
        return len(self.relations)*2

    def __getitem__(self, idx):

        if idx%2==0: #Positive samples
            p1, p2 = self.relations[idx//2]
            label = 1
        else:          #TODO: better way to sample Negative samples
            while True:
                p1 = choice(self.ppl)
                p2 = choice(self.ppl)
                if p1 != p2 and (p1, p2) not in self.relations and (p2, p1) not in self.relations:
                    break
            label = 0

        path1, path2 = choice(self.person_to_images_map[p1]), choice(self.person_to_images_map[p2])
        img1, img2 = Image.open(path1), Image.open(path2)

        if self.transform:
            img1, img2 = self.transform(img1), self.transform(img2)

        return img1, img2, label

In [7]:
print("Prepare data...")
all_images = glob(Config.train_folders_path + "*/*/*.jpg")

train_images = [x for x in all_images if Config.val_famillies not in x]
val_images = [x for x in all_images if Config.val_famillies in x]

train_person_to_images_map = defaultdict(list)

ppl = [x.split("/")[-3] + "/" + x.split("/")[-2] for x in all_images]

for x in train_images:
    train_person_to_images_map[x.split("/")[-3] + "/" + x.split("/")[-2]].append(x)

val_person_to_images_map = defaultdict(list)

for x in val_images:
    val_person_to_images_map[x.split("/")[-3] + "/" + x.split("/")[-2]].append(x)

relationships = pd.read_csv(Config.train_file_path)
relationships = list(zip(relationships.p1.values, relationships.p2.values))
relationships = [x for x in relationships if x[0] in ppl and x[1] in ppl]

train_relations = [x for x in relationships if Config.val_famillies not in x[0]]
val_relations  = [x for x in relationships if Config.val_famillies in x[0]]

train_transform = transforms.Compose([
    transforms.Resize(130),
    transforms.CenterCrop(112),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize([0., 0., 0.], [1/255., 1/255., 1/255.]),
])
val_transform = transforms.Compose([
    transforms.Resize(130),
    transforms.CenterCrop(112),
    transforms.ToTensor(),
    transforms.Normalize([0., 0., 0.], [1/255., 1/255., 1/255.]),
])

trainset = KinDataset(train_relations, train_person_to_images_map, train_transform)
valset = KinDataset(val_relations, val_person_to_images_map, val_transform)

trainloader = DataLoader(trainset, batch_size=Config.batch_size, shuffle=True)
valloader = DataLoader(valset, batch_size=Config.batch_size, shuffle=False)

Prepare data...


### Model

In [8]:
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.linear_proj = nn.Linear(dim, inner_dim)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, x2=None, alpha=0.8, mask=None):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
        qx2 = self.linear_proj(x2)
        qx2 = rearrange(qx2, 'b n (h d) -> b h n d', h = h)
        q = q * alpha + qx2 * (1-alpha)
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max
        #embed()
        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            mask = mask[:, None, :] * mask[:, :, None]
            dots.masked_fill_(~mask, mask_value)
            del mask

        attn = dots.softmax(dim=-1)

        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)

        return out

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
                Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            ]))
    def forward(self, x, x2, mask = None):
        for attn, ff in self.layers:
            x = attn(x, x2=x2)
            x = ff(x)
        return x

class ViTs_face(nn.Module):
    def __init__(self, *, loss_type, GPU_ID, num_class, image_size, patch_size, ac_patch_size,
                         pad, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * ac_patch_size ** 2
        assert num_patches > Config.MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective (at least 16). Try decreasing your patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.patch_size = patch_size
        self.soft_split = nn.Unfold(kernel_size=(ac_patch_size, ac_patch_size), stride=(self.patch_size, self.patch_size), padding=(pad, pad))


        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
        )
        self.loss_type = loss_type
        self.GPU_ID = GPU_ID
        if self.loss_type == 'None':
            print("no loss for vit_face")

    def preprocess(self, img):
        x = self.soft_split(img).transpose(1, 2)
        x = self.patch_to_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)
        return x

    def forward(self, img, label=None , mask=None, return_lhs=False):
        x = self.preprocess(img)
        x = self.transformer(x, mask)

        last_hidden_state = x.detach()
        # print('transformer_out', x.shape)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        emb = self.mlp_head(x)
        if label is not None:
            x = self.loss(emb, label)
            return x, emb
        elif return_lhs:
            return emb, last_hidden_state
        else:
            return emb

In [9]:
class SiameseNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder1 = self.build_encoder()
        self.encoder2 = self.build_encoder()
        self.sigmoid = nn.Sigmoid()
        self.embed_size = 512
        self.fc = nn.Sequential(
            nn.Linear(self.embed_size*3, self.embed_size*4),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(self.embed_size*4, self.embed_size*1),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(self.embed_size*1, self.embed_size//4),
            nn.ReLU(),
        )
        self.last = nn.Sequential(
            nn.Linear(self.embed_size//4+1,self.embed_size//16),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(self.embed_size//16, 1),
        )

    def build_encoder(self):
        model = ViTs_face(
            loss_type=None,
            GPU_ID=Config.device,
            num_class=93431,
            image_size=112,
            patch_size=8,
            ac_patch_size=12,
            pad=4,
            dim=512,
            depth=20,
            heads=8,
            mlp_dim=2048,
            dropout=0.1,
            emb_dropout=0.1
        )
        model.load_state_dict(torch.load(Config.pretrained_vits, map_location=Config.device), strict=False)
        for name, layer in model.named_parameters():
            if 'linear_proj' in name or '19' in name:
                # print(name)
                layer.requires_grad_(True)
            else:
                layer.requires_grad_(False)
        return model

    def forward(self, img1, img2, mask=None):
        x1 = self.encoder1.preprocess(img1)
        x2 = self.encoder2.preprocess(img2)

        for layer1, layer2 in zip(self.encoder1.transformer.layers, self.encoder2.transformer.layers):
            x1_new = layer1[0](x1, x2=x2, mask=mask)
            x1_new = layer1[1](x1_new)
            x2_new = layer2[0](x2, x2=x1, mask=mask)
            x2_new = layer2[1](x2_new)

        x1_new = x1_new.mean(dim = 1) if self.encoder1.pool == 'mean' else x1_new[:, 0]
        x2_new = x2_new.mean(dim = 1) if self.encoder2.pool == 'mean' else x2_new[:, 0]

        x1_new = self.encoder1.to_latent(x1_new)
        x2_new = self.encoder2.to_latent(x2_new)
        x1 = self.encoder1.mlp_head(x1_new)
        x2 = self.encoder2.mlp_head(x2_new)

        x3 = x1-x2
        x5 = torch.pow(x1,2)
        x6 = torch.pow(x2,2)
        x = torch.cat([x3,x5+x6,x1*x2],dim=-1)
        x = self.fc(x)
        cos_dis=1-F.cosine_similarity(x1,x2,dim=-1)
        x = torch.cat([x,cos_dis.unsqueeze(1)],dim=-1)
        result = self.last(x)
        result=torch.sigmoid(result)
        return result

### Train

In [None]:
def train(net, criterion, optimizer):
    net.train()
    train_loss = 0.0
    running_loss = 0.0
    running_corrects = 0

    for i, batch in enumerate(trainloader):
        optimizer.zero_grad()

        img1, img2, label = batch
        img1, img2, label = img1.to(Config.device), img2.to(Config.device), label.float().view(-1,1).to(Config.device)
        output = net(img1, img2)
        preds = output>0.5

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        running_loss += loss.item()
        running_corrects += torch.sum(preds == (label>0.5))

        step = 100
        if i % step == step-1:
            print(' [{} - {:.2f}%],\ttrain loss: {:.5}'.format(epoch+1, 100*(i+1)/len(trainloader), running_loss/step/200))
            running_loss = 0

    train_loss /= len(trainset)
    running_corrects = running_corrects.item()/len(trainset)
    print('[{}], \ttrain loss: {:.5}\tacc: {:.5}'.format(epoch+1, train_loss, running_corrects))
    return train_loss, running_corrects

In [None]:
def validate(net, criterion, optimizer):
    net.eval()
    val_loss = 0.0
    running_corrects = 0

    for batch in valloader:
        img1, img2, label = batch
        img1, img2, label = img1.to(Config.device), img2.to(Config.device), label.float().view(-1,1).to(Config.device)
        with torch.no_grad():
            output = net(img1, img2)
            preds = output>0.5
            loss = criterion(output, label)

        val_loss += loss.item()
        running_corrects += torch.sum(preds == (label>0.5))

    val_loss /= len(valset)
    running_corrects = running_corrects.item()/len(valset)
    print('[{}], \tval loss: {:.5}\tacc: {:.5}'.format(epoch+1, val_loss, running_corrects))

    return val_loss, running_corrects

In [None]:
print("Initialize network...")
net = SiameseNet().to(Config.device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=Config.learning_rate)
scheduler = ReduceLROnPlateau(optimizer, patience=10)

In [None]:
print("Start training...")

best_val_loss = 1000
best_val_acc = 0.0
best_epoch = 0

history = []
accuracy = []
for epoch in range(Config.number_of_epochs):
    train_loss, train_acc = train(net, criterion, optimizer)
    val_loss, val_acc = validate(net, criterion, optimizer)
    history.append((train_loss, val_loss))
    accuracy.append((train_acc,val_acc))
    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(net.state_dict(), './checkpoints/best_loss_inno2.pth')
        print('saving...')
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(net.state_dict(), './checkpoints/best_acc_inno2.pth')
        print('saving...')

Start training...
[1], 	train loss: 0.010911	acc: 0.49608
[1], 	val loss: 0.011739	acc: 0.5
saving...
saving...
[2], 	train loss: 0.01088	acc: 0.50766
[2], 	val loss: 0.011712	acc: 0.5
saving...
[3], 	train loss: 0.010882	acc: 0.50341
[3], 	val loss: 0.011723	acc: 0.5
[4], 	train loss: 0.010875	acc: 0.5046
[4], 	val loss: 0.011703	acc: 0.5
saving...
[5], 	train loss: 0.010869	acc: 0.50426
[5], 	val loss: 0.011718	acc: 0.5
[6], 	train loss: 0.010867	acc: 0.50221
[6], 	val loss: 0.01165	acc: 0.51689
saving...
saving...
[7], 	train loss: 0.01085	acc: 0.51822
[7], 	val loss: 0.01165	acc: 0.52872
saving...
[8], 	train loss: 0.010831	acc: 0.51907
[8], 	val loss: 0.011259	acc: 0.53547
saving...
saving...
[9], 	train loss: 0.010841	acc: 0.51141
[9], 	val loss: 0.011707	acc: 0.5
[10], 	train loss: 0.010853	acc: 0.50715
[10], 	val loss: 0.011446	acc: 0.54054
saving...
[11], 	train loss: 0.010854	acc: 0.50596
[11], 	val loss: 0.011701	acc: 0.49831
[12], 	train loss: 0.010858	acc: 0.50647
[12], 	v

### Submission

In [10]:
class FamilyTestDataset(Dataset):
    def __init__(self, relations, data_dir, transform):
        """
        Args:
            relations (string): Data frame with the image paths.
            data_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.relations = relations
        self.data_dir = data_dir
        self.transform = transform

    def __len__(self) -> int:
        return len(self.relations)

    def __getpair__(self, idx):
        pair = (
            os.path.join(self.data_dir, self.relations.iloc[idx, 0].split("-")[0]),
            os.path.join(self.data_dir, self.relations.iloc[idx, 0].split("-")[1]),
        )
        return pair

    def __getlabel__(self, idx) -> int:
        return self.relations.iloc[idx, 1]

    def __getitem__(self, idx):
        pair = self.__getpair__(idx)

        im1 = Image.open(pair[0])
        im2 = Image.open(pair[1])

        img1 = self.transform(im1)
        img2 = self.transform(im2)

        return idx, img1, img2
def create_test_dataloader(test_image_dir: str, test_relationship_file: str):
    df = pd.read_csv(test_relationship_file)

    transform = transforms.Compose([
        transforms.Resize(112),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0., 0., 0.],
                             std=[1/255., 1/255., 1/255.])
    ])

    test_dataset = FamilyTestDataset(
        relations=df, data_dir=test_image_dir, transform=transform
    )

    test_loader = DataLoader(
        test_dataset,
        shuffle=True,
        batch_size=200,
    )

    return test_loader

In [15]:
def load_classifier(path_to_model_weights: str):
    model = SiameseNet()
    model.load_state_dict(torch.load(path_to_model_weights))
    return model

def create_submission(path_to_template: str, path_to_save: str, predictions):
    template = pd.read_csv(path_to_template)

    # Remember to save as floats as metric is AUC
    for row, pred in predictions.items():
        template.loc[row, "is_related"] = float(pred)

    template.to_csv(path_or_buf=path_to_save, index=False)
    return


def test_classifier(classifier, test_loader):
    predictions = {}

    classifier.to(Config.device)
    classifier.eval()
    for i, data in enumerate(test_loader):
        row, img1, img2 = data
        row, img1, img2 = row.to(Config.device), img1.to(Config.device), img2.to(Config.device)

        with torch.no_grad():
            output = classifier(img1, img2)

        for j in range(len(row)):
            predictions[row[j].item()] = output[j].item()

    return predictions


if __name__ == "__main__":
    path_to_model_weights = "./checkpoints/best_loss_inno.pth"
    path_to_template = "./data/submissions/sample_submission.csv"
    path_to_save = "./data/submissions/loss_inno.csv"

    classifier = load_classifier(path_to_model_weights)

    test_loader = create_test_dataloader(
        "/content/data/test",
        "/content/data/submissions/sample_submission.csv"
    )

    predictions = test_classifier(
        classifier=classifier, test_loader=test_loader
    )
    print(len(predictions))

    create_submission(
        path_to_template=path_to_template,
        path_to_save=path_to_save,
        predictions=predictions,
    )

5310
