# **EfficientNet-B3 Siamese-Network Thresholds**
* [Imports](#section-one)
* [Load model](#section-two)
* [Check thresholds](#section-three)
    - [IDs](#sub-section-three-one)
    - [Driving Licenses](#sub-section-three-two)
    - [Passports](#sub-section-three-three)

<a id="section-one"></a>
## **Imports**

In [22]:
import torch
import torchvision
from torch import nn
from efficientnet_pytorch import EfficientNet
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import random
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2
import time
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from tqdm.notebook import tqdm_notebook
from metrics import plot, get_metrics, get_best_thresholds, validate_thresh

%matplotlib inline

In [23]:
ID = 'id'
DRIVING_LICENSE = 'driving_license'
PASSPORT = 'passport'

In [24]:
img_size = 200
n_epochs = 20
test_size = 0.2
BATCH_SIZE = 8

device = torch.device("cpu")

<a id="section-two"></a>
## **Load model**

In [25]:
def get_transforms(apply_augmentations=True):
    if apply_augmentations:
        return A.Compose([
            A.Rotate(limit=20),
            A.Flip(),
            A.OneOf([
                A.HueSaturationValue(), 
                A.RandomBrightnessContrast(),
            ], p=0.4),
            A.OneOf([
                A.Blur(blur_limit=3),
                A.MedianBlur(blur_limit=3),
                A.GaussNoise()
                ],p=0.4),
            A.Normalize(p=1.0),
            A.Resize(height=img_size, width=img_size, p=1),
            ToTensorV2(p=1.0),
        ], p=1.0)

    else:
        return A.Compose([
            A.Resize(height=img_size, width=img_size, p=1),
            A.Normalize(p=1.0),
            ToTensorV2(p=1.0),
        ])

In [26]:
class TripletsDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df
    
    def __get_image(self, image_path, aug_prob=0.7):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        prob = random.uniform(0.0, 1.0)
        transforms = get_transforms(True if prob<=aug_prob else False)
        transformed = transforms(image=image)
        image = transformed['image']
        
        return image
            
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        anchor_path = row['anchor']
        positive_path = row['positive']
        negative_path = row['negative']
        
        return self.__get_image(anchor_path), self.__get_image(positive_path), self.__get_image(negative_path)
    
    def __len__(self):
        return len(self.df)

In [27]:
def data_loader(dataset, train):
    if train:
        sampler = RandomSampler(dataset)
    else:
        sampler = SequentialSampler(dataset)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=BATCH_SIZE,
                                       sampler=sampler,
                                       #pin_memory=False,
                                       #drop_last=True,
                                       shuffle=False)

In [28]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [29]:
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        
        self.base_cnn = EfficientNet.from_pretrained('efficientnet-b3') # 1536
        self.fc = nn.Sequential(nn.Linear(1536 , 512),
                                nn.BatchNorm1d(512),
                                nn.Dropout(0.3),
                                nn.Linear(512 , 256))
    
    def _get_vector(self, x):
        x = self.base_cnn.extract_features(x)
        x = self.base_cnn._avg_pooling(x)
        x = x.flatten(start_dim=1)
        x = self.base_cnn._dropout(x)
        x = self.fc(x)

        return x

    def forward(self, anchor, positive, negative):
        return self._get_vector(anchor), self._get_vector(positive), self._get_vector(negative)

In [30]:
class SiameseModel:
    def __init__(self, verbose=True):
        self.model = SiameseNetwork().to(device)
        
    def compile(self, loss_fn, path=None):
        self.loss_fn = loss_fn
        if path is not None:
            self.load(path)
            
    # load checkpoint from path
    def load(self, path):
        map_location = device.type if device.type=='cpu' else None
        self.model.load_state_dict(torch.load(path, map_location=map_location)['model_state_dict'])
        self.model.eval()
    
    
    def get_vector(self, image, dim=1):        
        vector = self.model._get_vector(image.to(device).float())
        if dim==1:
            return vector[0].cpu()
        
        return vector.cpu()
    
    def get_loss(self, input_data, true_data):
        with torch.no_grad():
            input_vector = self.get_vector(input_data, dim=None)
            true_vector = self.get_vector(true_data, dim=None)

            loss = self.loss_fn(input_vector.to(device), true_vector.to(device))
            
        return loss.detach().item()
    
    def is_match(self, input_doc, doc_vector):
        if not torch.is_tensor(doc_vector):
            if isinstance(doc_vector, list):
                doc_vector = np.array(doc_vector)
            assert isinstance(doc_vector, np.ndarray), f'Expected vector input to be of a list/numpy array/torch tensor, but got {type(doc_vector)}.'
            doc_vector = torch.as_tensor(doc_vector)
        
        with torch.no_grad():
            input_vector = self.get_vector(input_doc, dim=None)

            loss = self.loss_fn(input_vector.to(device), doc_vector.to(device)).detach().item()
        return loss   
       # if loss <= THRESHOLD:
       #     return True
        #return False

In [31]:
doc_siamese_model = SiameseModel()
loss_fn = nn.PairwiseDistance()
model_path = r'C:\Users\User\Desktop\Final project\SecureVotingSystem\src\static_files\models\siamese_model.bin'
doc_siamese_model.compile(loss_fn, path=model_path)

Loaded pretrained weights for efficientnet-b3


<a id="section-three"></a>
## **Check thresholds**

In [38]:
true_data_dir = r'C:\Users\User\Desktop\Final project\doc verification\test docs'
dest_dir = r'D:\computer vision and deep learning\final project\doc generation\doc_siamese_thresh'

In [14]:
def get_image(image_path, apply_transforms=True):
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    assert image is not None, f'Document not found in path "{image_path}".'
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
    transforms = get_transforms(apply_transforms)
    transformed = transforms(image=image)
    image = transformed['image']
    
    if len(image.shape)==2:
            image = image[None, :, :]
    if len(image.shape)==3:
        image = image[None, :, :, :]

    assert len(image.shape)==4, f'Expected input data of 4 dimensions, but got {len(image.shape)}.'
    return image
    
def get_distances(test_docs_dir, doc_type):
    distances = []
    docs = ['id', 'dl', 'ps']
    assert doc_type in docs, f'Expected doc_type to be one of [id, dl, ps] but got {doc_type}.'
    
    for dir_name in tqdm_notebook(os.listdir(test_docs_dir)):
        dir_path = os.path.join(test_docs_dir, dir_name)
        docs = os.listdir(dir_path)
        
        if doc_type+'.jpg' in docs:
            anchor = get_image(os.path.join(dir_path, doc_type+'.jpg'), False)
            positives = [os.path.join(os.path.join(test_docs_dir, dir_path), other) for other in os.listdir(dir_path) if other.startswith(doc_type+'_')]
            negatives = [os.path.join(os.path.join(test_docs_dir, dir_path), other) for other in os.listdir(dir_path) if not other.startswith(doc_type)]
            other_dir = random.choice(os.listdir(test_docs_dir))
            while other_dir!=dir_name:
                other_dir = random.choice(os.listdir(test_docs_dir))
            other_negatives = [os.path.join(os.path.join(test_docs_dir, other_dir), other) for other in os.listdir(os.path.join(test_docs_dir, other_dir)) if other.startswith(doc_type)]
            while len(other_negatives)==0:
                other_dir = random.choice(os.listdir(test_docs_dir))
                while other_dir!=dir_name:
                    other_dir = random.choice(os.listdir(test_docs_dir))
                other_negatives = [os.path.join(os.path.join(test_docs_dir, other_dir), other) for other in os.listdir(os.path.join(test_docs_dir, other_dir)) if other.startswith(doc_type)]

            for positive in positives:
                true_dist = doc_siamese_model.get_loss(anchor, get_image(positive, False))
                distances.append({'dist':true_dist, 'match':True})

            for negative in negatives:
                false_dist = doc_siamese_model.get_loss(anchor, get_image(negative, False))
                distances.append({'dist':false_dist, 'match':False})

            for negative in other_negatives:
                false_dist = doc_siamese_model.get_loss(anchor, get_image(negative, False))
                distances.append({'dist':false_dist, 'match':False})
    
    return distances

<a id="sub-section-three-one"></a>
### **IDs**

In [15]:
id_distances = get_distances(true_data_dir, 'id')

  0%|          | 0/11 [00:00<?, ?it/s]

In [16]:
id_distances_df = pd.DataFrame(id_distances)
id_distances_df

Unnamed: 0,dist,match
0,18.025276,True
1,0.000016,False
2,18.025276,False
3,16.114798,True
4,14.629186,True
...,...,...
123,21.175018,False
124,17.857195,False
125,25.386393,False
126,26.401579,False


In [89]:
id_thresholds = pd.read_csv(os.path.join(os.path.join(dest_dir, 'test_data'), 'id_best_thresholds.csv'))
thresholds = []
for index, row in id_thresholds.iterrows():
    thresholds.append({'dist':row['dist_thresh']})

In [91]:
true_match = id_distances_df[id_distances_df['match']==True]
false_match = id_distances_df[id_distances_df['match']==False]
id_thresholds = validate_thresh(true_match.drop(columns='match'), false_match.drop(columns='match'), thresholds)
id_thresholds

Unnamed: 0,dist,tps%,fns%,tns%,fps%,accuracy,f1_score,precision,recall
0,13.530822,26.923077,73.076923,81.372549,18.627451,0.703125,0.269231,0.269231,0.269231
1,17.278098,53.846154,46.153846,69.607843,30.392157,0.664062,0.394366,0.311111,0.538462
2,21.025374,65.384615,34.615385,54.901961,45.098039,0.570312,0.382022,0.269841,0.653846
3,9.783546,3.846154,96.153846,90.196078,9.803922,0.726562,0.054054,0.090909,0.038462
4,24.77265,88.461538,11.538462,33.333333,66.666667,0.445312,0.393162,0.252747,0.884615


In [92]:
id_distances_df.to_csv(os.path.join(os.path.join(dest_dir, 'true_data'), 'id_distances.csv'), index=False)
id_thresholds.to_csv(os.path.join(os.path.join(dest_dir, 'true_data'), 'id_thresholds.csv'), index=False)

<a id="sub-section-three-two"></a>

### **Driving Licenses**

In [93]:
driving_license_distances = get_distances(true_data_dir, 'dl')

  0%|          | 0/11 [00:00<?, ?it/s]

In [94]:
driving_license_distances_df = pd.DataFrame(driving_license_distances)
driving_license_distances_df

Unnamed: 0,dist,match
0,13.245131,True
1,27.404432,False
2,29.820847,False
3,31.604755,False
4,0.000016,False
...,...,...
139,0.000016,False
140,25.750244,False
141,25.505674,False
142,23.378250,False


In [96]:
driving_license_thresholds = pd.read_csv(os.path.join(os.path.join(dest_dir, 'test_data'), 'driving_license_best_thresholds.csv'))
thresholds = []
for index, row in driving_license_thresholds.iterrows():
    thresholds.append({'dist':row['dist_thresh']})

In [97]:
true_match = driving_license_distances_df[driving_license_distances_df['match']==True]
false_match = driving_license_distances_df[driving_license_distances_df['match']==False]
driving_license_thresholds = validate_thresh(true_match.drop(columns='match'), false_match.drop(columns='match'), thresholds)
driving_license_thresholds

Unnamed: 0,dist,tps%,fns%,tns%,fps%,accuracy,f1_score,precision,recall
0,12.742621,23.076923,76.923077,85.59322,14.40678,0.743056,0.244898,0.26087,0.230769
1,16.223034,34.615385,65.384615,78.813559,21.186441,0.708333,0.3,0.264706,0.346154
2,19.703447,46.153846,53.846154,73.728814,26.271186,0.6875,0.347826,0.27907,0.461538
3,9.262208,3.846154,96.153846,91.525424,8.474576,0.756944,0.054054,0.090909,0.038462
4,23.18386,53.846154,46.153846,65.254237,34.745763,0.631944,0.345679,0.254545,0.538462


In [98]:
driving_license_distances_df.to_csv(os.path.join(os.path.join(dest_dir, 'true_data'), 'driving_license_distances.csv'), index=False)
driving_license_thresholds.to_csv(os.path.join(os.path.join(dest_dir, 'true_data'), 'driving_license_thresholds.csv'), index=False)

<a id="sub-section-three-three"></a>
## **Passports**

In [99]:
passport_distances = get_distances(true_data_dir, 'ps')

  0%|          | 0/11 [00:00<?, ?it/s]

In [101]:
passport_distances_df = pd.DataFrame(passport_distances)
passport_distances_df

Unnamed: 0,dist,match
0,7.115206,True
1,26.980181,True
2,27.404436,False
3,22.193275,False
4,0.000016,False
...,...,...
116,18.008844,False
117,15.963090,False
118,18.372381,False
119,19.420849,False


In [102]:
passport_thresholds = pd.read_csv(os.path.join(os.path.join(dest_dir, 'test_data'), 'passport_best_thresholds.csv'))
thresholds = []
for index, row in passport_thresholds.iterrows():
    thresholds.append({'dist':row['dist_thresh']})

In [103]:
true_match = passport_distances_df[passport_distances_df['match']==True]
false_match = passport_distances_df[passport_distances_df['match']==False]
passport_thresholds = validate_thresh(true_match.drop(columns='match'), false_match.drop(columns='match'), thresholds)
passport_thresholds

Unnamed: 0,dist,tps%,fns%,tns%,fps%,accuracy,f1_score,precision,recall
0,12.369255,16.666667,83.333333,87.912088,12.087912,0.702479,0.217391,0.3125,0.166667
1,15.672237,33.333333,66.666667,81.318681,18.681319,0.694215,0.350877,0.37037,0.333333
2,18.975218,60.0,40.0,63.736264,36.263736,0.628099,0.444444,0.352941,0.6
3,9.066273,13.333333,86.666667,89.010989,10.989011,0.702479,0.181818,0.285714,0.133333
4,22.2782,73.333333,26.666667,48.351648,51.648352,0.545455,0.444444,0.318841,0.733333


In [104]:
passport_distances_df.to_csv(os.path.join(os.path.join(dest_dir, 'true_data'), 'passport_distances.csv'), index=False)
passport_thresholds.to_csv(os.path.join(os.path.join(dest_dir, 'true_data'), 'passport_thresholds.csv'), index=False)