In [15]:
# Weight applied to the reconstruction term inside HibridLoss.
ALPHA = 0.05

# Forwarded to CustomDataset when the dataset is built; its exact effect lives
# in utilities.CustomDataset (presumably an embedding scale factor - TODO confirm).
MULTIPLIER = 100

# Contrastive margin, kept proportional to MULTIPLIER. It is both the margin of
# HibridLoss and the latent-distance threshold used to predict a match.
MARGIN = MULTIPLIER / 5

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import pickle
import pandas as pd
from utilities import CustomDataset, trainLoop, testLoop, save_stats

In [17]:
class SiameseAutoEncoder(nn.Module):
    """Autoencoder whose single encoder/decoder pair is applied to both inputs.

    The encoder maps 768 features down to a 48-feature code through linear
    layers of halving width (768 -> 384 -> 192 -> 96 -> 48); the decoder mirrors
    it back up to 768. An in-place ReLU follows every linear layer except the
    last one of each stack.
    """

    def __init__(self):
        super().__init__()

        widths = (768, 384, 192, 96, 48)
        # Encoder first, decoder second: keeps parameter registration and
        # random-initialisation order encoder-before-decoder.
        self.encoder = self._linear_stack(widths)
        self.decoder = self._linear_stack(widths[::-1])

    @staticmethod
    def _linear_stack(widths):
        """Chain Linear layers over consecutive widths with in-place ReLU in between."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU(True))
        layers.pop()  # the final Linear has no activation after it
        return nn.Sequential(*layers)

    def forward(self, x1, x2):
        """Encode and reconstruct both inputs with the shared weights.

        Args:
            x1: tensor whose last dimension is 768 and that has a dimension 1
                (the norms below reduce over ``dim=1``).
            x2: tensor with the same layout as ``x1``.

        Returns:
            A 3-tuple ``(recon_err_1, recon_err_2, latent_dist)``: the norm of
            ``x1`` minus its reconstruction, the same for ``x2``, and the norm
            of the difference between the two latent codes, each reduced over
            dimension 1.
        """
        code_a = self.encoder(x1)
        rebuilt_a = self.decoder(code_a)
        code_b = self.encoder(x2)
        rebuilt_b = self.decoder(code_b)

        return (
            torch.norm(x1 - rebuilt_a, dim=1),
            torch.norm(x2 - rebuilt_b, dim=1),
            torch.norm(code_a - code_b, dim=1),
        )

In [18]:
class HibridLoss(nn.Module):
    """Contrastive loss on the latent distance plus a weighted reconstruction penalty.

    Per sample the loss is
    ``alpha * (r1**2 + r2**2) + y * d**2 + (1 - y) * max(margin - d, 0)**2``
    where ``r1``/``r2`` are the reconstruction distances, ``d`` the latent
    distance and ``y`` the label; the mean over all samples is returned.
    """

    def __init__(self, alpha, margin):
        """Store the reconstruction weight ``alpha`` and the contrastive ``margin``."""
        super().__init__()
        self.alpha = alpha
        self.margin = margin

    def forward(self, outputs, label):
        """Compute the mean hybrid loss.

        Args:
            outputs: 3-tuple ``(reconstruct_distance1, reconstruct_distance2,
                latent_distance)``.
            label: 1 where the latent distance should shrink, 0 where it
                should be pushed out to at least ``margin``.

        Returns:
            Scalar tensor holding the mean loss.
        """
        recon_a, recon_b, pair_dist = outputs

        reconstruction_term = self.alpha * (recon_a ** 2 + recon_b ** 2)
        # label == 1: any remaining distance is penalised quadratically.
        pull_term = label * pair_dist ** 2
        # label == 0: only distances still inside the margin are penalised.
        shortfall = torch.clamp(self.margin - pair_dist, min=0)
        push_term = (1 - label) * shortfall ** 2

        return (reconstruction_term + pull_term + push_term).mean()

In [19]:
# Folder (relative to the notebook's working directory) holding the prepared artefacts.
SAVES_FOLDER = "saves/"

# Pair table; the columns consumed later are "left_spec_id", "right_spec_id" and "label".
df = pd.read_csv(SAVES_FOLDER + "dataset.csv")

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load this file if it was produced by a trusted step of this project.
# The object is handed to CustomDataset below; presumably it maps a spec id to
# its embedding vector - TODO confirm against the code that writes it.
with open(SAVES_FOLDER + 'id2embedding.pkl', 'rb') as embedding_file:
    id2embedding = pickle.load(embedding_file)

In [20]:
# --- Dataset split and stage-1 (autoencoder) setup ---------------------------
# Tunables gathered in one place instead of being inlined as magic numbers.
SEED = 42                  # fixed so the split, weight init and shuffling repeat across runs
TRAIN_FRACTION = 0.75      # share of samples used for training; the rest is the test set
BATCH_SIZE = 8
AE_LEARNING_RATE = 0.00008

features_cols = ["left_spec_id", "right_spec_id"]
target_col = "label"

# Seed the global RNG: it drives the model's weight initialisation and the
# shuffling order of the training DataLoader.
torch.manual_seed(SEED)

# The trailing positional True is a CustomDataset option defined in utilities
# (meaning not visible in this notebook - TODO confirm).
dataset = CustomDataset(df, features_cols, target_col, id2embedding, MULTIPLIER, True)

train_size = int(TRAIN_FRACTION * len(dataset))
test_size = len(dataset) - train_size
# A dedicated generator keeps the train/test partition identical no matter how
# many random numbers were drawn from the global RNG before this point.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order also avoids
# consuming global RNG state between epochs.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

autoencoder = SiameseAutoEncoder()

criterion = HibridLoss(alpha=ALPHA, margin=MARGIN)
optimizer = optim.Adam(autoencoder.parameters(), lr=AE_LEARNING_RATE)

In [21]:
# --- Stage 1: train the Siamese autoencoder ----------------------------------
NUM_EPOCHS = 100
stats = {}  # handed to testLoop every epoch, then to save_stats after the loop


def pred_function(x):
    """Predict a positive pair when the latent distance (index 2 of the model output) is below MARGIN."""
    return x[2] < MARGIN


for epoch in range(NUM_EPOCHS):
    loss = trainLoop(autoencoder, optimizer, criterion, train_loader)
    print(f"Epoch {epoch + 1}, Loss: {loss}")
    testLoop(autoencoder, criterion, test_loader, pred_function, stats)

save_stats("AutoEncoder_V1_bert_base", stats)

Epoch 1, Loss: 500.19938771565756
Test Loss: 220.5566, Test Accuracy: 0.4738
Precision: 0.4105, Recall: 0.9463, F1-score: 0.5726
TP: 282, FP: 405, TN: 97, FN: 16
Epoch 2, Loss: 205.0536148325602
Test Loss: 195.3644, Test Accuracy: 0.5038
Precision: 0.4267, Recall: 0.9664, F1-score: 0.5920
TP: 288, FP: 387, TN: 115, FN: 10
Epoch 3, Loss: 177.2160090382894
Test Loss: 176.3970, Test Accuracy: 0.5262
Precision: 0.4370, Recall: 0.9430, F1-score: 0.5972
TP: 281, FP: 362, TN: 140, FN: 17
Epoch 4, Loss: 157.28212776184083
Test Loss: 161.2208, Test Accuracy: 0.4988
Precision: 0.4257, Recall: 0.9899, F1-score: 0.5954
TP: 295, FP: 398, TN: 104, FN: 3
Epoch 5, Loss: 148.0189002863566
Test Loss: 157.4114, Test Accuracy: 0.5425
Precision: 0.4474, Recall: 0.9698, F1-score: 0.6123
TP: 289, FP: 357, TN: 145, FN: 9
Epoch 6, Loss: 142.15649202982584
Test Loss: 151.0299, Test Accuracy: 0.5312
Precision: 0.4423, Recall: 0.9899, F1-score: 0.6114
TP: 295, FP: 372, TN: 130, FN: 3
Epoch 7, Loss: 136.7199174753

In [22]:
class MLP(nn.Module):
    """Single-hidden-layer perceptron whose output is squashed by a sigmoid.

    Args:
        input_size: number of input features of the first linear layer.
        hidden_size: width of the hidden layer.
        output_size: number of output features of the second linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return ``sigmoid(fc2(relu(fc1(x))))``, i.e. values in the range [0, 1]."""
        hidden = self.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return torch.sigmoid(logits)

In [23]:
class EncoderOnlyMLP(nn.Module):
    """Pair classifier: a frozen encoder followed by a trainable head.

    Both inputs go through the same encoder; the element-wise absolute
    difference of the two codes is fed to ``mlp``.

    Args:
        encoder: module mapping an input to its latent code. Its parameters
            are frozen in place, so the module object passed in (and any other
            model sharing it) stops receiving gradients as well.
        mlp: head applied to the absolute difference of the two codes.
    """

    def __init__(self, encoder, mlp):
        super().__init__()
        self.encoder = encoder
        self.mlp = mlp

        # Freeze the encoder so only the head is trained.
        for param in self.encoder.parameters():
            param.requires_grad = False

    def forward(self, x1, x2):
        """Score a pair of inputs.

        Returns:
            The head's output with a trailing size-1 dimension removed, e.g.
            shape ``(batch,)`` when the head emits ``(batch, 1)``.
        """
        code_a = self.encoder(x1)
        code_b = self.encoder(x2)
        scores = self.mlp(torch.abs(code_a - code_b))
        # Drop only the trailing feature dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one sample and return a 0-d
        # tensor that no longer lines up with a (1,)-shaped label.
        return scores.squeeze(-1)

In [24]:
class ClassificationLoss(nn.Module):
    """Mean binary cross-entropy on probabilities, guarded against ``log(0)``.

    Args:
        eps: probabilities are clamped to ``[eps, 1 - eps]`` before the
            logarithm is taken. Must satisfy ``0 < eps < 0.5``.

    Raises:
        ValueError: if ``eps`` is outside ``(0, 0.5)``.
    """

    def __init__(self, eps=1e-7):
        super().__init__()
        if not 0 < eps < 0.5:
            raise ValueError("eps must be in the open interval (0, 0.5)")
        self.eps = eps

    def forward(self, outputs, label):
        """Return ``mean(-(y * log(p) + (1 - y) * log(1 - p)))``.

        Args:
            outputs: predicted probabilities ``p`` (e.g. sigmoid outputs).
            label: targets ``y`` broadcastable against ``outputs``.
        """
        # A saturated sigmoid yields exactly 0.0 or 1.0. Without the clamp,
        # log() returns -inf and the masked term 0 * -inf becomes NaN, which
        # poisons the whole batch mean. Clamping the probability (rather than
        # the log output) also keeps the backward pass finite.
        probs = torch.clamp(outputs, min=self.eps, max=1 - self.eps)
        positive_term = label * torch.log(probs)
        negative_term = (1 - label) * torch.log(1 - probs)
        return torch.mean(-(positive_term + negative_term))

In [25]:
# --- Stage 2 setup: classifier head on top of the trained, frozen encoder ----
# input_size=48 matches the width of the encoder's final Linear layer.
mlp = MLP(input_size=48, hidden_size=256, output_size=1)
encoder_only_mlp = EncoderOnlyMLP(encoder=autoencoder.encoder, mlp=mlp)

# NOTE: `criterion` and `optimizer` are rebound here and no longer refer to the
# autoencoder's loss/optimizer; re-running the stage-1 training cell after this
# one would pick up these objects instead.
criterion = ClassificationLoss()
optimizer = optim.SGD(encoder_only_mlp.parameters(), lr=0.00025)

In [26]:
# --- Stage 2: train the classifier head --------------------------------------
NUM_EPOCHS = 100
stats = {}  # fresh container; handed to testLoop every epoch


def pred_function(x):
    """Predict a positive pair when the model output exceeds 0.5."""
    return x > 0.5


for epoch in range(NUM_EPOCHS):
    loss = trainLoop(encoder_only_mlp, optimizer, criterion, train_loader)
    print(f"Epoch {epoch + 1}, Loss: {loss}")
    testLoop(encoder_only_mlp, criterion, test_loader, pred_function, stats)

Epoch 1, Loss: 0.35346180215477946
Test Loss: 0.3686, Test Accuracy: 0.6275
Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000
TP: 0, FP: 0, TN: 502, FN: 298
Epoch 2, Loss: 0.29436530464639266
Test Loss: 0.3657, Test Accuracy: 0.6275
Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000
TP: 0, FP: 0, TN: 502, FN: 298
Epoch 3, Loss: 0.2873201160815855
Test Loss: 0.3616, Test Accuracy: 0.6275
Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000
TP: 0, FP: 0, TN: 502, FN: 298
Epoch 4, Loss: 0.28163700498019656
Test Loss: 0.3566, Test Accuracy: 0.6362
Precision: 1.0000, Recall: 0.0235, F1-score: 0.0459
TP: 7, FP: 0, TN: 502, FN: 291
Epoch 5, Loss: 0.2760693063400686
Test Loss: 0.3516, Test Accuracy: 0.6450
Precision: 1.0000, Recall: 0.0470, F1-score: 0.0897
TP: 14, FP: 0, TN: 502, FN: 284
Epoch 6, Loss: 0.27077720352758966
Test Loss: 0.3464, Test Accuracy: 0.6538
Precision: 1.0000, Recall: 0.0705, F1-score: 0.1317
TP: 21, FP: 0, TN: 502, FN: 277
Epoch 7, Loss: 0.26564648034982385
Test Loss: 

In [27]:
# Hand the `stats` dict that was passed to testLoop during the classifier-head
# run to save_stats under this run name (how/where it is stored is decided by
# utilities.save_stats, which is not visible in this notebook).
save_stats("AutoEncoder_V2_bert_base", stats)