In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModel
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

SEED = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load fine-tuned embeddings

In [50]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
val_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
val_df = val_df.head(5000)
test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

In [51]:
BASE_MODEL = 'ai-forever/sbert_large_nlu_ru'
TUNED_MODEL = 'models/sbert-v2/checkpoint-814'


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(TUNED_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [52]:
texts = df['model_annotation'].to_list()
X_train = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])

val_texts = val_df['annotation_fastconformer'].to_list()
X_val = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in val_texts])

test_texts = test_df['annotation_fastconformer'].to_list()
X_test = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in test_texts])

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(6508, 1024)
(5000, 1024)
(10000, 1024)


In [53]:
np.save('data/ft-embed/train-embeddings.npy', X_train)
np.save('data/ft-embed/validation-embeddings.npy', X_val)
np.save('data/ft-embed/test-embeddings.npy', X_test)

### load general embeddings

In [101]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
val_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
val_df = val_df.head(5000)
test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

In [102]:
BASE_MODEL = 'ai-forever/sbert_large_nlu_ru'

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
embed_model = AutoModel.from_pretrained(BASE_MODEL)
embed_model.to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [103]:
texts = df['model_annotation'].to_list()
X_train = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in texts])

val_texts = val_df['annotation_fastconformer'].to_list()
X_val = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in val_texts])

test_texts = test_df['annotation_fastconformer'].to_list()
X_test = np.array([embed_bert_cls(text, embed_model, tokenizer) for text in test_texts])

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(6508, 1024)
(5000, 1024)
(10000, 1024)


In [105]:
np.save('data/gen-embed/train-embeddings.npy', X_train)
np.save('data/gen-embed/validation-embeddings.npy', X_val)
np.save('data/gen-embed/test-embeddings.npy', X_test)

### general embeddings + catboost

In [2]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
val_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
val_df = val_df.head(5000)

In [3]:
X_train = np.load('data/gen-embed/train-embeddings.npy')
y_train = df['label'].to_list()

X_val = np.load('data/gen-embed/validation-embeddings.npy')
y_val = val_df['label'].to_list()

print(X_train.shape)
print(X_val.shape)

(6508, 1024)
(5000, 1024)


In [4]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=10, 
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f35aa249b90>

evaluation

In [5]:
import evaluate

test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

X_test = np.load('data/gen-embed/test-embeddings.npy')
y_test = test_df['label'].to_list()

preds = model.predict(X_test)

metric = evaluate.load('accuracy')
metric.compute(references=y_test, predictions=preds)

{'accuracy': 0.7075}

### fine-tuned embeddings + catboost

In [7]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
val_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
val_df = val_df.head(5000)

In [8]:
texts = df['model_annotation'].to_list()
X_train = np.load('data/ft-embed/train-embeddings.npy')
y_train = df['label'].to_list()

val_texts = val_df['annotation_fastconformer'].to_list()
X_val = np.load('data/ft-embed/validation-embeddings.npy')
y_val = val_df['label'].to_list()

print(X_train.shape)
print(X_val.shape)

(6508, 1024)
(5000, 1024)


In [17]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.002, loss_function='Logloss',
                           custom_metric=['AUC'], depth=4, l2_leaf_reg=20,
                           random_seed=SEED, task_type="GPU", devices='0')

model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f35ab4cca90>

evaluation

In [18]:
import evaluate

test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

X_test = np.load('data/ft-embed/test-embeddings.npy')
y_test = test_df['label'].to_list()

preds = model.predict(X_test)

metric = evaluate.load('accuracy')
metric.compute(references=y_test, predictions=preds)

{'accuracy': 0.7457}

### CNN

In [95]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
val_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
val_df = val_df.head(5000)

In [96]:
X_train = np.load('data/ft-embed/train-embeddings.npy')
y_train = df['label'].to_list()

X_val = np.load('data/ft-embed/validation-embeddings.npy')
y_val = val_df['label'].to_list()

print(X_train.shape)
print(X_val.shape)

(6508, 1024)
(5000, 1024)


In [97]:
class MyDataset (torch.utils.data.Dataset):
    def __init__ (self, X, y):
        self.X = torch.Tensor(X.reshape((-1, 32, 32))[:, np.newaxis, :, :])
        self.y = torch.Tensor(y).to(torch.int8).tolist()
    
    def __len__(self):
        return self.X.shape[0]
    def __getitem__ (self, index):
        return (self.X[index], self.y[index])

In [98]:
train_set = MyDataset(X_train, y_train)
val_set = MyDataset(X_val, y_val)

batch_size = 16
num_workers = 4
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                           shuffle=True, num_workers=num_workers)

val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size,
                                         shuffle=True, num_workers=num_workers)

In [99]:
class ConvNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # 32 x 32 x 1
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, 
                                     stride=1, padding='same')
        # 32 x 32 x 16
        self.pool1 = torch.nn.MaxPool2d(kernel_size=(2, 2), stride=None, padding=0)
        # 16 x 16 x 16
        self.conv2 = torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3,
                                     stride=1, padding='same')
        # 16 x 16 x 32
        self.pool2 = torch.nn.MaxPool2d(kernel_size=(2, 2), stride=None, padding=0)
        # 8 x 8 x 32
        self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3,
                                     stride=1, padding='same')
        # 8 x 8 x 64
        self.pool3 = torch.nn.MaxPool2d(kernel_size=(2, 2), stride=None, padding=0)
        # 4 x 4 x 64
        self.fc = torch.nn.Linear(64 * 4 * 4, 2)

    def forward(self, x):
        # x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool1(self.conv1(x))
        x = self.pool2(self.conv2(x))
        x = self.pool3(self.conv3(x))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        # x = F.relu(self.fc(x))
        x = self.fc(x)
        return x

In [100]:
from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights

class EfficientNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # 32 x 32 x 1
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3, 
                                     stride=1, padding='same')
        # 32 x 32 x 3
        self.efficientnet = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.DEFAULT)
        self.fc = torch.nn.Linear(self.efficientnet.classifier[1].out_features, 2)

    def forward(self, x):
        # x = self.pool1(F.relu(self.conv1(x)))
        x = self.conv1(x)
        x = self.efficientnet(x)
        x = self.fc(x)
        return x

In [101]:
import time

def train(net, train_dataloader, val_dataloader, criterion, optimizer, 
          scheduler=None, epochs=10, device='cpu', checkpoint_epochs=10):
    start = time.time()
    print(f'Training for {epochs} epochs on {device}')
    
    for epoch in range(1,epochs+1):
        print(f"Epoch {epoch}/{epochs}")
        
        # put network in train mode for Dropout and Batch Normalization
        net.train()
        # loss and accuracy tensors are on the GPU to avoid data transfers
        train_loss = torch.tensor(0., device=device)  
        train_accuracy = torch.tensor(0., device=device)
        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)
            preds = net(X)
            loss = criterion(preds, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                train_loss += loss * train_dataloader.batch_size
                train_accuracy += (torch.argmax(preds, dim=1) == y).sum()
        
        if val_dataloader is not None:
            # put network in train mode for Dropout and Batch Normalization
            net.eval()
            valid_loss = torch.tensor(0., device=device)
            valid_accuracy = torch.tensor(0., device=device)
            with torch.no_grad():
                for X, y in val_dataloader:
                    X = X.to(device)
                    y = y.to(device)
                    preds = net(X)
                    loss = criterion(preds, y)

                    valid_loss += loss * val_dataloader.batch_size
                    valid_accuracy += (torch.argmax(preds, dim=1) == y).sum()
        
        if scheduler is not None: 
            scheduler.step()
            
        print(f'Training loss: {train_loss/len(train_dataloader.dataset):.2f}')
        print(f'Training accuracy: {100*train_accuracy/len(train_dataloader.dataset):.2f}')
        
        if val_dataloader is not None:
            print(f'Valid loss: {valid_loss/len(val_dataloader.dataset):.2f}')
            print(f'Valid accuracy: {100*valid_accuracy/len(val_dataloader.dataset):.2f}')
        
        if epoch%checkpoint_epochs==0:
            torch.save({
                'epoch': epoch,
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, './checkpoint.pth.tar')
        
        print()
    
    end = time.time()
    print(f'Total training time: {end-start:.1f} seconds')
    return net

In [103]:
# net = ConvNet().to(device)
net = EfficientNet().to(device)

efficientnet_params = [param for name, param in net.named_parameters() 
                       if 'fc' not in str(name) and 'conv1' not in str(name)]
added_params = [param for name, param in net.named_parameters() 
                if 'fc' in str(name) or 'conv1' in str(name)]

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([{'params':efficientnet_params},
                              {'params': added_params, 'lr': 1e-6}],
                             lr=1e-5)

train(net, train_loader, val_loader, criterion, optimizer, epochs=25, device=device)

Training for 25 epochs on cuda
Epoch 1/25
Training loss: 0.78
Training accuracy: 53.90
Valid loss: 5.25
Valid accuracy: 56.16

Epoch 2/25
Training loss: 0.71
Training accuracy: 58.24
Valid loss: 4.20
Valid accuracy: 59.36

Epoch 3/25
Training loss: 0.68
Training accuracy: 60.03
Valid loss: 189.95
Valid accuracy: 59.86

Epoch 4/25
Training loss: 0.65
Training accuracy: 63.32
Valid loss: 0.95
Valid accuracy: 61.52

Epoch 5/25
Training loss: 0.63
Training accuracy: 65.75
Valid loss: 4.86
Valid accuracy: 63.08

Epoch 6/25
Training loss: 0.60
Training accuracy: 68.67
Valid loss: 9.73
Valid accuracy: 63.16

Epoch 7/25
Training loss: 0.59
Training accuracy: 69.11
Valid loss: 2.15
Valid accuracy: 64.70

Epoch 8/25
Training loss: 0.57
Training accuracy: 71.68
Valid loss: 4.40
Valid accuracy: 64.74

Epoch 9/25
Training loss: 0.56
Training accuracy: 72.03
Valid loss: 1.48
Valid accuracy: 66.34

Epoch 10/25
Training loss: 0.54
Training accuracy: 73.52
Valid loss: 6.36
Valid accuracy: 66.80

Epoch 

EfficientNet(
  (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (efficientnet): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): FusedMBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
          )
          (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        )
        (1): FusedMBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(24, 24, kernel_size=(3,

validation

In [104]:
test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

X_test = np.load('data/ft-embed/test-embeddings.npy')
y_test = test_df['label'].to_list()
test_set = MyDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_set.X, batch_size=16,
                                          shuffle=False, num_workers=2)

In [105]:
def predict(net, test_dataloader, device='cpu'):
    test_labels = []
    proba = []
    net.eval()
    with torch.no_grad():
        for data in tqdm(test_dataloader):
            outputs = net(data.to(device))
            _, preds = torch.max(outputs.data , 1)
            test_labels.append(preds.cpu().tolist())
            proba.append(outputs.cpu().tolist())
    test_labels = sum(test_labels, [])
    proba = sum(proba, [])
    return proba, test_labels

In [106]:
import evaluate

proba, preds = predict(net, test_loader, device=device)

metric = evaluate.load('accuracy')
metric.compute(references=y_test, predictions=preds)

100%|██████████| 625/625 [00:09<00:00, 64.60it/s]


{'accuracy': 0.7051}

### Perceptron

In [202]:
df = pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
val_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
val_df = val_df.head(5000)

In [203]:
X_train = np.load('data/ft-embed/train-embeddings.npy')
y_train = df['label'].to_list()

X_val = np.load('data/ft-embed/validation-embeddings.npy')
y_val = val_df['label'].to_list()

print(X_train.shape)
print(X_val.shape)

(6508, 1024)
(5000, 1024)


In [216]:
class MyDataset (torch.utils.data.Dataset):
    def __init__ (self, X, y):
        self.X = torch.Tensor(X)
        # if BCE
        self.y = torch.Tensor(y).unsqueeze(1)
        # if crossentropy
        # self.y = torch.Tensor(y).to(torch.int8).tolist()
    def __len__(self):
        return self.X.shape[0]
    def __getitem__ (self, index):
        return (self.X[index], self.y[index])

In [217]:
train_set = MyDataset(X_train, y_train)
val_set = MyDataset(X_val, y_val)

batch_size = 16
num_workers = 4
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                           shuffle=True, num_workers=num_workers)

val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size,
                                         shuffle=True, num_workers=num_workers)

In [247]:
class Perceptron(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout1 = torch.nn.Dropout(0.2)
        self.layer1 = torch.nn.Linear(1024, 512)
        self.act1 = torch.nn.ReLU()
        self.dropout2 = torch.nn.Dropout(0.1)
        self.layer2 = torch.nn.Linear(512, 256)
        self.act2 = torch.nn.ReLU()
        self.dropout3 = torch.nn.Dropout(0.1)
        self.layer3 = torch.nn.Linear(256, 128)
        self.act3 = torch.nn.ReLU()
        self.dropout4 = torch.nn.Dropout(0.1)
        self.output = torch.nn.Linear(128, 1)
        self.sigmoid = torch.nn.Sigmoid()
 
    def forward(self, x):
        x = self.dropout1(x)
        x = self.act1(self.layer1(x))
        x = self.dropout2(x)
        x = self.act2(self.layer2(x))
        x = self.dropout3(x)
        x = self.act3(self.layer3(x))
        x = self.dropout4(x)
        # x = self.output(x)
        x = self.sigmoid(self.output(x))
        return x

In [248]:
def train(net, train_dataloader, val_dataloader, criterion, optimizer, 
          scheduler=None, epochs=10, device='cpu', checkpoint_epochs=10):
    start = time.time()
    print(f'Training for {epochs} epochs on {device}')
    
    for epoch in range(1,epochs+1):
        print(f"Epoch {epoch}/{epochs}")
        
        # put network in train mode for Dropout and Batch Normalization
        net.train()
        # loss and accuracy tensors are on the GPU to avoid data transfers
        train_loss = torch.tensor(0., device=device)  
        train_accuracy = torch.tensor(0., device=device)
        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)
            preds = net(X)
            loss = criterion(preds, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                train_loss += loss * train_dataloader.batch_size
                # train_accuracy += (torch.argmax(preds, dim=1) == y).sum()
                train_accuracy += (torch.where(preds>0.5, 1, 0) == y).sum()
                # y_pred = preds.cpu().numpy()
                # y_true = y.cpu().numpy()
                # train_accuracy += (np.where(y_pred>0.5, 1, 0) == y_true).sum()
        
        if val_dataloader is not None:
            # put network in train mode for Dropout and Batch Normalization
            net.eval()
            valid_loss = torch.tensor(0., device=device)
            valid_accuracy = torch.tensor(0., device=device)
            with torch.no_grad():
                for X, y in val_dataloader:
                    X = X.to(device)
                    y = y.to(device)
                    preds = net(X)
                    loss = criterion(preds, y)
                    valid_loss += loss * val_dataloader.batch_size
                    # valid_accuracy += (torch.argmax(preds, dim=1) == y).sum()
                    valid_accuracy += (torch.where(preds>0.5, 1, 0) == y).sum()
                    # y_pred = preds.cpu().numpy()
                    # y_true = y.cpu().numpy()
                    # valid_accuracy += (np.where(y_pred>0.5, 1, 0) == y_true).sum()
                    
        
        if scheduler is not None: 
            scheduler.step()
            
        print(f'Training loss: {train_loss/len(train_dataloader.dataset):.2f}')
        print(f'Training accuracy: {100*train_accuracy/len(train_dataloader.dataset):.2f}')
        
        if val_dataloader is not None:
            print(f'Valid loss: {valid_loss/len(val_dataloader.dataset):.2f}')
            print(f'Valid accuracy: {100*valid_accuracy/len(val_dataloader.dataset):.2f}')
        
        if epoch%checkpoint_epochs==0:
            torch.save({
                'epoch': epoch,
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, './checkpoint.pth.tar')
        
        print()
    
    end = time.time()
    print(f'Total training time: {end-start:.1f} seconds')
    return net

In [249]:
net = Perceptron().to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(),lr=1e-5)

train(net, train_loader, val_loader, criterion, optimizer, epochs=20, device=device)

Training for 20 epochs on cuda
Epoch 1/20
Training loss: 0.67
Training accuracy: 72.03
Valid loss: 0.65
Valid accuracy: 74.40

Epoch 2/20
Training loss: 0.48
Training accuracy: 88.94
Valid loss: 0.53
Valid accuracy: 74.80

Epoch 3/20
Training loss: 0.29
Training accuracy: 89.51
Valid loss: 0.57
Valid accuracy: 74.60

Epoch 4/20
Training loss: 0.27
Training accuracy: 89.81
Valid loss: 0.60
Valid accuracy: 74.50

Epoch 5/20
Training loss: 0.26
Training accuracy: 89.80
Valid loss: 0.61
Valid accuracy: 74.62

Epoch 6/20
Training loss: 0.26
Training accuracy: 89.81
Valid loss: 0.61
Valid accuracy: 74.76

Epoch 7/20
Training loss: 0.26
Training accuracy: 89.86
Valid loss: 0.61
Valid accuracy: 74.66

Epoch 8/20
Training loss: 0.26
Training accuracy: 89.94
Valid loss: 0.61
Valid accuracy: 74.82

Epoch 9/20
Training loss: 0.25
Training accuracy: 90.15
Valid loss: 0.61
Valid accuracy: 74.98

Epoch 10/20
Training loss: 0.25
Training accuracy: 90.01
Valid loss: 0.61
Valid accuracy: 75.06

Epoch 11

Perceptron(
  (dropout1): Dropout(p=0.2, inplace=False)
  (layer1): Linear(in_features=1024, out_features=512, bias=True)
  (act1): ReLU()
  (dropout2): Dropout(p=0.1, inplace=False)
  (layer2): Linear(in_features=512, out_features=256, bias=True)
  (act2): ReLU()
  (dropout3): Dropout(p=0.1, inplace=False)
  (layer3): Linear(in_features=256, out_features=128, bias=True)
  (act3): ReLU()
  (dropout4): Dropout(p=0.1, inplace=False)
  (output): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

evaluation

In [250]:
test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

X_test = np.load('data/ft-embed/test-embeddings.npy')
y_test = test_df['label'].to_list()
test_set = MyDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_set.X, batch_size=16,
                                          shuffle=False, num_workers=2)

In [290]:
def predict(net, test_dataloader, device='cpu'):
    test_labels = []
    proba = []
    net.eval()
    with torch.no_grad():
        for data in tqdm(test_dataloader):
            outputs = net(data.to(device))
            preds = torch.flatten((outputs > 0.5).to(torch.int8))
            test_labels.append(preds.cpu().tolist())
            proba.append(torch.flatten(outputs).cpu().tolist())
    test_labels = sum(test_labels, [])
    proba = sum(proba, [])
    return proba, test_labels

In [251]:
# for crossentropy
# def predict(net, test_dataloader, device='cpu'):
#     test_labels = []
#     proba = []
#     net.eval()
#     with torch.no_grad():
#         for data in tqdm(test_dataloader):
#             outputs = net(data.to(device))
#             _, preds = torch.max(outputs.data , 1)
#             test_labels.append(preds.cpu().tolist())
#             proba.append(outputs.cpu().tolist())
#     test_labels = sum(test_labels, [])
#     proba = sum(proba, [])
#     return proba, test_labels

In [291]:
import evaluate

proba, preds = predict(net, test_loader, device=device)

metric = evaluate.load('accuracy')
metric.compute(references=y_test, predictions=preds)

100%|██████████| 625/625 [00:00<00:00, 912.19it/s] 


{'accuracy': 0.751}

In [286]:
torch.save(net.state_dict(), 'models/percetron-classifier.pt')

### AUC score

In [293]:
import evaluate
metric = evaluate.load('roc_auc')

In [294]:
model = Perceptron().to(device)
model.load_state_dict(torch.load('models/percetron-classifier.pt'))
model.eval()

Perceptron(
  (dropout1): Dropout(p=0.2, inplace=False)
  (layer1): Linear(in_features=1024, out_features=512, bias=True)
  (act1): ReLU()
  (dropout2): Dropout(p=0.1, inplace=False)
  (layer2): Linear(in_features=512, out_features=256, bias=True)
  (act2): ReLU()
  (dropout3): Dropout(p=0.1, inplace=False)
  (layer3): Linear(in_features=256, out_features=128, bias=True)
  (act3): ReLU()
  (dropout4): Dropout(p=0.1, inplace=False)
  (output): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [295]:
with torch.no_grad():
    print(model(test_set.X[0].to(device)))

tensor([0.4018], device='cuda:0')


In [296]:
test_df = pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')

X_test = np.load('data/ft-embed/test-embeddings.npy')
y_test = test_df['label'].to_list()
test_set = MyDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_set.X, batch_size=16,
                                          shuffle=False, num_workers=2)

In [297]:
def predict(net, test_dataloader, device='cpu'):
    test_labels = []
    proba = []
    net.eval()
    with torch.no_grad():
        for data in tqdm(test_dataloader):
            outputs = net(data.to(device))
            preds = torch.flatten((outputs > 0.5).to(torch.int8))
            test_labels.append(preds.cpu().tolist())
            proba.append(torch.flatten(outputs).cpu().tolist())
    test_labels = sum(test_labels, [])
    proba = sum(proba, [])
    return proba, test_labels

In [298]:
proba, preds = predict(model, test_loader, device=device)

metric.compute(references=y_test, prediction_scores=proba)

100%|██████████| 625/625 [00:00<00:00, 907.44it/s]


{'roc_auc': 0.8110031129934587}