In [1]:
!pip install transformers
!pip install torchtext==0.9.0
!pip install torch==1.8.0



In [2]:
import torch
import numpy as np
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torchtext.legacy.data import Field
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from google.colab import drive

drive.mount('/content/drive')
sources_path = "/content/drive/MyDrive/dl-samples"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing

In [4]:
def get_vocab(X):
    words = [sentence.split() for sentence in X]
    text_field = Field()
    text_field.build_vocab(words, max_size=10000)
    return text_field

def pad(seq, maxlen):
    if len(seq) < maxlen:
        seq = seq + ['<pad>'] * (maxlen - len(seq))
    return seq

def to_indices(vocab, words):
    return [vocab.stoi[w] for w in words]

In [5]:
def tokenize(X, y, model = 'simple', field=None, teacher_output = None, tokenizer = None):
    X_split = [t.split() for t in X]
    X_pad = [pad(s, maxlen) for s in X_split]

    if model == 'bert':
        lines = [" ".join(s) for s in X_pad]
        masks = [[int(word != '<pad>') for word in sentence] for sentence in X_pad]
        inds = [tokenizer.encode(line.split(), add_special_tokens=False) for line in lines]
        inds = torch.tensor(inds)
        masks = torch.tensor(masks, dtype=torch.int8)
        torch_y = torch.tensor(y, dtype=torch.float)
        return TensorDataset(inds, torch_y, masks)

    else:
        X_index = [to_indices(field.vocab, s) for s in X_pad]
        torch_x = torch.tensor(X_index, dtype=torch.long)
        torch_y = torch.tensor(y, dtype=torch.float)

        if model == 'distil':
            return TensorDataset(torch_x, torch_y, teacher_output)
        else:
            return TensorDataset(torch_x, torch_y)

In [6]:
def read_and_preprocess(path):
    X, y, maxlen = [], [], 0
    with open(sources_path + path, encoding = "ISO-8859-1") as file:
        for line in file:
            words = line.split()
            y.append(0 if words[0] == 'ham' else 1)
            X.append(' '.join(words[1:]))
            maxlen = max(maxlen, len(words))
    return X, y, maxlen

## Utility classes and methods
### Metrics

In [7]:
class DistilLoss(nn.Module):
    def __init__(self, alpha=0.5):
        super(DistilLoss, self).__init__()
        self.alpha = alpha
    
    def forward(self, real_prediction, real_output, teacher_prediction, teacher_output):
        bce = nn.CrossEntropyLoss()
        mse = nn.MSELoss()
        prediction_loss = bce(real_prediction, torch.tensor(real_output, dtype=torch.long))
        teacher_loss = mse(teacher_prediction, teacher_output)
        return self.alpha * prediction_loss + (1 - self.alpha) * teacher_loss

In [8]:
def get_teacher_output(teacher, dataset):
    teacher = teacher.to(device)
    dataloader = DataLoader(dataset, 32, shuffle=False)
    teacher_output = []
    with torch.no_grad():
      for info in tqdm(dataloader):
          info=[t.to(device) for t in info]
          loss, result = teacher(info)
          teacher_output.append(result)
      teacher_output = torch.cat(teacher_output)
    return teacher_output

### Modules

In [9]:
class BiLSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, \
                bidirectional, dropout, num_layers):
        super(BiLSTM, self).__init__()
        
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.LSTM(
                            input_size=embedding_dim, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=dropout
                        )
        
        self.label_prediction = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def init_state(self, batch_size):
        return torch.zeros(2 * self.num_layers, batch_size, self.hidden_dim), \
               torch.zeros(2 * self.num_layers, batch_size, self.hidden_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        x = torch.transpose(x, dim0=1, dim1=0)
        x, hidden = self.rnn(x)
        hidden, cell = hidden
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        label_prediction = self.label_prediction(hidden)
        return label_prediction

In [10]:
class ClassificationHead(nn.Module):
    def __init__(self, input_size, hidden_size, num_labels=2):
        super(ClassificationHead, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_labels)
        
    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        return self.fc3(x)

### Models

In [11]:
class TeacherModel(nn.Module):
    def __init__(self, teacher, head_hidden_size=128):
        super(TeacherModel, self).__init__()
        self.teacher = teacher
        hidden_size = self.teacher.config.hidden_size
        self.classification_head = ClassificationHead(hidden_size, head_hidden_size, 2)
        self.loss = nn.CrossEntropyLoss()
    
    def forward(self, inp):
        inds = inp[0]
        labels = inp[1]
        masks = inp[2]
        labels = torch.tensor(labels, dtype=torch.long)
        output = self.teacher(inds, attention_mask=masks)[0]
        output = output[:, 0, :]
        prediction = self.classification_head(output)
        loss = self.loss(prediction, labels)
        return loss, prediction
    
    def parameters(self):
        return self.classification_head.parameters()

In [12]:
class SimpleModel(nn.Module):
    def __init__(self, bilstm):
        super(SimpleModel, self).__init__()
        self.bilstm = bilstm
        self.loss = nn.CrossEntropyLoss()
    
    def forward(self, inp):
        inds = inp[0]
        labels = inp[1]
        labels = torch.tensor(labels, dtype=torch.long)
        prediction = self.bilstm(inds)
        loss = self.loss(prediction, labels)
        return loss, prediction


In [13]:
class DistilModel(nn.Module):
    def __init__(self, student, alpha=0.5):
        super(DistilModel, self).__init__()
        self.student = student
        self.loss = DistilLoss(alpha)
    
    def forward(self, inp):
        inds = inp[0]
        labels = inp[1]
        teacher_output = inp[2]
        labels = torch.tensor(labels, dtype=torch.long)
        label_prediction = self.student(inds)
        loss = self.loss(label_prediction, labels, label_prediction, teacher_output)
        return loss, label_prediction


### Trainer

In [27]:
def train(model, dataset, test_dataset, epochs=5, batch_size=64):
    dataloader = DataLoader(dataset, batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters())
    model = model.to(device)
    model.train()
    for e in range(epochs):
        losses = 0
        count = 0
        for info in tqdm(dataloader):
            info=[t.to(device) for t in info]
            loss, _ = model(info)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses += loss
            count += 1
        losses /= count
        print(f'Epoch {e} \t| Loss: {losses.item()}')
        
    dataloader = DataLoader(test_dataset, batch_size, shuffle=True)
    count = 0
    correct = 0
    with torch.no_grad():
      for info in tqdm(dataloader):
          info=[t.to(device) for t in info]
          _, prediction = model(info)
          prediction = torch.argmax(prediction, axis=1)
          correct += torch.sum(prediction == info[1].int())
          count += batch_size
    print(f'Accuracy:{correct / count}')


## Experiments
Сперва загрузим данные и разобьем их на трейн и тест

In [28]:
X, y, maxlen = read_and_preprocess('/spam.txt')
field = get_vocab(X)
vocab_size = len(field.vocab.stoi.keys())

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)

Для каждой из сравниваемых моделей проделаем следующее:
1.   загружаем токенайзер и модель BERT с hugginface
2.   препроцессим для неё данные
3.   обучаем модель

### Teacher model

In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

teacher = TeacherModel(bert)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
teacher_dataset_train = tokenize(X_train, y_train, tokenizer=tokenizer, model='bert')
teacher_dataset_test = tokenize(X_test, y_test, tokenizer=tokenizer, model='bert')

In [32]:
train(teacher, teacher_dataset_train, teacher_dataset_test, epochs=5, batch_size=16)

  0%|          | 0/262 [00:00<?, ?it/s]

  del sys.path[0]


Epoch 0 	| Loss: 0.29179251194000244


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 1 	| Loss: 0.24656671285629272


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 2 	| Loss: 0.2315017282962799


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 3 	| Loss: 0.22430169582366943


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 4 	| Loss: 0.22168691456317902


  0%|          | 0/88 [00:00<?, ?it/s]

Accuracy:0.9105113744735718


### Simple model
Обучаем BiLSTM (без модели-учителя)

In [33]:
model = BiLSTM(input_dim=vocab_size, 
               embedding_dim=16,
               hidden_dim=16, 
               output_dim=2,
               bidirectional=True,
               dropout=0.5,
               num_layers=1)

simple_model = SimpleModel(model)

  "num_layers={}".format(dropout, num_layers))


In [34]:
simple_dataset_train = tokenize(X_train, y_train, field=field, model='simple')
simple_dataset_test = tokenize(X_test, y_test, field=field, model='simple')

In [35]:
train(simple_model, simple_dataset_train,simple_dataset_test, epochs=5, batch_size=16)

  0%|          | 0/262 [00:00<?, ?it/s]

  # Remove the CWD from sys.path while we load stuff.


Epoch 0 	| Loss: 0.3892906606197357


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 1 	| Loss: 0.24236096441745758


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 2 	| Loss: 0.1754378229379654


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 3 	| Loss: 0.12833765149116516


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 4 	| Loss: 0.10344188660383224


  0%|          | 0/88 [00:00<?, ?it/s]

Accuracy:0.9382102489471436


### Distil model
Обучаем BiLSTM (с моделью-учителем)

In [36]:
teacher_output_train = get_teacher_output(teacher, teacher_dataset_train)
teacher_output_test = get_teacher_output(teacher, teacher_dataset_test)

distil_dataset_train = tokenize(X_train, y_train, teacher_output=teacher_output_train, field=field, model='distil')
distil_dataset_test = tokenize(X_test, y_test, teacher_output=teacher_output_test, field=field, model='distil')

  0%|          | 0/131 [00:00<?, ?it/s]

  del sys.path[0]


  0%|          | 0/44 [00:00<?, ?it/s]

In [37]:
distil_model = BiLSTM(input_dim=vocab_size, 
               embedding_dim=16,
               hidden_dim=16, 
               output_dim=2,
               bidirectional=True,
               dropout=0.8,
               num_layers=1)
student = DistilModel(distil_model, alpha=0.5)

  "num_layers={}".format(dropout, num_layers))


In [38]:
train(student, distil_dataset_train, distil_dataset_test, epochs=5, batch_size=16)


  0%|          | 0/262 [00:00<?, ?it/s]

  # This is added back by InteractiveShellApp.init_path()
  if __name__ == '__main__':


Epoch 0 	| Loss: 1.8719578981399536


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 1 	| Loss: 1.3209822177886963


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 2 	| Loss: 1.2536596059799194


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 3 	| Loss: 1.1691200733184814


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 4 	| Loss: 1.1593974828720093


  0%|          | 0/88 [00:00<?, ?it/s]

Accuracy:0.8529829978942871
