In [3]:
import torch
import time
from torch import nn
import torch.nn.functional as F
from keras_preprocessing.sequence import pad_sequences
from transformers import BertTokenizerFast, BertConfig, BertModel

# Select the compute device once; every later cell moves tensors/modules to `device`.
if not torch.cuda.is_available():
    # CPU fallback when PyTorch cannot see a CUDA device.
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    # Run on the GPU and report how many are visible plus the name of device 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
# device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1660 Ti


In [4]:
# Tokenizer matching the bert-base-uncased checkpoint (input text is lower-cased).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load the *pretrained* bert-base-uncased encoder (not randomly initialised).
# add_pooling_layer=False skips the pooler head; only last_hidden_state is
# consumed downstream, which is why the pooler/cls weights are reported as unused.
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False)

# Configuration of the loaded model. A separately constructed default
# BertConfig() is unnecessary because it would be overwritten here anyway.
configuration = model.config

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'bert.pooler.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Put BERT in evaluation mode (disables its Dropout layers) for inference;
# eval() returns the module, so the cell displays the model architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
class CFG:
    """Namespace for notebook-wide settings; everything lives under ``CFG.data``."""

    class data:
        # --- reproducibility / persistence ---
        seed_val = 42          # random seed
        PATH = "./hs.pth"      # checkpoint file for the CNN head's state_dict

        # --- training schedule ---
        batch_size = 16
        epochs = 10
        lr = 1e-3              # 5e-5 was the earlier value
        epsilon = 1e-8
        validation_size = 0.2
        k_folds = 10

        # --- sequence / embedding geometry ---
        MAX_LEN = 128          # max sentence length (in tokens)
        hidden_size = 768      # hidden layer size (embedding size) for feedforward net

        # --- defaults for the CNN head ---
        dropout = 0.2
        Ks = [1, 2, 3, 4]      # kernel heights, one conv branch each
        kernel_num = 3         # number of filters for each conv layer
        # (n, c, h, w): any batch size, one channel, MAX_LEN tokens, hidden_size features
        input_shape = [-1, 1, MAX_LEN, hidden_size]

In [7]:
class Net(nn.Module):
    """CNN classification head: parallel convolutions of different heights,
    max-pooled over the sequence axis, concatenated and mapped to 2 logits."""

    def __init__(self, input_shape):
        '''
        input_shape -> sequence (n, c, h, w)
        n = batch size
        c = num channels
        h = height
        w = width(768)

        Only w (index 3) is read here; it becomes the kernel width of every
        convolution. The remaining hyper-parameters come from CFG.data.
        '''
        super().__init__()
        kernel_heights = CFG.data.Ks
        filters_per_branch = CFG.data.kernel_num  # number of filters for each conv layer
        embed_width = input_shape[3]

        # One single-input-channel conv per kernel height, spanning the full width.
        branches = []
        for height in kernel_heights:
            branches.append(nn.Conv2d(1, filters_per_branch, (height, embed_width)))
        self.convs1 = nn.ModuleList(branches)

        self.dropout = nn.Dropout(CFG.data.dropout)
        self.fc1 = nn.Linear(len(kernel_heights) * filters_per_branch, 2)

        # Activation modules; forward() currently applies only `relu`.
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.gelu = nn.GELU()
        self.leakyrelu = nn.LeakyReLU()

    def forward(self, x):
        """Return the (N, 2) logits for a 4-D input batch `x` of shape (N, 1, H, W)."""
        pooled = []
        for branch in self.convs1:
            # Conv + ReLU, then drop dim 3 (size 1 when W equals the kernel width).
            activated = self.relu(branch(x)).squeeze(3)
            # Max over the whole remaining length axis -> (N, Co)
            window = activated.size(2)
            pooled.append(F.max_pool1d(activated, window).squeeze(2))

        features = torch.cat(pooled, 1)  # (N, len(Ks)*Co)
        return self.fc1(self.dropout(features))


In [8]:
# Build the CNN head on the selected device and restore its saved weights.
net = Net(CFG.data.input_shape).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU machine still loads when this notebook falls back to the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
net.load_state_dict(torch.load(CFG.data.PATH, map_location=device))

# Inference only: evaluation mode disables dropout in both networks.
net.eval()
model.eval()
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [13]:
def prepareData(sent):
    """Classify a single sentence with BERT embeddings and the CNN head.

    The sentence is tokenized, padded/truncated to CFG.data.MAX_LEN token ids,
    encoded by `model`, and the resulting last hidden state is fed to `net`.

    Parameters
    ----------
    sent : str
        Raw input sentence.

    Returns
    -------
    int
        Index (0 or 1) of the larger of the two logits produced by `net`.
        NOTE(review): what each index means depends on how the checkpoint
        was trained; confirm against the training notebook.
    """
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    # Pad with id 0 (or cut the tail) so every input has exactly MAX_LEN ids.
    input_ids = pad_sequences([input_ids], maxlen=CFG.data.MAX_LEN, dtype="long",
                              value=0, truncating="post", padding="post")[0]
    # Real tokens have ids > 0; padding positions (id 0) are masked out.
    att_mask = (input_ids > 0).astype(int)

    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
    att_mask = torch.tensor(att_mask).unsqueeze(0).to(device)

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(input_ids,
                        token_type_ids=None,
                        attention_mask=att_mask).last_hidden_state
        # Insert the channel axis expected by the Conv2d layers: (N, 1, L, H).
        out = net(outputs.unsqueeze(1))
    return int(out[0].argmax())

In [14]:
# Time one end-to-end prediction. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed intervals.
st = time.perf_counter()
print(prepareData("Women are made to be abused"))
print(time.perf_counter() - st)

1
0.036067962646484375
