In [17]:
from torch.utils.data import DataLoader, Dataset
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

In [2]:
class TXTDataset(Dataset):
    """Map-style dataset pairing per-subject text files with their labels.

    Expected layout under ``path_to_data``:

    * ``risk_golden_truth.txt`` -- one whitespace-separated ``<subject> <label>``
      pair per line.
    * ``processed/<subject>.txt`` -- the text belonging to each subject.

    Text is read and tokenized lazily, on item access.

    Args:
        path_to_data: Root directory holding the label file and ``processed/``.
        tokenizer: Object exposing an ``encode_plus`` method that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"``. When ``None`` (the
            default) the ``distilroberta-base`` tokenizer is loaded.
        max_length: Length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: If a non-blank line of the label file does not contain
            exactly two whitespace-separated fields.
    """

    def __init__(self, path_to_data="../data", tokenizer=None, max_length=512):
        self.labels = []
        self.path_to_data = path_to_data
        self.ext = ".txt"
        self.path_to_processed = os.path.join(self.path_to_data, "processed")
        self.max_length = max_length
        # Allow callers to inject a tokenizer (another checkpoint, or a stub for
        # offline use); fall back to the original hard-coded checkpoint otherwise.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        self.tokenizer = tokenizer

        path_labels = os.path.join(path_to_data, "risk_golden_truth.txt")
        with open(path_labels, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. a trailing newline at end of file) carry
                    # no label and previously crashed the tuple unpacking.
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        f"{path_labels}:{line_no}: expected '<subject> <label>', "
                        f"got {line!r}"
                    )
                subject, label = fields
                self.labels.append((subject, label))

    def __getitem__(self, idx):
        """Return the encoded text and label of the ``idx``-th subject.

        Returns:
            dict with
            ``"ids"`` -- ``torch.long`` tensor built from the tokenizer's ``input_ids``,
            ``"mask"`` -- ``torch.long`` tensor built from its ``attention_mask``,
            ``"labels"`` -- scalar ``torch.float`` tensor holding ``int(label)``.
        """
        subject, label = self.labels[idx]
        subject_path = os.path.join(self.path_to_processed, subject) + self.ext
        with open(subject_path, 'r') as f:
            # Lines keep their trailing newline; they are additionally joined by a space.
            text = " ".join(f.readlines())
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        return {
            "ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(int(label), dtype=torch.float),
        }

    def __len__(self):
        """Number of (subject, label) pairs read from the label file."""
        return len(self.labels)




In [3]:
# Dataset built with its default data directory ("../data").
ds = TXTDataset()
# One sample per batch, loaded by 8 worker processes.
data_loader = DataLoader(
    dataset=ds,
    batch_size=1,
    num_workers=8,
)

In [4]:
# Sequence-classification variant of the checkpoint. Its classifier weights are
# newly initialised rather than loaded (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(
    'distilroberta-base',
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

In [18]:
# Same checkpoint loaded through AutoModel, which yields a plain RobertaModel
# (see the output below) for comparison with the classification model.
model2 = AutoModel.from_pretrained(
    'distilroberta-base',
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Print the module tree of the model loaded via AutoModel to inspect its layers.
print(model2)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [10]:
# Smoke test: run only the first batch through the classifier, then stop.
for batch in data_loader:
    ids = batch["ids"]
    mask = batch["mask"]
    labels = batch["labels"]
    # Keep just the logits of the model output.
    out = model(input_ids=ids, attention_mask=mask).logits
    break

In [8]:
# Give the classifier a single-output head.
# Assigning `out_features` on an existing nn.Linear only changes the recorded
# attribute: the weight and bias keep their original shape, so the layer still
# produces the same number of outputs. The layer has to be replaced instead.
# The right-hand side is evaluated before the assignment, so it reads the old
# layer's input width and device. The new layer's weights are randomly initialised.
model.classifier.out_proj = torch.nn.Linear(
    model.classifier.out_proj.in_features, 1
).to(model.classifier.out_proj.weight.device)
# Keep the label count recorded on the model and its config in step with the new head.
model.num_labels = 1
model.config.num_labels = 1

In [11]:
import torch

In [12]:
# Normalise the logits along dim 1 so each row sums to one.
torch.softmax(out, dim=1)

tensor([[0.5897, 0.4103]], grad_fn=<SoftmaxBackward0>)

In [16]:
# Shape of the encoder's hidden state at sequence position 0 for the current batch.
model.roberta(input_ids=ids, attention_mask=mask).last_hidden_state[:, 0, :].shape

torch.Size([1, 768])