In [55]:
from torch.utils.data import DataLoader, Dataset
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoConfig
from torch.utils.data import TensorDataset, DataLoader

In [6]:
path_to_data="../data"
model_name="distilroberta-base"
ground_truth="risk_golden_truth.txt"
processed_folder="processed"

labels = []
ext = ".txt"
path_to_processed = os.path.join(path_to_data, processed_folder)
path_labels = os.path.join(path_to_data, ground_truth)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
with open(path_labels, "r") as f:
    for line in f:
        subject, label = line.split()
        labels.append((subject, label))
    # print(labels)

In [11]:
subject, label = labels[0]
subject_path = os.path.join(path_to_processed, subject) + ext
print(subject_path)

../data/processed/subject3450.txt


In [32]:
with open(subject_path, "r") as f:
    text = " ".join(f.readlines())
text = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=512,
    padding="max_length",
    # return_tensors='pt',
    truncation=True,
)
x= {
            "ids": torch.tensor(text["input_ids"], dtype=torch.long),
            "mask": torch.tensor(text["attention_mask"], dtype=torch.long),
        }


In [44]:
class BaseDataset(Dataset):
    def __init__(
        self,
        path_to_data="../data",
        model_name="distilroberta-base",
        ground_truth="risk_golden_truth.txt",
        processed_folder="processed",
    ):
        self.labels = []
        self.ext = ".txt"

        self.path_to_data = path_to_data
        self.path_to_processed = os.path.join(self.path_to_data, processed_folder)
        self.path_labels = os.path.join(self.path_to_data, ground_truth)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        with open(self.path_labels, "r") as f:
            for line in f:
                subject, label = line.split()
                self.labels.append((subject, label))

    def __getitem__(self, idx):
        subject, label = self.labels[idx]
        subject_path = os.path.join(self.path_to_processed, subject) + self.ext
        with open(subject_path, "r") as f:
            text = " ".join(f.readlines())
        text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding="max_length",
            return_token_type_ids=True,
            # return_tensors='pt',
            # return_attention_mask=True,
            truncation=True,
        )
        return {
            "ids": torch.tensor(text["input_ids"], dtype=torch.long),
            "mask": torch.tensor(text["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(int(label), dtype=torch.float),
        }

    def __len__(self):
        return len(self.labels)

dl = DataLoader(BaseDataset(), batch_size=2)

In [68]:
config = AutoConfig.from_pretrained('distilroberta-base', num_labels=1)
model = AutoModel.from_pretrained('distilroberta-base', config=config)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
for x in dl:
    output =  model(x['ids'],x['mask'])
    break

In [73]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [66]:
torch.sigmoid(output.logits)

tensor([[0.4732],
        [0.4730]], grad_fn=<SigmoidBackward0>)

In [34]:
model2 = AutoModel.from_pretrained('distilroberta-base')

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
for batch in data_loader:
    ids, mask, labels = batch["ids"], batch["mask"], batch["labels"]
    out = model(ids, mask).logits
    break

In [8]:
model.classifier.out_proj.out_features=1

In [11]:
import torch

In [12]:
torch.nn.functional.softmax(out, dim=1)

tensor([[0.5897, 0.4103]], grad_fn=<SoftmaxBackward0>)

In [16]:
model.roberta(ids,mask)[0][:,0].shape

torch.Size([1, 768])