In [52]:
import torch
from transformers import DistilBertForMaskedLM, DistilBertTokenizer, Trainer, TrainingArguments, DistilBertModel
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import Dataset
import numpy as np
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
dataset = load_dataset('imdb')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [4]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

In [5]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [6]:
class MLMDataset(Dataset):
    def __init__(self, tokenized_dataset):
        self.input_ids = tokenized_dataset["input_ids"]
        self.attention_mask = tokenized_dataset["attention_mask"]
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        # Возвращаем CPU-тензоры (без .to(device))
        input_ids = torch.tensor(self.input_ids[idx])
        attention_mask = torch.tensor(self.attention_mask[idx])
        labels = input_ids.clone()
        
        # Маскирование (остается без изменений)
        probability_matrix = torch.full(labels.shape, 0.15)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        
        special_tokens_mask = torch.tensor(
            tokenizer.get_special_tokens_mask(labels.tolist(), already_has_special_tokens=True),
            dtype=torch.bool
        )
        masked_indices = masked_indices & ~special_tokens_mask
        
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        input_ids[indices_replaced] = tokenizer.mask_token_id
        
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        input_ids[indices_random] = random_words[indices_random]
        
        return {
            "input_ids": input_ids,  # Оставляем на CPU
            "attention_mask": attention_mask,  # Оставляем на CPU
            "labels": labels  # Оставляем на CPU
        }

In [7]:
train_unsupervised = dataset["train"].train_test_split(test_size=0.1)["train"] 
unsupervised_data = dataset["unsupervised"].train_test_split(test_size=0.9)["train"]  

In [8]:
combined_dataset = concatenate_datasets([train_unsupervised, unsupervised_data])

In [9]:
tokenized_combined = combined_dataset.map(tokenize_function, batched=True)
mlm_dataset = MLMDataset(tokenized_combined)

Map:   0%|          | 0/27500 [00:00<?, ? examples/s]

In [10]:
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased').to(device)

In [11]:
training_args = TrainingArguments(
    output_dir="./mlm_results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    logging_dir="./mlm_logs",
    fp16=True, 
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mlm_dataset,
)

In [13]:
trainer.train()

Step,Training Loss
100,1.3285
200,0.2151
300,0.2114
400,0.2093
500,0.2109
600,0.2074
700,0.2044
800,0.2009
900,0.21
1000,0.2004


TrainOutput(global_step=10314, training_loss=0.19817728729980363, metrics={'train_runtime': 2655.1145, 'train_samples_per_second': 31.072, 'train_steps_per_second': 3.885, 'total_flos': 1.093629537792e+16, 'train_loss': 0.19817728729980363, 'epoch': 3.0})

In [14]:
model.save_pretrained("./distilbert_imdb_mlm")

In [38]:
class IMDBClassificationDataset(Dataset):
    def __init__(self, tokenized_dataset):
        self.input_ids = tokenized_dataset["input_ids"]
        self.attention_mask = tokenized_dataset["attention_mask"]
        self.labels = tokenized_dataset["label"]  # Обратите внимание - используем "label", а не "labels"
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx]),
            "attention_mask": torch.tensor(self.attention_mask[idx]),
            "labels": torch.tensor(self.labels[idx])  # Одна метка на текст
        }

In [39]:
# Создаем датасеты
train_dataset = IMDBClassificationDataset(tokenized_datasets["train"])
test_dataset = IMDBClassificationDataset(tokenized_datasets["test"])

In [46]:
model = DistilBertForSequenceClassification.from_pretrained("./distilbert_imdb_mlm", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert_imdb_mlm and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
batch_size = 16
epochs = 3
learning_rate = 2e-5

In [40]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
batch = next(iter(train_loader))
print(batch["input_ids"].shape)  # torch.Size([16, 512])
print(batch["labels"].shape)     # Теперь должно быть torch.Size([16])

torch.Size([16, 512])
torch.Size([16])


In [43]:
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [67]:
text = ["War", 'Piece']
tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [53]:
model = DistilBertModel.from_pretrained("./distilbert_imdb_mlm")
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L