In [11]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import pandas as pd
import torch
import os
import warnings

In [12]:
import torch
torch.cuda.empty_cache()


In [13]:
# Suppress warnings
warnings.simplefilter("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
# Define the Dataset class
class PretrainingDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        tokenized = self.tokenizer(
            text=text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze(),
            "labels": tokenized["input_ids"].squeeze(),
        }


In [15]:
# Load data
DATA_PATH = "train.csv"
data = pd.read_csv(DATA_PATH)
#texts = data["full_text"].values
texts = data["full_text"].values[:500]  # Use first 500 samples


In [16]:
# Load model and tokenizer
'''
MODEL_NAME_OR_PATH = "microsoft/deberta-v3-base"
'''
MODEL_NAME_OR_PATH = "distilbert-base-uncased"
MAX_LENGTH = 512


In [17]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME_OR_PATH)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [18]:
# Create dataset
dataset = PretrainingDataset(
    texts=texts,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)


In [19]:
# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [20]:
training_args = TrainingArguments(
    output_dir="./results",
    run_name="masked_lm_experiment",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    report_to="none",  # Disable W&B
)


In [21]:
from sklearn.model_selection import train_test_split

# Split the dataset
train_texts, eval_texts = train_test_split(texts, test_size=0.2, random_state=42)

# Create evaluation dataset
eval_dataset = PretrainingDataset(
    texts=eval_texts,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Initialize Trainer with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,  # Add evaluation dataset
    data_collator=data_collator,
)


In [22]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [23]:
# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,2.324391
2,No log,2.266984
3,No log,2.161151


{'eval_loss': 2.156226396560669,
 'eval_runtime': 2.0969,
 'eval_samples_per_second': 47.689,
 'eval_steps_per_second': 6.2,
 'epoch': 3.0}

In [25]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model's embedding layer to accommodate the new token
model.resize_token_embeddings(len(tokenizer))

# Define custom text inputs
custom_texts = [ "NLP is based on transformers."]

# Tokenize the custom texts and return attention mask
custom_encodings = tokenizer(custom_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt", return_attention_mask=True)

# Explicitly set pad_token_id
pad_token_id = tokenizer.pad_token_id

generated_text = model.generate(
    input_ids=custom_encodings['input_ids'],
    attention_mask=custom_encodings['attention_mask'],
    max_length=50,
    pad_token_id=pad_token_id,
    top_p=0.9,  # Nucleus sampling (probability threshold)
    temperature=0.7,  # Control randomness
    no_repeat_ngram_size=2  # Avoid repeating n-grams
)



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [26]:

decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(f"Generated Text: {decoded_text}")


Generated Text: NLP is based on transformers.

The first step is to create a transformer. The first transformer is a simple one. It is used to transform a single value into a number. For example, if we have a value of 1,


In [34]:
from torch.nn.functional import softmax

# Define a custom text with a partial sentence
partial_sentence = "I had breakfast this morning and i loved the way it"

# Tokenize the partial sentence
input_ids = tokenizer(partial_sentence, return_tensors="pt")["input_ids"]

# Get the model output logits
outputs = model(input_ids=input_ids)
logits = outputs.logits

# Extract the logits for the last token
last_token_logits = logits[0, -1, :]

# Apply softmax to get probabilities
probs = softmax(last_token_logits, dim=-1)

# Get the top 10 possible next words
top_k = 10
top_k_indices = torch.topk(probs, top_k).indices
top_k_words = [tokenizer.decode([idx]) for idx in top_k_indices]

# Print the results
print("Top possible next words:")
for word, prob in zip(top_k_words, probs[top_k_indices]):
    print(f"{word}: {prob.item():.4f}")


Top possible next words:
 looked: 0.1540
 was: 0.1375
 turned: 0.1069
 tasted: 0.0813
 smelled: 0.0789
 cooked: 0.0390
 came: 0.0249
 made: 0.0224
 reminded: 0.0174
 smells: 0.0108


In [40]:
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load tokenizer and model
MODEL_NAME = "bert-base-uncased"  # Example model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

# Define a custom text with a masked word
masked_sentence = "NLP is [MASK] on transformers."

# Tokenize the sentence
inputs = tokenizer(masked_sentence, return_tensors="pt")

# Ensure the input is a tensor
input_ids = inputs["input_ids"]

# Get model predictions
outputs = model(**inputs)
logits = outputs.logits

# Locate the [MASK] token index
mask_token_index = (input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

# Get the logits for the [MASK] token
mask_logits = logits[0, mask_token_index, :].squeeze()

# Apply softmax to get probabilities
probs = softmax(mask_logits, dim=-1)

# Get the top 10 possible words for the [MASK]
top_k = 10
top_k_indices = torch.topk(probs, top_k).indices
top_k_words = [tokenizer.decode([idx]) for idx in top_k_indices]

# Print the results
print("Top possible words for [MASK]:")
for word, prob in zip(top_k_words, probs[top_k_indices]):
    print(f"{word}: {prob.item():.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Top possible words for [MASK]:
based: 0.3139
used: 0.2739
focused: 0.0842
dependent: 0.0466
deployed: 0.0137
useful: 0.0126
standard: 0.0102
installed: 0.0096
available: 0.0080
built: 0.0079
