In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from glob import glob
import os
from transformers import BertConfig, BertForMaskedLM,AdamW
import time
import numpy as np
from tqdm import tqdm
import psutil
import matplotlib.pyplot as plt
# Function to get current memory usage
def get_gpu_memory_usage():
    # Returns the current GPU memory usage in MB
    allocated = torch.cuda.memory_allocated() / (1024 * 1024)
    cached = torch.cuda.memory_reserved() / (1024 * 1024)
    return allocated, cached

# Define the text files dataset
# Define the text files dataset
class TextFolderDataset(Dataset):
    def __init__(self, file_directory, file_pattern, tokenizer, max_length):
        self.filepaths = glob(os.path.join(file_directory, file_pattern))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        with open(self.filepaths[idx], 'r', encoding='utf-8') as file:
            text = file.read()
        encoding = self.tokenizer.encode(text)
        input_ids = encoding.ids[:self.max_length] + [0] * (self.max_length - len(encoding.ids[:self.max_length]))
        attention_mask = [1] * len(encoding.ids[:self.max_length]) + [0] * (self.max_length - len(encoding.ids[:self.max_length]))
        
        # Create labels (for masked language modeling, you might mask some tokens here)
        labels = input_ids[:]  # In practice, apply masking strategy here

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)  # Add labels
        }

# Function to train a WordPiece tokenizer
def train_tokenizer(file_directory, file_pattern):
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    )
    files = glob(os.path.join(file_directory, file_pattern))
    tokenizer.train(files, trainer)
    return tokenizer

def flat_accuracy(preds, labels):
    """
    Function to calculate the accuracy of our predictions vs labels.
    It flattens both the predictions and labels arrays to compare them element-wise.
    """
    # Convert the highest logit to predicted label (argmax over the last dimension)
    pred_flat = np.argmax(preds, axis=2).flatten()
    
    # Flatten the true labels array
    labels_flat = labels.flatten()

    # Calculate the number of correct predictions
    correct_predictions = np.sum(pred_flat == labels_flat)

    # Calculate accuracy as the ratio of correct predictions to total predictions
    accuracy = correct_predictions / len(labels_flat)
    
    return accuracy

# Set file paths and directory
file_directory = "openwebtext/"
file_pattern = "urlsf_subset01-32*"
eval_file_pattern = 'urlsf_subset01-33*'
max_lengths = [128,256,512,768]

# Arrays to store runtime and memory usage
training_times = []
memory_usages = []
num_epochs = 30
for max_length in tqdm(max_lengths):
    print("Processing max_length: ", max_length)

    # Train the tokenizer
    tokenizer = train_tokenizer(file_directory, file_pattern)
    tokenizer.save(f"model/tokenizer_bert_standard_{max_length}.json")

    # Load the tokenizer
    tokenizer = Tokenizer.from_file(f"model/tokenizer_bert_standard_{max_length}.json")

    # Create the dataset
    dataset = TextFolderDataset(file_directory, file_pattern, tokenizer, max_length)
    eval_dataset = TextFolderDataset(file_directory, eval_file_pattern, tokenizer, max_length)

    # Create the DataLoader
    data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
    eval_data_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

    # Initialize the BERT config and model
    config = BertConfig(
        vocab_size=tokenizer.get_vocab_size(),
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=max_length,
    )
    model = BertForMaskedLM(config).to(torch.device("cuda"))

    # Define loss and optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Start recording time and memory
    start_time = time.time()
    
    start_allocated, start_cached = get_gpu_memory_usage()
    print((start_allocated, start_cached))

    # Training loop
    model.train()
    for epoch in range(num_epochs):  # Use a smaller number of epochs for demonstration
        total_loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(torch.device("cuda"))
            attention_mask = batch['attention_mask'].to(torch.device("cuda"))
            labels = input_ids.clone()
            print(f"Batch input_ids shape: {input_ids.shape}")
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
        
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        avg_loss = total_loss / input_ids.shape[0]
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
        if avg_loss < 0.5:
            break
            

    # End recording time and memory
    end_time = time.time()
    end_allocated, end_cached = get_gpu_memory_usage()

    # Store the runtime and memory usage
    delta_allocated = end_allocated - start_allocated
    delta_cached = end_cached - start_cached

    # Store the runtime and memory usage difference
    training_times.append(end_time - start_time)
    memory_usages.append(delta_cached)
    del model, optimizer, tokenizer, dataset,  eval_dataset, data_loader, eval_data_loader
    torch.cuda.empty_cache()  # Clear CUDA cache
    

# Plotting the graphs
plt.figure(figsize=(12, 6))

# Plot for Training Time
plt.subplot(1, 2, 1)
plt.plot(max_lengths, training_times, marker='o')
plt.title('Training Time vs Max Length')
plt.xlabel('Max Length')
plt.ylabel('Training Time (seconds)')

# Plot for Memory Usage
plt.subplot(1, 2, 2)
plt.plot(max_lengths, memory_usages, marker='o', color='red')
plt.title('Memory Usage vs Max Length')
plt.xlabel('Max Length')
plt.ylabel('Memory Usage (MB)')

plt.tight_layout()
plt.show()


  from .autonotebook import tqdm as notebook_tqdm
  0%|                                                                                             | 0/4 [00:00<?, ?it/s]

Processing max_length:  128







(417.22119140625, 472.0)
Batch input_ids shape: torch.Size([11, 128])
Epoch 1/30, Average Loss: 0.9496
Batch input_ids shape: torch.Size([11, 128])
Epoch 2/30, Average Loss: 0.8857
Batch input_ids shape: torch.Size([11, 128])
Epoch 3/30, Average Loss: 0.8506
Batch input_ids shape: torch.Size([11, 128])
Epoch 4/30, Average Loss: 0.8240
Batch input_ids shape: torch.Size([11, 128])
Epoch 5/30, Average Loss: 0.7936
Batch input_ids shape: torch.Size([11, 128])
Epoch 6/30, Average Loss: 0.7609
Batch input_ids shape: torch.Size([11, 128])
Epoch 7/30, Average Loss: 0.7314
Batch input_ids shape: torch.Size([11, 128])
Epoch 8/30, Average Loss: 0.7062
Batch input_ids shape: torch.Size([11, 128])
Epoch 9/30, Average Loss: 0.6819
Batch input_ids shape: torch.Size([11, 128])
Epoch 10/30, Average Loss: 0.6576
Batch input_ids shape: torch.Size([11, 128])
Epoch 11/30, Average Loss: 0.6339
Batch input_ids shape: torch.Size([11, 128])
Epoch 12/30, Average Loss: 0.6124
Batch input_ids shape: torch.Size([1

  0%|                                                                                             | 0/4 [01:26<?, ?it/s]


KeyboardInterrupt: 

In [3]:
print(training_times)

[84.79365921020508, 92.70790839195251, 101.47371196746826, 241.2532832622528]


In [4]:
print(memory_usages)

[820.0, 4396.0, 10388.0, 17138.0]


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from glob import glob
import os
from transformers import BertConfig, BertForMaskedLM, AdamW
import time
import numpy as np
from tqdm import tqdm
import psutil
import matplotlib.pyplot as plt

# Function to get current memory usage
torch.manual_seed(0)
def get_gpu_memory_usage():
    # Returns the current GPU memory usage in MB
    allocated = torch.cuda.memory_allocated() / (1024 * 1024)
    cached = torch.cuda.memory_reserved() / (1024 * 1024)
    return allocated, cached

# Define the text files dataset
# Define the text files dataset
class TextFolderDataset(Dataset):
    def __init__(self, file_directory, file_pattern, tokenizer, max_length):
        self.filepaths = glob(os.path.join(file_directory, file_pattern))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        with open(self.filepaths[idx], 'r', encoding='utf-8') as file:
            text = file.read()
        encoding = self.tokenizer.encode(text)
        input_ids = encoding.ids[:self.max_length] + [0] * (self.max_length - len(encoding.ids[:self.max_length]))
        attention_mask = [1] * len(encoding.ids[:self.max_length]) + [0] * (self.max_length - len(encoding.ids[:self.max_length]))
        
        # Create labels (for masked language modeling, you might mask some tokens here)
        labels = input_ids[:]  # In practice, apply masking strategy here

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)  # Add labels
        }

# Function to train a WordPiece tokenizer
def train_tokenizer(file_directory, file_pattern):
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    )
    files = glob(os.path.join(file_directory, file_pattern))
    tokenizer.train(files, trainer)
    return tokenizer

def flat_accuracy(preds, labels):
    """
    Function to calculate the accuracy of our predictions vs labels.
    It flattens both the predictions and labels arrays to compare them element-wise.
    """
    # Convert the highest logit to predicted label (argmax over the last dimension)
    pred_flat = np.argmax(preds, axis=2).flatten()
    
    # Flatten the true labels array
    labels_flat = labels.flatten()

    # Calculate the number of correct predictions
    correct_predictions = np.sum(pred_flat == labels_flat)

    # Calculate accuracy as the ratio of correct predictions to total predictions
    accuracy = correct_predictions / len(labels_flat)
    
    return accuracy

# Set file paths and directory
file_directory = "openwebtext/"
file_pattern = "urlsf_subset01-32*"
eval_file_pattern = 'urlsf_subset01-33*'
max_lengths = [128, 256, 512, 768, 1024,1200,1300,1400,1900,2300,2800]

# Arrays to store runtime and memory usage
training_times = []
memory_usages = []

for max_length in tqdm(max_lengths):
    print("Processing max_length: ", max_length)

    # Train the tokenizer
    tokenizer = train_tokenizer(file_directory, file_pattern)
    tokenizer.save(f"model/tokenizer_bert_standard_{max_length}.json")

    # Load the tokenizer
    tokenizer = Tokenizer.from_file(f"model/tokenizer_bert_standard_{max_length}.json")

    # Create the dataset
    dataset = TextFolderDataset(file_directory, file_pattern, tokenizer, max_length)
    eval_dataset = TextFolderDataset(file_directory, eval_file_pattern, tokenizer, max_length)

    # Create the DataLoader
    data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
    eval_data_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

    # Initialize the BERT config and model
    config = BertConfig(
        vocab_size=tokenizer.get_vocab_size(),
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=max_length,
    )
    model = BertForMaskedLM(config).to(torch.device("cuda"))

    # Define loss and optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Start recording time and memory
    start_time = time.time()
    
    start_allocated, start_cached = get_gpu_memory_usage()
    print((start_allocated, start_cached))

    # Training loop
    model.train()
    for epoch in range(5):  # Use a smaller number of epochs for demonstration
        total_loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(torch.device("cuda"))
            attention_mask = batch['attention_mask'].to(torch.device("cuda"))
            labels = input_ids.clone()
            print(f"Batch input_ids shape: {input_ids.shape}")
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            

    # End recording time and memory
    end_time = time.time()
    end_allocated, end_cached = get_gpu_memory_usage()

    # Store the runtime and memory usage
    delta_allocated = end_allocated - start_allocated
    delta_cached = end_cached - start_cached

    # Store the runtime and memory usage difference
    training_times.append(end_time - start_time)
    memory_usages.append(delta_cached)
    del model, optimizer, tokenizer, dataset,  eval_dataset, data_loader, eval_data_loader
    torch.cuda.empty_cache()  # Clear CUDA cache
    avg_loss = total_loss / input_ids.shape[0]
    print(avg_loss)

# Plotting the graphs
plt.figure(figsize=(12, 6))

# Plot for Training Time
plt.subplot(1, 2, 1)
plt.plot(max_lengths, training_times, marker='o')
plt.title('Training Time vs Max Length')
plt.xlabel('Max Length')
plt.ylabel('Training Time (seconds)')

# Plot for Memory Usage
plt.subplot(1, 2, 2)
plt.plot(max_lengths, memory_usages, marker='o', color='red')
plt.title('Memory Usage vs Max Length')
plt.xlabel('Max Length')
plt.ylabel('Memory Usage (MB)')

plt.tight_layout()
plt.show()


  0%|                                                                                             | 0/1 [00:00<?, ?it/s]

Processing max_length:  1024







(419.85986328125, 472.0)
Batch input_ids shape: torch.Size([11, 1024])


  0%|                                                                                             | 0/1 [00:20<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 528.00 MiB. GPU 0 has a total capacty of 11.99 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Process 2960 has 17179869184.00 GiB memory in use. Of the allocated memory 18.75 GiB is allocated by PyTorch, and 135.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [2]:
print(training_times)

[22.092185020446777, 23.944892168045044, 27.08029341697693, 86.1953022480011]


In [5]:
print(memory_usages)

[35532.0, 38564.0, 42374.0]


In [2]:
class BertForMaskedLM(BertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'paris'",
        expected_loss=0.88,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

NameError: name 'BertPreTrainedModel' is not defined

In [3]:
# from transformers import BertForMaskedLM
# import inspect

# print(inspect.getsource(BertForMaskedLM))