In [44]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import pandas as pd
import torch
import os
import warnings

In [45]:
import torch
torch.cuda.empty_cache()


In [46]:
# Suppress warnings
warnings.simplefilter("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [47]:
# Define the Dataset class
class PretrainingDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        tokenized = self.tokenizer(
            text=text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze(),
            "labels": tokenized["input_ids"].squeeze(),
        }


In [48]:
# Load data
DATA_PATH = "train.csv"
data = pd.read_csv(DATA_PATH)
#texts = data["full_text"].values
texts = data["full_text"].values[:500]  # Use first 500 samples


In [49]:
# Load model and tokenizer
'''
MODEL_NAME_OR_PATH = "microsoft/deberta-v3-base"
'''
MODEL_NAME_OR_PATH = "distilbert-base-uncased"
MAX_LENGTH = 512


In [50]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME_OR_PATH)

In [51]:
# Create dataset
dataset = PretrainingDataset(
    texts=texts,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)


In [52]:
# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [53]:
training_args = TrainingArguments(
    output_dir="./results",
    run_name="masked_lm_experiment",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    report_to="none",  # Disable W&B
)


In [54]:
from sklearn.model_selection import train_test_split

# Split the dataset
train_texts, eval_texts = train_test_split(texts, test_size=0.2, random_state=42)

# Create evaluation dataset
eval_dataset = PretrainingDataset(
    texts=eval_texts,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Initialize Trainer with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,  # Add evaluation dataset
    data_collator=data_collator,
)


In [55]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [56]:
# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,2.324391
2,No log,2.266984
3,No log,2.161151


{'eval_loss': 2.156226396560669,
 'eval_runtime': 2.2277,
 'eval_samples_per_second': 44.89,
 'eval_steps_per_second': 5.836,
 'epoch': 3.0}

In [58]:
# Check if CUDA is available and move model to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [66]:
# Example input text with a masked token
input_text = "I am a south [MASK]"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")

# Move the inputs to the same device as the model (GPU or CPU)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Get the model's prediction
outputs = model(**inputs)
logits = outputs.logits

# Find the index of the [MASK] token
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Get the top 5 predictions for the [MASK] token
predicted_ids = torch.topk(logits[0, mask_token_index], k=5, dim=-1).indices

# Decode and print the top predictions
for idx, token_id in enumerate(predicted_ids[0]):
    predicted_word = tokenizer.decode(token_id)
    print(f"Prediction {idx + 1}: {predicted_word}")


Prediction 1: .
Prediction 2: ;
Prediction 3: !
Prediction 4: african
Prediction 5: american


In [67]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model's embedding layer to accommodate the new token
model.resize_token_embeddings(len(tokenizer))

# Define custom text inputs
custom_texts = [ "NLP is based on transformers."]

# Tokenize the custom texts and return attention mask
custom_encodings = tokenizer(custom_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt", return_attention_mask=True)

# Explicitly set pad_token_id
pad_token_id = tokenizer.pad_token_id

generated_text = model.generate(
    input_ids=custom_encodings['input_ids'],
    attention_mask=custom_encodings['attention_mask'],
    max_length=50,
    pad_token_id=pad_token_id,
    top_p=0.9,  # Nucleus sampling (probability threshold)
    temperature=0.7,  # Control randomness
    no_repeat_ngram_size=2  # Avoid repeating n-grams
)



In [68]:

decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(f"Generated Text: {decoded_text}")


Generated Text: NLP is based on transformers.

The first step is to create a transformer. The first transformer is a simple one. It is used to transform a single value into a number. For example, if we have a value of 1,


In [69]:
from torch.nn.functional import softmax

# Define a custom text with a partial sentence
partial_sentence = "I had breakfast this morning and i loved the way it"

# Tokenize the partial sentence
input_ids = tokenizer(partial_sentence, return_tensors="pt")["input_ids"]

# Get the model output logits
outputs = model(input_ids=input_ids)
logits = outputs.logits

# Extract the logits for the last token
last_token_logits = logits[0, -1, :]

# Apply softmax to get probabilities
probs = softmax(last_token_logits, dim=-1)

# Get the top 10 possible next words
top_k = 10
top_k_indices = torch.topk(probs, top_k).indices
top_k_words = [tokenizer.decode([idx]) for idx in top_k_indices]

# Print the results
print("Top possible next words:")
for word, prob in zip(top_k_words, probs[top_k_indices]):
    print(f"{word}: {prob.item():.4f}")


Top possible next words:
 looked: 0.1540
 was: 0.1375
 turned: 0.1069
 tasted: 0.0813
 smelled: 0.0789
 cooked: 0.0390
 came: 0.0249
 made: 0.0224
 reminded: 0.0174
 smells: 0.0108
