In [54]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import pandas as pd
import torch
import os
import warnings

In [55]:
import torch
torch.cuda.empty_cache()


In [56]:
# Suppress warnings
warnings.simplefilter("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [57]:
# Define the Dataset class
class PretrainingDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        tokenized = self.tokenizer(
            text=text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze(),
            "labels": tokenized["input_ids"].squeeze(),
        }


In [58]:
# Load data
DATA_PATH = "train.csv"
data = pd.read_csv(DATA_PATH)
#texts = data["full_text"].values
texts = data["full_text"].values[:500]  # Use first 500 samples


In [59]:
# Load model and tokenizer
'''
MODEL_NAME_OR_PATH = "microsoft/deberta-v3-base"
'''
MODEL_NAME_OR_PATH = "distilbert-base-uncased"
MAX_LENGTH = 512


In [60]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME_OR_PATH)

In [61]:
# Create dataset
dataset = PretrainingDataset(
    texts=texts,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)


In [62]:
# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [63]:
training_args = TrainingArguments(
    output_dir="./results",
    run_name="masked_lm_experiment",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    report_to="none",  # Disable W&B
)


In [64]:
from sklearn.model_selection import train_test_split

# Split the dataset
train_texts, eval_texts = train_test_split(texts, test_size=0.2, random_state=42)

# Create evaluation dataset
eval_dataset = PretrainingDataset(
    texts=eval_texts,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Initialize Trainer with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,  # Add evaluation dataset
    data_collator=data_collator,
)


In [65]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [66]:
# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,2.323203
2,No log,2.270113
3,No log,2.159868


{'eval_loss': 2.155928134918213,
 'eval_runtime': 0.8257,
 'eval_samples_per_second': 121.111,
 'eval_steps_per_second': 15.744,
 'epoch': 3.0}

In [69]:
custom_encodings = tokenizer(custom_texts, truncation=True, padding=True, max_length=MAX_LENGTH)


In [70]:
class CustomTextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

custom_dataset = CustomTextDataset(custom_encodings)


In [71]:
model.eval()  # Set the model to evaluation mode

custom_loader = torch.utils.data.DataLoader(custom_dataset, batch_size=1)

with torch.no_grad():  # Disable gradient calculations for inference
    for batch in custom_loader:
        outputs = model(**batch)
        predictions = outputs.logits  # For classification, this will be logits
        predicted_class = torch.argmax(predictions, dim=-1)  # Get the predicted class index
        print(f"Prediction: {predicted_class.item()}")


Prediction: 1
Prediction: 1


In [84]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model's embedding layer to accommodate the new token
model.resize_token_embeddings(len(tokenizer))

# Define custom text inputs
custom_texts = [ "NLP is based on transformers."]

# Tokenize the custom texts and return attention mask
custom_encodings = tokenizer(custom_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt", return_attention_mask=True)

# Explicitly set pad_token_id
pad_token_id = tokenizer.pad_token_id

generated_text = model.generate(
    input_ids=custom_encodings['input_ids'],
    attention_mask=custom_encodings['attention_mask'],
    max_length=50,
    pad_token_id=pad_token_id,
    top_p=0.9,  # Nucleus sampling (probability threshold)
    temperature=0.7,  # Control randomness
    no_repeat_ngram_size=2  # Avoid repeating n-grams
)



In [85]:

decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(f"Generated Text: {decoded_text}")


Generated Text: NLP is based on transformers.

The first step is to create a transformer. The first transformer is a simple one. It is used to transform a single value into a number. For example, if we have a value of 1,
