In [4]:
from transformers import RobertaTokenizer

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Read in the preprocessed text file with explicit encoding
with open('C:/langchain2/wiki/wiki_2/trext_data/processed_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()


In [7]:
# split the text into smaller chunks
chunks = [text[i:i+100] for i in range(0, len(text), 100)]

In [8]:
num_chunks = len(chunks)
print("Number of chunks created:", num_chunks)

Number of chunks created: 2064


In [9]:
from transformers import AutoTokenizer
import torch 

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [10]:
# create features for each chunk
features = []
for chunk in chunks:
    encoding = tokenizer.encode_plus(
        chunk,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )
    features.append(encoding)

# stack the features
input_ids = torch.cat([f['input_ids'] for f in features], dim=0)
attention_mask = torch.cat([f['attention_mask'] for f in features], dim=0)
token_type_ids = torch.cat([f['token_type_ids'] for f in features], dim=0)


In [11]:
input_ids

tensor([[    0,  1658, 49314,  ...,     1,     1,     1],
        [    0,  2780,  6626,  ...,     1,     1,     1],
        [    0,    75,   289,  ...,     1,     1,     1],
        ...,
        [    0,    91,     5,  ...,     1,     1,     1],
        [    0,    96,     5,  ...,     1,     1,     1],
        [    0,  4820,   162,  ...,     1,     1,     1]])

In [12]:
token_type_ids

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [13]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
}

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self,i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [15]:
dataset = Dataset(encodings)

In [16]:
dataset

<__main__.Dataset at 0x2a6f0037c40>

In [17]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [18]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x2a6f0037be0>

In [19]:
from transformers import AutoModelForMaskedLM

In [20]:
device = torch.device('cuda')

In [21]:
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
model.to(device)

XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [22]:
from transformers import AdamW
from tqdm.auto import tqdm


In [23]:
optim = AdamW(model.parameters(), lr=1e-5)



In [24]:


batch_size = 1
epoch = 4
loop = tqdm(dataloader, leave=True)
for i, batch in enumerate(loop):
    if i % batch_size == 0:
        optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    if (i + 1) % batch_size == 0:
        optim.step()
        loop.set_description(f'Epoch:{epoch}')
        loop.set_description(f'loss:{loss.item()}')

    # Free up GPU memory after every batch
    del input_ids, attention_mask, labels, outputs
    torch.cuda.empty_cache()


  0%|          | 0/129 [00:00<?, ?it/s]


KeyError: 'labels'

In [None]:
model.save_pretrained('C:/langchain2/wiki/wiki_2/trext_data/roberta_model_1')

In [2]:
# from transformers import RobertaTokenizer

# # Load the tokenizer
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# # Read in the preprocessed text file with explicit encoding
# with open('C:/langchain2/wiki/wiki_2/trext_data/processed_text.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

# # Tokenize the text using the tokenizer
# encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')


In [3]:
# encoded_input

{'input_ids': tensor([[    0,   611,   967,  1545,   879,  2636, 27802, 37771,  1855,   967,
          1545,   879,  2636,  6793,    36,  1855,   967,   705,  4839,   479,
         28667, 48079, 11696,  2660,  2400,   479,   842, 33488,   636,  5948,
            80, 11901,   523,   705,   183,  4553,   366,   710,   479, 28667,
           189, 48079,   471,  1488,  2156, 11721,  3998,  2400,  2156,  2660,
         27435,  2156, 21563,   479, 28667,  4505, 32092,   624,   186, 25606,
           141,  3623,  2156,  5852,  2660,  2400,   189,    94,   353,    76,
           479,   810,   744,   198,   112,     6,   151,     4,   664,  2156,
           793,  2156,   474,   936,   810, 20242, 40177,   281,   479,  6793,
          2504,  3723, 25934,    80,  1907, 22443,  4832,    10,   196,  1076,
           428,  1517, 11726,   687,    10,   196,    10,   242, 44724,   118,
           479,  1049, 10970,   183,   479,  6793,   189, 27884,   922,   624,
           346, 23727, 48079,  5103, 3