In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import logging
import time
import utils
from trainingArgumentsWithMPSSupport import TrainingArgumentsWithMPSSupport

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Module-level logger for this notebook; the root logger is configured to
# emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# Ask the project helper which compute device to use, then log the choice.
device = utils.getting_device()
_device_message = f"Using device: {device}"
logger.info(_device_message)

INFO:__main__:Using device: mps


device: mps


In [11]:
# Load only the 'code' column. `usecols` stops pandas from parsing and
# materialising any other columns in the file (e.g. a previously saved index
# column) before they would be discarded anyway.
df = pd.read_csv('data/rosettaCodeByLine.csv', usecols=['code'])
df.shape

(2935651, 1)

In [13]:
# Drop rows with a missing 'code' value and persist the cleaned frame.
# NOTE(review): this overwrites the same CSV that the load cell reads.
df = df.dropna()
# index=False keeps the pandas row index out of the file; otherwise every
# save/load round trip adds an extra unnamed index column to the CSV.
df.to_csv("data/rosettaCodeByLine.csv", index=False)
# Keep the shape as the last expression so the cell actually displays it.
df.shape

In [14]:
# Display the frame's (rows, columns) after the NaN rows were dropped.
df.shape

(2935634, 1)

In [12]:
# NOTE(review): this cell duplicates the earlier dropna/save cell (it was
# executed out of order as In [12]); candidate for removal.
# Drop rows with a missing 'code' value and persist the cleaned frame.
df = df.dropna()
# index=False keeps the pandas row index out of the file; otherwise every
# save/load round trip adds an extra unnamed index column to the CSV.
df.to_csv("data/rosettaCodeByLine.csv", index=False)
# Keep the shape as the last expression so the cell actually displays it.
df.shape

(2935634, 1)

In [15]:
# Load the pretrained roberta-base tokenizer and masked-LM model, and move the
# model to the selected device.
# `model_max_length` is the tokenizer's documented length-limit option;
# `max_length` is a per-call encoding argument and is not a load-time setting.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", model_max_length=512)
model = RobertaForMaskedLM.from_pretrained("roberta-base").to(device)

In [16]:
def tokenize_dataset(df, tokenizer, device=None, max_length=512, num_proc=16):
    """
    Tokenize the 'code' column of a DataFrame into a Hugging Face Dataset.

    Args:
        df: pandas DataFrame containing a 'code' column (expected to hold
            the text to tokenize).
        tokenizer: Tokenizer callable; it is invoked on batches of 'code'
            values with truncation enabled.
        device: Unused. Accepted only so that existing three-argument calls
            keep working; tokenization does not depend on the device.
        max_length: Token limit passed to the tokenizer; longer sequences
            are truncated.
        num_proc: Number of worker processes passed to ``Dataset.map``.

    Returns:
        A Dataset holding the original columns plus the tokenizer's output
        columns. Sequences are left unpadded so that padding can be applied
        per batch by the data collator at training time.
    """
    new_dataset = Dataset.from_pandas(df)

    def _tokenize_batch(batch):
        # No padding here: per-example padding was a no-op in the original
        # one-row-at-a-time map, and padding whole map batches would only
        # store extra pad tokens in the dataset.
        return tokenizer(batch['code'], truncation=True, max_length=max_length)

    # batched=True hands the tokenizer lists of rows instead of single rows,
    # cutting per-example call overhead on a multi-million-row dataset.
    tokenized_dataset = new_dataset.map(
        _tokenize_batch,
        batched=True,
        num_proc=num_proc,
    )
    return tokenized_dataset

In [17]:
# Tokenize every row of the cleaned frame (runs the multi-process map).
tokenized_dataset = tokenize_dataset(df=df, tokenizer=tokenizer, device=device)

Map (num_proc=16): 100%|██████████| 2935634/2935634 [00:45<00:00, 64908.04 examples/s] 


In [9]:
# Masked-language-modeling collator: masking is enabled with
# mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Training configuration: one epoch, batch size 32 per device, a checkpoint
# every 10000 steps with at most two checkpoints kept in ./co-roberta.
# NOTE(review): TrainingArgumentsWithMPSSupport is imported in the first cell
# but the stock TrainingArguments is used here while the logged device is
# "mps" - confirm which class is intended.
training_args = TrainingArguments(
    output_dir="./co-roberta",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10000,
    save_total_limit=2,
)


In [10]:
# Build the Trainer from the model, the training arguments, the MLM data
# collator and the tokenized training set.
trainer = Trainer(
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)