In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset

# example code snippets
X = [
    "def max(a, b): if a > b: return a else return b",
    "def factorial(n): if n == 0: return 1 else: return n * factorial(n-1)",
    "def fibonacci(n): if n <= 1: return n else: return fibonacci(n-1) + fibonacci(n-2)",
    "def is_palindrome(s): return s == s[::-1]",
    "def bubble_sort(arr): for i in range(len(arr)): for j in range(0, len(arr)-i-1): if arr[j] > arr[j+1]: arr[j], arr[j+1] = arr[j+1], arr[j]",
    "def is_prime(n): if n <= 1: return False for i in range(2, int(n**0.5)+1): if n % i == 0: return False return True",
    "def reverse_list(lst): return lst[::-1]",
    "def gcd(a, b): while b: a, b = b, a % b return a",
    "def binary_search(arr, x): l, r = 0, len(arr)-1 while l <= r: mid = (l + r) // 2 if arr[mid] == x: return mid elif arr[mid] < x: l = mid + 1 else: r = mid - 1 return -1",
    "def sum_list(lst): return sum(lst)",
    "def celsius_to_fahrenheit(c): return (c * 9/5) + 32"
]
y_true = [
    """```mermaid
    graph TD
        A[max] --> B{a > b}
        B -->|Yes| C[Return a]
        B -->|No| D[Return b]
    ```""",
    """```mermaid
    graph TD
        A[factorial n] --> B{n == 0}
        B --> |Yes| C[return 1]
        B --> |No| D[return n * m]
        D --> |m = factorial n-1| A
    ```""",
    """```mermaid
    graph TD
        A[fibonacci n] --> B{n <= 1}
        B --> |Yes| C[return n]
        B --> |No| D[return i + j]
        D --> |i = fibonacci n-1| A
        D --> |j = fibonacci n-2| A
    ```""",
    """```mermaid
    graph TD
        A[is_palindrome s] --> B{"s == s[::-1]"}
        B --> |Yes| C[return True]
        B --> |No| D[return False]
    ```""",
    """```mermaid
    graph TD
        A[bubble_sort arr] --> B["loop i from 0 to len(arr)-1"]
        B --> C["loop j from 0 to len(arr)-i-2"]
        C --> |End| B
        C --> D{"arr[j] > arr[j+1]"}
        D --> |Yes| E["swap arr[j] and arr[j+1]"]
        D --> |No| F[continue]
        E --> F
        F --> C
        B --> |End| I[return arr]
    ```""",
    """```mermaid
    graph TD
        A[is_prime n] --> B{n <= 1}
        B --> |Yes| C[return False]
        B --> |No| D["loop i from 2 to int(n**0.5)+1"]
        D --> E{"n  % i == 0"}
        E --> |Yes| C
        E --> |No| G[continue]
        G --> D
        D --> |End| F[return True]
    ```""",
    """```mermaid
    graph TD
        A[reverse_list lst] --> B["return lst[::-1]"]
    ```""",
    """```mermaid
    graph TD
        A[gcd a b] --> B[while b]
        B --> C[a, b = b, a % b]
        C --> B
        B --> |End| D[return a]
    ```""",
    """```mermaid
    graph TD
        A[biary_search arr x] --> B["l, r = 0, len(arr)-1"]
        B --> C[while l <= r]
        C --> |End| D[return -1]
        C --> E["mid = (l + r) // 2"]
        E --> F{"arr[mid] == x"}
        F --> |Yes| G[return mid]
        F --> |No| H{"arr[mid] < x"}
        H --> |Yes| I[l = mid + 1]
        H --> |No| J[r = mid - 1]
        J & I --> K[continue]
        K --> C
    ```""",
    """```mermaid
    graph TD
        A[sum_list lst] --> B["return sum(lst)"]
    ```""",
    """```mermaid
    graph TD
        A[celsius_to_fahrenheit c] --> B["return (c * 9/5) + 32"]
    ```"""
]

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the input and target
def tokenize_function(source, target):
    source_encodings = tokenizer(source, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
    target_encodings = tokenizer(target, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
    return source_encodings, target_encodings

source_encodings, target_encodings = tokenize_function(X, y_true)

# Device setup
cuda_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# Load the model
model = RobertaForCausalLM.from_pretrained("roberta-base", is_decoder=True).to(device)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    logging_steps=1,
    learning_rate=2e-2,
    per_device_train_batch_size=11,
    num_train_epochs=200,
    weight_decay=0.01,
    predict_with_generate=True
)

# Prepare the dataset in a format compatible with Trainer
class CodeDataset(Dataset):
    def __init__(self, source_encodings, target_encodings):
        self.source_encodings = source_encodings
        self.target_encodings = target_encodings

    def __len__(self):
        return len(self.source_encodings.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.source_encodings.input_ids[idx].to(device),
            'attention_mask': self.source_encodings.attention_mask[idx].to(device),
            'labels': self.target_encodings.input_ids[idx].to(device)
        }

  cpu = _conversion_method_template(device=torch.device("cpu"))
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Create dataset
dataset = CodeDataset(source_encodings, target_encodings)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Fine-tune the model
trainer.train()

# New code snippet to translate
new_code = "def max(a, b): if a > b: return a else return b"

# Tokenize the new input
new_input = tokenizer(new_code, return_tensors="pt").to(device)

_ = model.to(device)

In [None]:
# Generate the translated code
generated_tokens = model.generate(
    input_ids=new_input['input_ids'],
    attention_mask=new_input['attention_mask'],
    max_length=256,
    num_return_sequences=1,
)

# Decode the generated tokens to text
generated_code = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

print(generated_code)

In [None]:
# New code snippet to translate
new_code = "def factorial(n): if n == 0: return 1 else: return n * factorial(n-1)"

# Tokenize the new input
new_input = tokenizer(new_code, return_tensors="pt").to(device)

generated_tokens = model.generate(
    input_ids=new_input['input_ids'],
    attention_mask=new_input['attention_mask'],
    max_length=256,
    num_return_sequences=1,
)

generated_code = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
generated_code

---

In [31]:
from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
import torch

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
config.is_decoder = True
model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

inputs = tokenizer("Hello, my dog is cute and", return_tensors="pt")
outputs = model.generate(**inputs, max_length=256)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Hello, my dog is cute and'

In [27]:
model.prepare_inputs_for_generation(inputs.input_ids)

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     8,     2]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'past_key_values': None}

In [30]:
model.generate(inputs.input_ids, max_length=256)

tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     8,     2,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [32]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Encode the input text
inputs = tokenizer("Hello, my dog is cute and", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs, max_length=256, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hello, my dog is cute and I love him. I'm not sure if he's a good dog, but I'm sure he's a good dog. I'm not sure if he's a good dog, but I'm sure he's a good dog.

I'm not sure if he's a good dog, but I'm sure he's a good dog. I'm not sure if he's a good dog, but I'm sure he's a good dog.

I'm not sure if he's a good dog, but I'm sure he's a good dog. I'm not sure if he's a good dog, but I'm sure he's a good dog.

I'm not sure if he's a good dog, but I'm sure he's a good dog. I'm not sure if he's a good dog, but I'm sure he's a good dog.

I'm not sure if he's a good dog, but I'm sure he's a good dog. I'm not sure if he's a good dog, but I'm sure he's a good dog.

I'm not sure if he's a good dog, but I'm sure he's a good dog. I'm not sure


---

In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

X = [
    "def max(a, b): if a > b: return a else return b",
    "def factorial(n): if n == 0: return 1 else: return n * factorial(n-1)",
    "def fibonacci(n): if n <= 1: return n else: return fibonacci(n-1) + fibonacci(n-2)",
    "def is_palindrome(s): return s == s[::-1]",
    "def bubble_sort(arr): for i in range(len(arr)): for j in range(0, len(arr)-i-1): if arr[j] > arr[j+1]: arr[j], arr[j+1] = arr[j+1], arr[j]",
    "def is_prime(n): if n <= 1: return False for i in range(2, int(n**0.5)+1): if n % i == 0: return False return True",
    "def reverse_list(lst): return lst[::-1]",
    "def gcd(a, b): while b: a, b = b, a % b return a",
    "def binary_search(arr, x): l, r = 0, len(arr)-1 while l <= r: mid = (l + r) // 2 if arr[mid] == x: return mid elif arr[mid] < x: l = mid + 1 else: r = mid - 1 return -1",
    "def sum_list(lst): return sum(lst)",
    "def celsius_to_fahrenheit(c): return (c * 9/5) + 32"
]
y_true = [
    """```mermaid
    graph TD
        A[max] --> B{a > b}
        B -->|Yes| C[Return a]
        B -->|No| D[Return b]
    ```""",
    """```mermaid
    graph TD
        A[factorial n] --> B{n == 0}
        B --> |Yes| C[return 1]
        B --> |No| D[return n * m]
        D --> |m = factorial n-1| A
    ```""",
    """```mermaid
    graph TD
        A[fibonacci n] --> B{n <= 1}
        B --> |Yes| C[return n]
        B --> |No| D[return i + j]
        D --> |i = fibonacci n-1| A
        D --> |j = fibonacci n-2| A
    ```""",
    """```mermaid
    graph TD
        A[is_palindrome s] --> B{"s == s[::-1]"}
        B --> |Yes| C[return True]
        B --> |No| D[return False]
    ```""",
    """```mermaid
    graph TD
        A[bubble_sort arr] --> B["loop i from 0 to len(arr)-1"]
        B --> C["loop j from 0 to len(arr)-i-2"]
        C --> |End| B
        C --> D{"arr[j] > arr[j+1]"}
        D --> |Yes| E["swap arr[j] and arr[j+1]"]
        D --> |No| F[continue]
        E --> F
        F --> C
        B --> |End| I[return arr]
    ```""",
    """```mermaid
    graph TD
        A[is_prime n] --> B{n <= 1}
        B --> |Yes| C[return False]
        B --> |No| D["loop i from 2 to int(n**0.5)+1"]
        D --> E{"n  % i == 0"}
        E --> |Yes| C
        E --> |No| G[continue]
        G --> D
        D --> |End| F[return True]
    ```""",
    """```mermaid
    graph TD
        A[reverse_list lst] --> B["return lst[::-1]"]
    ```""",
    """```mermaid
    graph TD
        A[gcd a b] --> B[while b]
        B --> C[a, b = b, a % b]
        C --> B
        B --> |End| D[return a]
    ```""",
    """```mermaid
    graph TD
        A[biary_search arr x] --> B["l, r = 0, len(arr)-1"]
        B --> C[while l <= r]
        C --> |End| D[return -1]
        C --> E["mid = (l + r) // 2"]
        E --> F{"arr[mid] == x"}
        F --> |Yes| G[return mid]
        F --> |No| H{"arr[mid] < x"}
        H --> |Yes| I[l = mid + 1]
        H --> |No| J[r = mid - 1]
        J & I --> K[continue]
        K --> C
    ```""",
    """```mermaid
    graph TD
        A[sum_list lst] --> B["return sum(lst)"]
    ```""",
    """```mermaid
    graph TD
        A[celsius_to_fahrenheit c] --> B["return (c * 9/5) + 32"]
    ```"""
]
device = torch.device("cpu")


  cpu = _conversion_method_template(device=torch.device("cpu"))
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install sentencepiece
# Load a pre-trained T5 model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Tokenize the input and target
def tokenize_function(source, target):
    prompt = [f"translate python to mermaid: {s}" for s in source]
    source_encodings = tokenizer(prompt, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    target_encodings = tokenizer(target, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    return source_encodings, target_encodings

source_encodings, target_encodings = tokenize_function(X, y_true)


class CodeDataset(Dataset):
    def __init__(self, source_encodings, target_encodings):
        self.source_encodings = source_encodings
        self.target_encodings = target_encodings

    def __len__(self):
        return len(self.source_encodings.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.source_encodings.input_ids[idx].to(device),
            'attention_mask': self.source_encodings.attention_mask[idx].to(device),
            'labels': self.target_encodings.input_ids[idx].to(device)
        }

dataset = CodeDataset(source_encodings, target_encodings)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    logging_steps=1,
    learning_rate=2e-5,
    per_device_train_batch_size=11,
    num_train_epochs=100,
    weight_decay=0.01,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Example Python code input
code_input = "def max(a, b): if a > b: return a else return b"

# Construct the prompt
prompt = f"translate python to mermaid: {code_input}"

# Tokenize the input
inputs = tokenizer(prompt, return_tensors="pt")

model.to(device)

# Generate the output using T5
outputs = model.generate(inputs.input_ids, max_length=256, num_beams=5, early_stopping=False)

# Decode the output to text
mermaid_output = tokenizer.decode(outputs[0])

# Print the generated Mermaid syntax
print("Generated Mermaid syntax:")
print(mermaid_output)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Step,Training Loss
1,8.6065
2,8.4411
3,7.6189
4,7.6572
5,7.2518
6,6.9463
7,5.9566
8,5.9282
9,5.7807
10,5.6961


Generated Mermaid syntax:
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> <unk> mermaid,<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> <extra_id_0> <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> <extra_id_0> <pad><pad><pad><pad><