#### 1. Install all requirements

In [None]:
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install tensorflow
!pip install tqdm

#### 2. Get data from open-source library TensorFlow

Firstly, we need to download the data

In [4]:
import inspect
import tensorflow

functions = inspect.getmembers(tensorflow, inspect.isfunction)
method_names = [f[0] for f in functions]
code_snippets = [inspect.getsource(f[1]) for f in functions]

print('method name:', method_names[0])
print('method code:\n', code_snippets[0])

method name:  Assert
method code:
 @tf_export("debugging.Assert", "Assert")
@dispatch.add_dispatch_support
@tf_should_use.should_use_result
def Assert(condition, data, summarize=None, name=None):
  """Asserts that the given condition is true.

  If `condition` evaluates to false, print the list of tensors in `data`.
  `summarize` determines how many entries of the tensors to print.

  Args:
    condition: The condition to evaluate.
    data: The tensors to print out when condition is false.
    summarize: Print this many entries of each tensor.
    name: A name for this operation (optional).

  Returns:
    assert_op: An `Operation` that, when executed, raises a
    `tf.errors.InvalidArgumentError` if `condition` is not true.
    @compatibility(eager)
    returns None
    @end_compatibility

  Raises:
    @compatibility(TF1)
    When in TF V1 mode (that is, outside `tf.function`) Assert needs a control
    dependency on the output to ensure the assertion executes:

  ```python
  # Ensu

Split the data into training and validation sets

In [5]:
from sklearn.model_selection import train_test_split

train_code, eval_code, train_methods, eval_methods = train_test_split(
    code_snippets, method_names, test_size=0.2, random_state=42
)

Class to represent the data

In [6]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from typing import Dict, List, Optional
import torch

class CodeDataset(Dataset):
    def __init__(self, code_snippets: List[str], method_names: List[str], tokenizer: AutoTokenizer, max_length: int):
        self.inputs = code_snippets
        self.targets = method_names
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        input = self.inputs[idx]
        target = self.targets[idx]

        input_ids = self.tokenizer(
            input,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze()

        target_ids = self.tokenizer(
            target,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze()

        return {
            "input_ids": input_ids,
            "labels": target_ids,
        }

#### 3. Set up and fine-tune the model

In [7]:
from torch import nn

def eval_model(model: nn.Module, eval_loader: DataLoader, device: torch.device) -> float:
    model.eval()
    total_loss = 0.0
    for batch in eval_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels)
            loss = outputs[0]
            total_loss += loss.item()

    return total_loss / len(eval_loader)

In [14]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
from tqdm import tqdm

checkpoint = "Salesforce/codet5p-220m"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)

train_dataset = CodeDataset(train_code, train_methods, tokenizer)
eval_dataset = CodeDataset(eval_code, eval_methods, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optim = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Let's see how our model performs before training
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=True)
eval_loss = eval_model(model, eval_loader, device)
print(f'loss before fine-tuning: {eval_loss}')

for epoch in range(1, 30):
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
    eval_loss = eval_model(model, eval_loader, device)
    print(f"epoch: {epoch}, loss: {eval_loss}")



loss before fine-tuning: 13.898907279968261


100%|██████████| 20/20 [00:09<00:00,  2.07it/s]


epoch: 1, loss: 5.395035743713379


100%|██████████| 20/20 [00:09<00:00,  2.04it/s]


epoch: 2, loss: 4.520201873779297


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 3, loss: 3.935852289199829


100%|██████████| 20/20 [00:10<00:00,  2.00it/s]


epoch: 4, loss: 3.281422805786133


100%|██████████| 20/20 [00:09<00:00,  2.00it/s]


epoch: 5, loss: 2.5783191680908204


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 6, loss: 1.8931793928146363


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 7, loss: 1.2813795566558839


100%|██████████| 20/20 [00:09<00:00,  2.04it/s]


epoch: 8, loss: 0.819016432762146


100%|██████████| 20/20 [00:09<00:00,  2.04it/s]


epoch: 9, loss: 0.516479742527008


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 10, loss: 0.342057341337204


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 11, loss: 0.243259996175766


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 12, loss: 0.18193749189376832


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 13, loss: 0.14347895979881287


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 14, loss: 0.11736719012260437


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 15, loss: 0.09817970842123032


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 16, loss: 0.08371521383523942


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 17, loss: 0.07290228754281998


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 18, loss: 0.06206633150577545


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 19, loss: 0.06047484278678894


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 20, loss: 0.05937379896640778


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 21, loss: 0.05085800513625145


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 22, loss: 0.04579186961054802


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 23, loss: 0.03767436221241951


100%|██████████| 20/20 [00:09<00:00,  2.04it/s]


epoch: 24, loss: 0.0333365224301815


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 25, loss: 0.031221393123269083


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 26, loss: 0.03253896273672581


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 27, loss: 0.026241499185562133


100%|██████████| 20/20 [00:09<00:00,  2.02it/s]


epoch: 28, loss: 0.02415339685976505


100%|██████████| 20/20 [00:09<00:00,  2.03it/s]


epoch: 29, loss: 0.022435473650693892
