#### 1. Install all the requirements

In [6]:
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install inspect
!pip install tensorflow
!pip install tqdm

[31mERROR: Could not find a version that satisfies the requirement inspect[0m
[31mERROR: No matching distribution found for inspect[0m
^C
[31mERROR: Operation cancelled by user[0m


#### 2. Get the data from open-source library TensorFlow

Firstly, we need to download the data

In [7]:
import inspect
import tensorflow

functions = inspect.getmembers(tensorflow, inspect.isfunction)
method_names = [f[0] for f in functions]
code_snippets = [inspect.getsource(f[1]) for f in functions]

print(method_names[0])
print(code_snippets[0])

Assert
@tf_export("debugging.Assert", "Assert")
@dispatch.add_dispatch_support
@tf_should_use.should_use_result
def Assert(condition, data, summarize=None, name=None):
  """Asserts that the given condition is true.

  If `condition` evaluates to false, print the list of tensors in `data`.
  `summarize` determines how many entries of the tensors to print.

  Args:
    condition: The condition to evaluate.
    data: The tensors to print out when condition is false.
    summarize: Print this many entries of each tensor.
    name: A name for this operation (optional).

  Returns:
    assert_op: An `Operation` that, when executed, raises a
    `tf.errors.InvalidArgumentError` if `condition` is not true.
    @compatibility(eager)
    returns None
    @end_compatibility

  Raises:
    @compatibility(TF1)
    When in TF V1 mode (that is, outside `tf.function`) Assert needs a control
    dependency on the output to ensure the assertion executes:

  ```python
  # Ensure maximum element of x is s

Split the data into training and validation sets

In [8]:
from sklearn.model_selection import train_test_split

train_code, eval_code, train_methods, eval_methods = train_test_split(
    code_snippets, method_names, test_size=0.2, random_state=42
)

Class to represent the data

In [9]:
from torch.utils.data import DataLoader, Dataset
import torch

class CodeDataset(Dataset):
    def __init__(self, code_snippets, method_names, tokenizer, max_length=128):
        self.inputs = code_snippets
        self.targets = method_names
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input = self.inputs[idx]
        target = self.targets[idx]

        input_ids = self.tokenizer(
            input,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze()

        target_ids = self.tokenizer(
            target,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze()

        return {
            "input_ids": input_ids,
            "labels": target_ids,
        }

#### 3. Set up and fine-tune the model

In [10]:
def eval_model(model, eval_loader, device):
    model.eval()
    total_loss = 0.0
    for batch in eval_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels)
            loss = outputs[0]
            total_loss += loss.item()

    return total_loss / len(eval_loader)

In [11]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
from tqdm import tqdm

checkpoint = "Salesforce/codet5p-220m"
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)

train_dataset = CodeDataset(train_code, train_methods, tokenizer)
eval_dataset = CodeDataset(eval_code, eval_methods, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optim = torch.optim.AdamW(model.parameters(), lr=0.0005)

# Let's see how our model performs before training
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=True)
loss = eval_model(model, eval_loader, device)
print(f'loss before fine-tuning: {loss}')

for epoch in range(1, 15):
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
    loss = eval_model(model, eval_loader, device)
    print(f"epoch: {epoch}, loss: {loss}")



loss before fine-tuning: 13.898892974853515


100%|██████████| 20/20 [02:08<00:00,  6.43s/it]


epoch: 0, loss: 4.229760837554932


100%|██████████| 20/20 [02:08<00:00,  6.44s/it]


epoch: 1, loss: 1.7571238994598388


 40%|████      | 8/20 [00:55<01:23,  6.92s/it]


KeyboardInterrupt: 