In [None]:
!pip install -q datasets
!pip install -q transformers

These lines import the necessary modules and libraries, including datasets for loading datasets, transformers for working with transformer models, and tensorflow for training the model

In [None]:
import datasets
import transformers
import tensorflow as tf

#### Load the code_search_net dataset

The next cell uses the `datasets` library imported above to load the Python subset of the "code_search_net" dataset.

In [None]:
# Load the Python configuration of the CodeSearchNet dataset.
python_dataset = datasets.load_dataset(
    path="code_search_net",
    name="python",
)

This line loads the "code_search_net" dataset with the language set to Python and assigns it to the variable python_dataset.

In [None]:
train_dataset = python_dataset["train"]
# Display the first 5 examples.
# Slicing a `Dataset` (`train_dataset[:5]`) returns a dict of columns, so looping
# over the slice would only yield column names; `select` keeps row-wise iteration.
for example in train_dataset.select(range(5)):
    print(example)

#### Train the model


This code snippet iterates over the first 5 examples in the training dataset and prints each example. It is used for displaying the details of the dataset.



In [None]:
# Pull the training and validation splits out of the loaded dataset.
train_dataset, val_dataset = python_dataset["train"], python_dataset["validation"]

These lines assign the training and validation subsets of the loaded dataset to the variables train_dataset and val_dataset.



In [None]:
from transformers import TFAutoModelWithLMHead, AutoTokenizer,RobertaTokenizer,TFRobertaModel

The next cell instantiates a TensorFlow model and its matching tokenizer from the "roberta-base" pre-trained checkpoint. The model architecture is based on RoBERTa.



In [None]:
# Instantiate the model.
# The training step treats the model's first output as per-token vocabulary
# logits, so a checkpoint with a language-modelling head is required; the bare
# `TFRobertaModel` encoder returns hidden states instead. `is_decoder=True`
# switches the model to causal (left-to-right) attention, which matches the
# next-token targets built later in the notebook.
model = transformers.TFRobertaForCausalLM.from_pretrained("roberta-base", is_decoder=True)

# Instantiate the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

These lines iterate over the examples in the training dataset and extract the code tokens and documentation strings. The code tokens are stored in the code_examples list, and the documentation strings are stored in the comment_examples list.

In [None]:
# Read each field as a whole column instead of decoding every row (and all of
# its other fields) in a Python loop. `list(...)` makes both results plain
# Python lists, as before.
code_examples = list(train_dataset["func_code_tokens"])
comment_examples = list(train_dataset["func_documentation_string"])

This code snippet uses the tokenizer to encode the comment examples. The comments are padded, truncated, and limited to a maximum length of 128 tokens. The encoded comments are returned as TensorFlow tensors and stored in the tokenized_comments dictionary.

In [None]:
# Encode every comment in one call: sequences are truncated to at most 128
# tokens, padded, and returned as TensorFlow tensors.
tokenized_comments = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=comment_examples,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

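These lines extract the input IDs, attention masks, and target input IDs from the tokenized_comments dictionary. The inputs drop the last token of each sequence and the targets drop the first, so the target at each position is the token that follows the corresponding input token. The attention_mask is sliced the same way so that it stays aligned with input_ids, while target_input_ids is the expected output used for training.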

In [None]:
# Prepare the input and target tensors
# Shift by one token: inputs are every token but the last, targets are every
# token but the first, so position i of the targets follows position i of the inputs.
input_ids, target_input_ids = (
    tokenized_comments["input_ids"][:, :-1],
    tokenized_comments["input_ids"][:, 1:],
)
# Trim the mask the same way so it lines up with `input_ids`.
attention_mask = tokenized_comments["attention_mask"][:, :-1]

### Fine-tune the model


These lines define the loss function and optimizer used for training the model. The loss function is sparse categorical cross-entropy, and the optimizer is Adam with a learning rate of 1e-5.



In [None]:
# Adam optimizer with a learning rate of 1e-5.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-5,
)
# Sparse categorical cross-entropy, computed from raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)

The next cell defines a single training step: it runs the model on a batch of inputs inside a gradient tape, computes the loss against the targets, and applies the resulting gradients to the model's trainable variables with the optimizer.

In [None]:
@tf.function
def train_step(inputs, targets):
    """Run one optimisation step on a batch and return its loss.

    Args:
        inputs: Token ids fed to the model (assumed shape: batch x sequence length).
        targets: Token ids the model should predict at each position; same
            shape as `inputs`.

    Returns:
        Scalar tensor holding the mean cross-entropy over the non-padding
        target tokens of the batch.
    """
    pad_id = tokenizer.pad_token_id
    # Padding positions carry no signal: hide them from attention and from the loss.
    input_mask = tf.cast(tf.not_equal(inputs, pad_id), tf.int32)
    target_mask = tf.cast(tf.not_equal(targets, pad_id), tf.float32)
    with tf.GradientTape() as tape:
        # training=True keeps dropout active while fine-tuning.
        logits = model(inputs, attention_mask=input_mask, training=True)[0]
        # Per-token loss (rather than the pre-reduced `loss_fn`) so padding can
        # be zeroed out and the mean taken over real tokens only.
        token_losses = tf.keras.losses.sparse_categorical_crossentropy(
            targets, logits, from_logits=True
        )
        loss = tf.reduce_sum(token_losses * target_mask) / tf.maximum(
            tf.reduce_sum(target_mask), 1.0
        )
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [None]:
EPOCHS = 5
BATCH_SIZE = 16  # examples per optimisation step; tune to the available memory

# Feed the model shuffled mini-batches instead of iterating the tensors one
# example at a time.
train_batches = (
    tf.data.Dataset.from_tensor_slices((input_ids, target_input_ids))
    .shuffle(buffer_size=10_000, seed=42)
    .batch(BATCH_SIZE)
)

for epoch in range(EPOCHS):
    total_loss = 0.0
    num_batches = 0
    for inputs, targets in train_batches:
        batch_loss = train_step(inputs, targets)
        # Accumulate a plain float so the running total (and the printed
        # average) is a number rather than a tensor.
        total_loss += float(batch_loss)
        num_batches += 1
    # Guard against an empty dataset.
    average_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}: Loss = {average_loss}")