In [1]:
!pip install transformers datasets evaluate
!pip install rouge_score
!pip install tensorboard

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

In [2]:
import json
import pandas as pd
from datasets import Dataset

## Loading Data:

The load_data function takes a JSON file, extracts the "invocation" (user input) and "cmd" (bash command) pairs, organizes them into a list of dictionaries, and returns this list.

In [3]:
# Load your JSON data
def load_data(file_path):
    """Load invocation/command pairs from a JSON file.

    The file must contain a JSON object; each of its values must itself be
    an object with an "invocation" field and a "cmd" field. The top-level
    keys are ignored.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A list with one dict per entry, in file order, of the form
        {"source": <invocation>, "target": <cmd>}.

    Raises:
        KeyError: If an entry lacks "invocation" or "cmd".
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding, which differs between systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only the values are needed; the top-level keys carry no information here.
    return [
        {"source": entry["invocation"], "target": entry["cmd"]}
        for entry in data.values()
    ]

## Splitting the data into a train-test split.

In [4]:
from sklearn.model_selection import train_test_split
# Read every invocation/command pair from the nl2bash JSON file.
all_examples = load_data("nl2bash.json")
# Hold out 20% of the examples for validation; the fixed random_state makes the split repeatable.
train_examples, val_examples = train_test_split(all_examples, test_size=0.2, random_state=42)

## Converting the data into the Hugging Face `Dataset` format so that it can be used to train the CodeT5 model.

In [5]:
# Create HF datasets
# Each list of {"source", "target"} dicts is turned into a pandas DataFrame
# and then wrapped as a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(pd.DataFrame(train_examples))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_examples))

## Loading the CodeT5 model and tokenizer.

In [6]:
from transformers import T5ForConditionalGeneration, RobertaTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pre-trained CodeT5 model and tokenizer
model_name = "Salesforce/codet5-small"  # You can also try codet5-base or codet5-large
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define maximum lengths for source and target
# (used as max_length when tokenizing inputs and labels in preprocess_function)
max_source_length = 512
max_target_length = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

## Data Preprocessing:

preprocess_function takes raw text inputs and targets, tokenizes them, masks padding tokens for loss calculation, and organizes the data into a format suitable for training the CodeT5 model. This preparation is essential to enable effective learning and prediction by the model.

In [7]:
# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of source/target texts for seq2seq training.

    Args:
        examples: A batch mapping with "source" and "target" entries, each an
            iterable of strings.

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are
        set to -100 so they are not included in loss computation.
    """
    source_texts = list(examples["source"])
    target_texts = list(examples["target"])

    # Sources and targets are both padded/truncated to their fixed maximum length.
    model_inputs = tokenizer(
        source_texts,
        max_length=max_source_length,
        padding="max_length",
        truncation=True,
    )
    target_encoding = tokenizer(
        target_texts,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Swap every padding id in the labels for -100.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for token_ids in target_encoding["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in token_ids]
        )

    model_inputs["labels"] = masked_labels
    return model_inputs


Here we use the map function to apply the preprocess_function on the training and validation set.

In [8]:
# Apply preprocessing
# batched=True hands preprocess_function a batch of examples per call
# rather than one example at a time.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8277 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Map:   0%|          | 0/2070 [00:00<?, ? examples/s]

## Defining Model and Model Training:


In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define training arguments
import torch  # imported here only to check whether a CUDA device is available

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    # Let generation inside predict()/evaluate() run up to the same length the
    # labels were tokenized with; otherwise it falls back to the model's
    # default generation length, which can cut predictions short.
    generation_max_length=max_target_length,
    # Mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    logging_dir="./logs",
)

# Define trainer
# `processing_class` is the current name of the argument that used to be
# called `tokenizer` (the old name triggers a deprecation warning).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so they can be
# reloaded together from the same directory.
model.save_pretrained("./codet5-finetuned-command-generation")
tokenizer.save_pretrained("./codet5-finetuned-command-generation")

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.707,1.348572
2,1.3345,1.183876
3,1.15,1.093506
4,1.0512,1.036742
5,0.9685,1.006041
6,0.8813,0.989459
7,0.832,0.970009
8,0.7876,0.96405
9,0.753,0.957908
10,0.7294,0.957768


('./codet5-finetuned-command-generation/tokenizer_config.json',
 './codet5-finetuned-command-generation/special_tokens_map.json',
 './codet5-finetuned-command-generation/vocab.json',
 './codet5-finetuned-command-generation/merges.txt',
 './codet5-finetuned-command-generation/added_tokens.json')

## Results processing:
Defining a helper that decodes token IDs, then generating predictions for the validation set and accessing their token IDs.

In [10]:
import evaluate
import numpy as np

def safe_decode_batch(batch_ids):
    """Decode a batch of token-id sequences into strings.

    Any entry that is None or equal to -100 is replaced by the tokenizer's
    pad token id before decoding; special tokens are skipped in the output.

    Args:
        batch_ids: An iterable of token-id sequences.

    Returns:
        A list of decoded strings, one per sequence.
    """
    sanitized_batch = [
        [
            tokenizer.pad_token_id if (token_id is None or token_id == -100) else token_id
            for token_id in sequence
        ]
        for sequence in batch_ids
    ]
    return tokenizer.batch_decode(sanitized_batch, skip_special_tokens=True)


# Generate predictions on the tokenized validation set using the trainer
predictions = trainer.predict(tokenized_val)
# Keep the predicted ids and the reference label ids from the prediction output
pred_ids = predictions.predictions
label_ids = predictions.label_ids



## Model Evaluation:
Evaluating the model on metrics such as BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and Exact Match.

In [11]:
import numpy as np
import evaluate
from transformers import DataCollatorForSeq2Seq

# Define metrics
# Each metric is loaded by name through `evaluate.load`; compute_metrics calls their `.compute`.
exact_match_metric = evaluate.load("exact_match")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute BLEU, ROUGE and exact-match scores for one evaluation pass.

    Args:
        eval_preds: A (predictions, labels) pair of token-id batches as
            passed in by the trainer. Label positions set to -100 are
            treated as padding.

    Returns:
        A dict with the keys "bleu", "rouge1", "rouge2", "rougeL" and
        "exact_match".
    """
    preds, labels = eval_preds

    # Defensive: if the predictions arrive wrapped in a tuple, the token ids
    # are taken to be its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode exactly what the trainer handed to this call. safe_decode_batch
    # maps -100 (and None) to the pad token id before decoding, so no separate
    # replacement step is needed.
    decoded_preds = safe_decode_batch(preds)
    decoded_labels = safe_decode_batch(labels)

    # Compute BLEU score
    bleu_results = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute ROUGE scores
    rouge_results = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute exact match
    exact_match_results = exact_match_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Combine all metrics
    return {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "exact_match": exact_match_results["exact_match"],
    }

# Update the trainer with the metrics
# A new trainer is built around the same model and arguments so that
# compute_metrics and a seq2seq data collator are used during evaluation.
# `processing_class` is the current name of the argument that used to be
# called `tokenizer` (the old name triggers a deprecation warning).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

# Run evaluation
evaluation_results = trainer.evaluate()
print(evaluation_results)

Saving the prompts, predictions, and original answers to a CSV file.

In [12]:
import pandas as pd

def generate_command(invocation):
    """Generate a command string for a natural-language invocation.

    The invocation is tokenized, moved to the model's device, and decoded
    with beam search (5 beams, up to 512 tokens, early stopping).

    Args:
        invocation: The natural-language request.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    device = model.device
    encoded = tokenizer(invocation, return_tensors="pt", padding=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    generated = model.generate(
        on_device["input_ids"],
        max_length=512,
        num_beams=5,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Collect predictions
# One record per validation example: the prompt, the generated command, and
# the reference command.
rows = [
    {
        "Prompt": example["source"],
        "Predicted Answer": generate_command(example["source"]),
        "Original Answer": example["target"],
    }
    for example in val_dataset
]

# Convert to DataFrame and save (the index column is not written)
df = pd.DataFrame(rows)
df.to_csv("codet5_predictions_epochs-10.csv", index=False)
print("Predictions saved to codet5_predictions_epochs-10.csv")


## Generating answers for manual prompts

In [13]:
from datetime import datetime
import time

def generate_command(invocation):
    """Turn a natural-language invocation into a generated command string.

    Note: this re-declares the `generate_command` function from the previous
    cell with the same behavior.

    Args:
        invocation: The natural-language request.

    Returns:
        The decoded text of the first generated sequence, without special
        tokens.
    """
    # Beam-search settings passed to model.generate
    generation_options = {"max_length": 512, "num_beams": 5, "early_stopping": True}

    # Tokenize the request and move every tensor to the model's device
    batch = tokenizer(invocation, return_tensors="pt", padding=True)
    batch = {key: value.to(model.device) for key, value in batch.items()}

    # Generate and decode the first returned sequence
    sequences = model.generate(batch["input_ids"], **generation_options)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# Record start time right before generation so the measured span covers the
# generate_command call
start_time = datetime.now()

# Example usage
test_invocation = '''Find files containing "TODO" in their name'''
generated_command = generate_command(test_invocation)
# Record end time
end_time = datetime.now()
# Report the prompt, the generated command, and the wall-clock timing
print(f"Invocation: {test_invocation}")
print(f"Generated command: {generated_command}")
print(f"Process started at: {start_time}")
print(f"Process ended at: {end_time}")
print(f"Total time taken: {end_time - start_time}")

Invocation: Find files containing "TODO" in their name
Generated command: find . -name TODO -print
Process started at: 2025-05-07 15:44:24.767461
Process ended at: 2025-05-07 15:44:25.894182
Total time taken: 0:00:01.126721


In [14]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Function to generate predictions for the validation set
# def get_predictions():
#     predictions = []
#     references = []

#     for i in range(len(val_dataset)):
#         invocation = val_dataset[i]["source"]
#         reference = val_dataset[i]["target"]
#         prediction = generate_command(invocation)

#         predictions.append(prediction)
#         references.append(reference)

#     return predictions, references

# # Generate predictions
# predictions, references = get_predictions()

# # Calculate metrics
# exact_match = sum(1 for p, r in zip(predictions, references) if p == r) / len(predictions)
# bleu_scores = [bleu_metric.compute(predictions=[p], references=[r])["bleu"] for p, r in zip(predictions, references)]
# rouge_scores = [rouge_metric.compute(predictions=[p], references=[r])["rougeL"] for p, r in zip(predictions, references)]





In [15]:
# # Create visualizations
# plt.figure(figsize=(15, 10))

# # Plot 1: Distribution of BLEU scores
# plt.subplot(2, 2, 1)
# sns.histplot(bleu_scores)
# plt.title(f"Distribution of BLEU Scores (Mean: {np.mean(bleu_scores):.4f})")
# plt.xlabel("BLEU Score")

# # Plot 2: Distribution of ROUGE-L scores
# plt.subplot(2, 2, 2)
# sns.histplot(rouge_scores)
# plt.title(f"Distribution of ROUGE-L Scores (Mean: {np.mean(rouge_scores):.4f})")
# plt.xlabel("ROUGE-L Score")

# # Plot 3: Command length comparison
# pred_lengths = [len(p) for p in predictions]
# ref_lengths = [len(r) for r in references]
# plt.subplot(2, 2, 3)
# sns.scatterplot(x=ref_lengths, y=pred_lengths)
# plt.title("Command Length Comparison")
# plt.xlabel("Reference Length")
# plt.ylabel("Prediction Length")
# plt.plot([0, max(ref_lengths)], [0, max(ref_lengths)], linestyle='--', color='r')

# # Plot 4: Metrics summary
# plt.subplot(2, 2, 4)
# metrics = {
#     "Exact Match": exact_match,
#     "Average BLEU": np.mean(bleu_scores),
#     "Average ROUGE-L": np.mean(rouge_scores)
# }
# sns.barplot(x=list(metrics.keys()), y=list(metrics.values()))
# plt.title("Performance Metrics")
# plt.ylim(0, 1)

# plt.tight_layout()
# plt.savefig("model_performance.png")
# plt.show()

In [16]:
# # Create confusion matrix for commonly misclassified commands
# from collections import Counter

# # Find most common commands in the dataset
# common_commands = Counter(references).most_common(10)
# common_cmd_list = [cmd for cmd, _ in common_commands]

# # Create confusion matrix
# confusion_data = []
# for pred, ref in zip(predictions, references):
#     if ref in common_cmd_list:
#         confusion_data.append((ref, pred))

# # Convert to a format suitable for visualization
# confusion_df = pd.DataFrame(confusion_data, columns=["Reference", "Prediction"])
# confusion_matrix = pd.crosstab(confusion_df["Reference"], confusion_df["Prediction"])

# # Plot confusion matrix
# plt.figure(figsize=(12, 10))
# sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
# plt.title("Confusion Matrix for Common Commands")
# plt.ylabel("Reference Command")
# plt.xlabel("Predicted Command")
# plt.tight_layout()
# plt.savefig("confusion_matrix.png")
# plt.show()