In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! pip install xgboost



In [19]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [22]:
! pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=89b4a96ad622e27e72d851aa9ea300d944cda2776dddcb539497b76cc02d7ac9
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


# Generate Command based on User Input
The dataset consists of the following types:
* Create Namespace
* Delete Namespace
* Get Namespace
* Apply YAML Configs
* Describe Pod given a Pod Name
* Delete a Service
* Delete a Deployment
* Delete a Statefulset
* Delete a Secret
* Get Pods
* Get Services
* Get Deployment
* Get Statefulset
* Get Configmap
* Get Secret
* Get Pod Namespace
* Get Service Namespace

The goal is to be able to eventually auto generate commands given the user's natural language input in English.

This can be broken down into two aspects:
1. A model to classify what kind of command it is based on the above mentioned categories. But these categories can be clustered together even further.
2. A model to generate the actual command based on the input that the user has provided.

### Prepare Data

In [10]:
import pandas as pd
import json

json_path = '/content/drive/MyDrive/orchestro/data-classes.json'
json_data = None
def create_dataset(json_path):
  with open(json_path, 'r') as f:
    json_data = json.load(f)
    return json_data

json_data = create_dataset(json_path)

In [11]:
from sklearn.model_selection import train_test_split

# Prepare data lists
prompts, commands, types = [], [], []
for cmd_type, entries in json_data.items():
    for entry in entries:
        prompts.append(entry['prompt'])
        commands.append(entry['command'])
        types.append(cmd_type)

In [12]:
# Split into training and testing sets
prompts_train, prompts_test, types_train, types_test = train_test_split(prompts, types, test_size=0.2, random_state=42)
print("Sample training prompt:", prompts_train[0])
print("Corresponding command type:", types_train[0])

Sample training prompt: Can you show me the pods that are currently deployed?
Corresponding command type: get


Vectorize the data - prompts - before this can be used with the classifier(s). This will classify what type of command it is so that it can be used to get predictions from a model as to what the corresponding command should be.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize prompts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(prompts_train)
X_test = vectorizer.transform(prompts_test)

### Classification Model
This is the first model in this proposed mult-modal approach where the question is classified into the type it is.

In [14]:
# Build several classifiers to identify which model performs best for Classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Train the SVM classifier
clf = SVC(kernel='linear', probability=True)
clf.fit(X_train, types_train)
types_pred = clf.predict(X_test)
accuracy = accuracy_score(types_test, types_pred)
print(f"SVC Accuracy: {accuracy * 100:.2f}%")
print("SVC Classification Report:\n", classification_report(types_test, types_pred))

# Train the Logistic Regression Classifier
model = LogisticRegression()
model.fit(X_train, types_train)
types_pred = model.predict(X_test)
accuracy = accuracy_score(types_test, types_pred)
print(f"\nLogistic Regression Accuracy: {accuracy * 100:.2f}%")
print("Logistic Regression Classification Report:\n", classification_report(types_test, types_pred))

# Train the XGBoost Classifier
label_encoder = LabelEncoder()

X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

y_train_encoded = label_encoder.fit_transform(types_train)
y_test_encoded = label_encoder.transform(types_test)

label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label Encoding Mapping: ", label_mapping)

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_dense, y_train_encoded)
y_pred_xgb = xgb_model.predict(X_test_dense)

accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
print(f"\nXGBoost Accuracy: {accuracy_xgb * 100:.2f}%")
print("XGBoost Classification Report:\n", classification_report(y_test_encoded, y_pred_xgb))

SVC Accuracy: 96.83%
SVC Classification Report:
               precision    recall  f1-score   support

       apply       1.00      1.00      1.00         8
      create       0.80      0.80      0.80         5
      delete       0.93      0.93      0.93        15
    describe       1.00      1.00      1.00         7
         get       1.00      1.00      1.00        28

    accuracy                           0.97        63
   macro avg       0.95      0.95      0.95        63
weighted avg       0.97      0.97      0.97        63


Logistic Regression Accuracy: 98.41%
Logistic Regression Classification Report:
               precision    recall  f1-score   support

       apply       1.00      1.00      1.00         8
      create       1.00      0.80      0.89         5
      delete       0.94      1.00      0.97        15
    describe       1.00      1.00      1.00         7
         get       1.00      1.00      1.00        28

    accuracy                           0.98        63


Parameters: { "use_label_encoder" } are not used.




XGBoost Accuracy: 96.83%
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      0.80      0.89         5
           2       0.94      1.00      0.97        15
           3       1.00      0.86      0.92         7
           4       0.97      1.00      0.98        28

    accuracy                           0.97        63
   macro avg       0.98      0.93      0.95        63
weighted avg       0.97      0.97      0.97        63



In [33]:
from sklearn.preprocessing import LabelEncoder
import pickle

# Assuming `types_train` contains the command types in the training data
label_encoder = LabelEncoder()
label_encoder.fit(types_train)  # Fit on unique command types

# Save the label encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

From the above result it can be observed that LogisticRegression as a classifier performs well for classification of text into the relevant category. This approach is useful to decide what kind of command that needs to be generated in the end. The actual generation of the command will be handled by a different model.

### Zero Shot - Prediction
Test BERT by not fine tuning and check how well it performs on a random data point (input question).

The following is a zero shot approach taken to test how well the novel BERT model performs on prompts.

In [17]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Helper function to get embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Example usage: Obtain embeddings for a sample prompt
sample_prompt = "Please create a namespace named 'production'."
sample_embedding = get_embeddings(sample_prompt)

# Calculate similarity for best matching template
relevant_prompts = [p for p in prompts_train if clf.predict(vectorizer.transform([p])) == 'create']
relevant_embeddings = [get_embeddings(p) for p in relevant_prompts]
similarities = [cosine_similarity(sample_embedding.detach().numpy(), emb.detach().numpy()) for emb in relevant_embeddings]
best_match_index = similarities.index(max(similarities))

print("Best matching prompt:", relevant_prompts[best_match_index])
print("Generated command:", commands[best_match_index])



Best matching prompt: Please create a namespace named 'production'.
Generated command: kubectl create namespace <name>


Check the Performance of BERT and SBERT. We calculate the Accuracy and ROUGE Score on both the models by getting predictions on the validation set of the dataset to better understand how well they perform overall without any training.

### Zero Shot - Evaluate BERT

In [23]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import evaluate

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
rouge = evaluate.load("rouge")

# Helper function to get embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Generate embeddings for training prompts
train_embeddings = [get_embeddings(prompt) for prompt in prompts_train]

# Function to predict command type based on cosine similarity
def predict_command_type(test_prompt_embedding, k=1):
    similarities = [cosine_similarity(test_prompt_embedding, train_emb).flatten()[0] for train_emb in train_embeddings]
    top_k_indices = np.argsort(similarities)[-k:][::-1]  # Indices of Top-k similar commands

    # Get the command types for the top-k indices
    top_k_types = [types_train[i] for i in top_k_indices]
    return top_k_types, top_k_indices

# Calculate Top-1 and Top-k accuracy
top_k_values = [1, 3, 5]  # Adjust k values as needed
accuracies = {k: 0 for k in top_k_values}
rouge_scores = []

for i, test_prompt in enumerate(prompts_test):
    test_prompt_embedding = get_embeddings(test_prompt)
    true_type = types_test[i]
    true_command = commands[i]

    # Predict top-k types and check accuracy
    for k in top_k_values:
        predicted_types, top_k_indices = predict_command_type(test_prompt_embedding, k)
        if true_type in predicted_types:
            accuracies[k] += 1

        # Calculate ROUGE for the best match (Top-1)
        if k == 1:
            best_match_command = commands[top_k_indices[0]]
            rouge_result = rouge.compute(predictions=[best_match_command], references=[true_command])
            rouge_scores.append(rouge_result['rougeL'])

In [24]:
# Calculate and print accuracy for each k
total_tests = len(prompts_test)
for k, count in accuracies.items():
    print(f"Top-{k} Accuracy: {count / total_tests:.2%}")

# Calculate and print average ROUGE-L score
average_rouge_score = sum(rouge_scores) / len(rouge_scores)
print(f"Average ROUGE-L Score: {average_rouge_score:.2%}")

Top-1 Accuracy: 98.41%
Top-3 Accuracy: 100.00%
Top-5 Accuracy: 100.00%
Average ROUGE-L Score: 52.10%


### Zero Shot - Evaluate SBERT

In [25]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import evaluate

# Load SBERT model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialize ROUGE scorer
rouge = evaluate.load("rouge")

# Function to get embeddings using SBERT
def get_embeddings(text):
    return model.encode(text)

# Generate embeddings for training prompts
train_embeddings = [get_embeddings(prompt) for prompt in prompts_train]

# Function to predict command type based on cosine similarity
def predict_command_type(test_prompt_embedding, k=1):
    similarities = [cosine_similarity([test_prompt_embedding], [train_emb]).flatten()[0] for train_emb in train_embeddings]
    top_k_indices = np.argsort(similarities)[-k:][::-1]  # Indices of Top-k similar commands

    # Get the command types for the top-k indices
    top_k_types = [types_train[i] for i in top_k_indices]
    return top_k_types, top_k_indices

# Calculate Top-k accuracy and ROUGE score
top_k_values = [1, 3, 5]  # Adjust k values as needed
accuracies = {k: 0 for k in top_k_values}
rouge_scores = []

for i, test_prompt in enumerate(prompts_test):
    test_prompt_embedding = get_embeddings(test_prompt)
    true_type = types_test[i]
    true_command = commands[i]  # The actual command for the test prompt

    # Predict top-k types and check accuracy
    for k in top_k_values:
        predicted_types, top_k_indices = predict_command_type(test_prompt_embedding, k)
        if true_type in predicted_types:
            accuracies[k] += 1

        # Calculate ROUGE for the best match (Top-1)
        if k == 1:
            best_match_command = commands[top_k_indices[0]]
            rouge_result = rouge.compute(predictions=[best_match_command], references=[true_command])
            rouge_scores.append(rouge_result['rougeL'])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [26]:
# Calculate and print accuracy for each k
total_tests = len(prompts_test)
for k, count in accuracies.items():
    print(f"Top-{k} Accuracy: {count / total_tests:.2%}")

# Calculate and print average ROUGE-L score
average_rouge_score = sum(rouge_scores) / len(rouge_scores)
print(f"Average ROUGE-L Score: {average_rouge_score:.2%}")

Top-1 Accuracy: 92.06%
Top-3 Accuracy: 96.83%
Top-5 Accuracy: 100.00%
Average ROUGE-L Score: 49.67%


From the above it can be observed that the BERT Model - bert-base-uncased. This is the model that we then fine tune further to improve and ensure quality for the use case in hand.

### Fine tuning BERT Model

In [28]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
import numpy as np
import evaluate
import os

os.environ['WANDB_DISABLED'] = 'true'

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(types)))

# Encode labels
label_encoder = LabelEncoder()
types_encoded = label_encoder.fit_transform(types)

# Split the data into training and testing
prompts_train, prompts_test, types_train, types_test = train_test_split(prompts, types_encoded, test_size=0.2, random_state=42)

# Prepare datasets for Hugging Face's Trainer
train_dataset = Dataset.from_dict({'text': prompts_train, 'label': types_train})
test_dataset = Dataset.from_dict({'text': prompts_test, 'label': types_test})

# Tokenize the datasets
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set up data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define evaluation metrics
accuracy_metric = evaluate.load("accuracy")
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Compute accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']

    # Compute ROUGE-L for Top-1
    decoded_predictions = label_encoder.inverse_transform(predictions)
    decoded_labels = label_encoder.inverse_transform(labels)

    rouge_scores = []
    for pred, label in zip(decoded_predictions, decoded_labels):
        pred_text = commands[types.index(pred)]  # Get command text from type
        label_text = commands[types.index(label)]
        rouge_result = rouge_metric.compute(predictions=[pred_text], references=[label_text])
        rouge_scores.append(rouge_result['rougeL'])

    avg_rouge_l = np.mean(rouge_scores)
    return {"accuracy": accuracy, "rougeL": avg_rouge_l}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Rougel
1,No log,1.164554,0.714286,0.823129
2,No log,0.620918,0.904762,0.94898
3,No log,0.470793,0.920635,0.956916


{'eval_loss': 0.47079339623451233, 'eval_accuracy': 0.9206349206349206, 'eval_rougeL': 0.9569160997732427, 'eval_runtime': 18.5014, 'eval_samples_per_second': 3.405, 'eval_steps_per_second': 0.432, 'epoch': 3.0}


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle

# Load the fine-tuned model and tokenizer
tokenizer = BertTokenizer.from_pretrained('/content/results')  # Path to the saved model
model = BertForSequenceClassification.from_pretrained('/content/results')
model.eval()  # Set model to evaluation mode

# Load label encoder
with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# Helper function to predict command type using the fine-tuned model
def predict_command_type(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_label_id = torch.argmax(logits, dim=1).item()
    predicted_type = label_encoder.inverse_transform([predicted_label_id])[0]
    return predicted_type

# Helper function to get embeddings for fine-grained matching within predicted command type
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model.bert(**inputs)  # Use the underlying BERT model without classification head
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Example usage: Predict command type and find best matching prompt
sample_prompt = "Please create a namespace named 'production'."
predicted_type = predict_command_type(sample_prompt)

# Find relevant prompts and commands based on the predicted command type
relevant_prompts = [p for p, t in zip(prompts_train, types_train) if t == predicted_type]
relevant_commands = [c for c, t in zip(commands, types) if t == predicted_type]

# If multiple commands of the predicted type exist, use similarity matching
sample_embedding = get_embeddings(sample_prompt)
relevant_embeddings = [get_embeddings(p) for p in relevant_prompts]
similarities = [cosine_similarity([sample_embedding], [emb]).flatten()[0] for emb in relevant_embeddings]
best_match_index = similarities.index(max(similarities))

# Output the best matching prompt and its associated command
print("Predicted command type:", predicted_type)
print("Best matching prompt:", relevant_prompts[best_match_index])
print("Generated command:", relevant_commands[best_match_index])

Fill the placeholders that are present in the generated command by extracting the relevant entity from the input from the user. This will provide the final command that can be then executed in kubectl to perform the action.

In [36]:
import re

def fill_placeholders(command, prompt, namespace_name=None):
    # Extract context (namespace name) from prompt or carry over if provided
    if namespace_name:
        command = command.replace('<name>', namespace_name)
    else:
        name_match = re.search(r"named '(\w+)'", prompt)
        if name_match:
            name = name_match.group(1)
            command = command.replace('<name>', name)
            return command, name  # Return updated command and extracted name
    return command, namespace_name

# Testing placeholder filling
matched_command = commands[best_match_index]
filled_command, namespace_name = fill_placeholders(matched_command, sample_prompt)
print("Filled command:", filled_command)
print("Namespace name context:", namespace_name)

Filled command: kubectl create namespace production
Namespace name context: production
