In [49]:
import pandas as pd
import random
df = pd.read_csv("ec2_pricing.csv")
raw_pricing_data = df[['API Name','Name', 'Compute Family','Instance Memory','vCPUs','On Demand']]

In [42]:
qa_pairs = []

for idx, row in raw_pricing_data.iterrows():
    api_name = row['API Name']
    name = row['Name']
    family = row['Compute Family']
    memory = row['Instance Memory']
    vcpus = row['vCPUs']
    price = row['On Demand']
    
       # Convert to string if not already
    memory = str(memory)
    vcpus = str(vcpus)
    price = str(price)
    
    questions = [
        (f"What is the API name for {name}?", 
         f"The API name for {name} is {api_name}. This instance is part of the {family} family, equipped with {memory} of RAM and {vcpus}, with an on-demand pricing of {price}."),
        
        (f"What is the full name of {api_name}?", 
         f"The full name of {api_name} is {name}. It belongs to the {family} compute family and offers {memory} memory with {vcpus} processing power at {price}."),
        
        (f"What compute family does {api_name} belong to?", 
         f"The {api_name} instance belongs to the {family} compute family. This instance type, also known as {name}, provides {memory} and {vcpus}, available at {price}."),
        
        (f"How much memory does {api_name} have?", 
         f"The {api_name} instance has {memory} of memory. This {name} instance is a {family} type with {vcpus} and costs {price} on-demand."),
        
        (f"How many vCPUs does {api_name} have?", 
         f"The {api_name} has {vcpus}. This {family} instance, named {name}, comes with {memory} of memory and is priced at {price}."),
        
        (f"What is the on-demand hourly price for {api_name}?", 
         f"The on-demand hourly price for {api_name} is {price}. This {name} instance offers {memory} and {vcpus}, making it suitable for {family} workloads."),
        
        (f"What is the hourly cost of {name}?", 
         f"The hourly cost of {name} ({api_name}) is {price}. This {family} instance provides {memory} of RAM and {vcpus} for your computing needs."),
        
        (f"Which compute family is {name} part of?", 
         f"The {name} is part of the {family} compute family. Known by the API name {api_name}, it features {memory}, {vcpus}, and costs {price}."),
        
        (f"What are the memory specifications for {name}?", 
         f"The {name} ({api_name}) has memory specifications of {memory}. This {family} instance also includes {vcpus} and is available at {price} on-demand."),
        
        (f"How many virtual CPUs does {name} have?", 
         f"The {name} has {vcpus}. This instance, with API name {api_name}, is a {family} type offering {memory} and priced at {price}."),
        
        (f"What is the price per hour for {api_name}?", 
         f"The price per hour for {api_name} is {price}. This {name} instance from the {family} family comes with {memory} and {vcpus}."),
        
        (f"Tell me about the {api_name} instance specifications.", 
         f"The {api_name}, also known as {name}, is a {family} instance with {memory} of memory and {vcpus}. It's available on-demand at {price}."),
        
        (f"What instance type has {memory} and {vcpus}?", 
         f"The instance type with {memory} and {vcpus} is {api_name} ({name}). This {family} instance is priced at {price} on-demand."),
        
        (f"Compare the specs of {api_name}.", 
         f"The {api_name} ({name}) specifications include: Compute Family - {family}, Memory - {memory}, vCPUs - {vcpus}, and On-Demand Price - {price}."),
        
        (f"What is the memory capacity of {api_name}?", 
         f"The memory capacity of {api_name} is {memory}. This {name} instance is classified as {family} and offers {vcpus} at a rate of {price}."),
        
        (f"How much does it cost to run {api_name} for one hour?", 
         f"Running {api_name} for one hour costs {price}. The {name} instance provides {memory} and {vcpus}, optimized for {family} use cases."),
        
        (f"What is the vCPU count for {name}?", 
         f"The vCPU count for {name} is {vcpus}. This instance ({api_name}) is a {family} type with {memory} of memory, costing {price} per hour."),
        
        (f"Which EC2 instance is called {name}?", 
         f"The EC2 instance called {name} has the API name {api_name}. It's a {family} instance featuring {memory}, {vcpus}, and priced at {price}."),
        
        (f"What type of workload is {api_name} optimized for?", 
         f"The {api_name} is optimized for {family} workloads. This {name} instance comes with {memory}, {vcpus}, and costs {price} on-demand."),
        
        (f"Describe the {api_name} instance configuration.", 
         f"The {api_name} instance configuration includes: Name - {name}, Family - {family}, Memory - {memory}, vCPUs - {vcpus}, and hourly cost of {price}."),
        
        (f"What are the key features of {api_name}?", 
         f"Key features of {api_name} ({name}): It's a {family} instance with {memory} RAM, {vcpus} processing power, available at {price} per hour on-demand."),
    ]
    
    for q, a in questions:
        qa_pairs.append({'question': q, 'answer': a})

In [43]:
len(qa_pairs)

25347

In [44]:
import json
with open('ec2_pricing_dataset.jsonl', 'w') as f:
    for pair in qa_pairs:
        f.write(json.dumps(pair) + '\n')

print(f"Generated {len(qa_pairs)} Q&A pairs and saved to ec2_pricing_dataset.jsonl")

# Now load with datasets
from datasets import load_dataset
dataset = load_dataset("json", data_files="ec2_pricing_dataset.jsonl")
print(dataset)

Generated 25347 Q&A pairs and saved to ec2_pricing_dataset.jsonl


Generating train split: 25347 examples [00:00, 683188.57 examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 25347
    })
})





In [45]:
# !pip install huggingface_hub

In [46]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="ec2_pricing_dataset.jsonl",split="train")

In [47]:
dataset_n = dataset.train_test_split(test_size=0.1)

In [48]:
# pip install --upgrade transformers

In [50]:
import torch
import os
# Force CPU usage
os.environ["CUDA_VISIBLE_DEVICES"] = ""
torch.set_default_device("cpu")

# Clear any MPS cache
if torch.backends.mps.is_available():
    torch.mps.empty_cache()


In [51]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-mini")

In [52]:
# def preprocess_function(examples):
#     # Combine question and answer in instruction format
#     prompts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    
#     return tokenizer(
#         prompts,
#         truncation=True,
#         max_length=512,
#         padding="max_length"
#     )
# tokenized_dataset = dataset_n.map(preprocess_function, batched=True)
def preprocess_function(examples):
    # tokenize both the questions (input) and answers (target)
    model_inputs = tokenizer(
        examples["question"],
        text_target=examples["answer"],  # pass answers as the target text
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return model_inputs

tokenized_dataset = dataset_n.map(preprocess_function, batched=True)

Map: 100%|██████████| 22812/22812 [00:02<00:00, 8979.40 examples/s] 
Map: 100%|██████████| 2535/2535 [00:00<00:00, 3359.28 examples/s]


In [None]:
tokenized_dataset

In [53]:
small_train = tokenized_dataset["train"].shuffle(seed=42).select(range(5000))
small_eval = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))
# small_train = tokenized_dataset["train"]
# small_eval = tokenized_dataset["test"]

In [54]:
import numpy as np
import evaluate

metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # extract logits from the predictions tuple
    predictions = predictions[0]

    # ensure predictions are numpy arrays before processing
    # also, clamp to ensure token IDs are within valid range if model outputs unexpected values
    predictions = np.argmax(predictions, axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # replace -100 in labels with the tokenizer's pad_token_id for correct decoding
    # this also ensures labels are a numpy array of integers
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # format labels as a list of lists for rouge metric, which expects multiple references
    formatted_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=formatted_labels)
    return {k: round(v, 4) for k, v in result.items()}

In [55]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="ec2_price_prediction",
    eval_strategy="epoch",
    push_to_hub=True,
)

In [56]:
from transformers import AutoConfig, AutoModelForCausalLM,AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("google/t5-efficient-mini", num_labels=5)
model = model.to("cpu")

Loading weights: 100%|██████████| 92/92 [00:00<00:00, 1928.94it/s, Materializing param=shared.weight]                                                       


In [57]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [58]:
import os

# this line helps pytorch manage gpu memory more efficiently
# by allowing memory segments to expand, reducing fragmentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:true"

# now you can import torch and proceed with your model loading and training
import torch

# you might also want to confirm it's set
print(f"pytorch_cuda_alloc_conf is set to: {os.environ.get('PYTORCH_CUDA_ALLOC_CONF')}")
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


pytorch_cuda_alloc_conf is set to: expandable_segments:true


In [61]:
from transformers import Trainer,TrainingArguments
output_dir_model = "./ec2_qa_model"
# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir_model,
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=False,  # MPS doesn't support fp16, use default
    dataloader_num_workers=0,
    # no_cuda = True,
    logging_steps=100,
    save_total_limit=1,
    # gradient_checkpointing=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  super().__init__(loader)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model("./ec2_qa_final_model")
tokenizer.save_pretrained("./ec2_qa_final_model")

In [None]:
my_model = AutoModelForSeq2SeqLM.from_pretrained("ec2_qa_model/checkpoint-15")
my_tokenizer = AutoTokenizer.from_pretrained("ec2_qa_model/checkpoint-15")
model = model.to("cpu")
model.eval()

Loading weights: 100%|██████████| 92/92 [00:00<00:00, 1696.79it/s, Materializing param=shared.weight]                                                       


T5ForConditionalGeneration(
  (shared): Embedding(32128, 384)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 384)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=384, out_features=512, bias=False)
              (k): Linear(in_features=384, out_features=512, bias=False)
              (v): Linear(in_features=384, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=384, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=384, out_features=1536, bias=False)
              (wo): Linear(in_features=1536, out_features=384, bias=False)
              (dropout): Drop

In [None]:
def generate_text(prompt, max_length=100):
    inputs = my_tokenizer(prompt, return_tensors="pt").to("cpu")
    
    outputs = my_model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a sample prompt
prompt = "EC2 instance"
generated = generate_text(prompt)
print(generated)