In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset


In [4]:
# Synthetic IoT commands and their intents
commands = [
    "turn on the living room fan",
     "turn on the bedroom light",
    "turn off the kitchen fan",
    "turn off the bedroom light",
    "set thermostat to 22 degrees",
    "increase humidity by 5 percent",
    "decrease brightness in the hallway",
    "check the bedroom temperature",
    "what is the current humidity level?"
]

intents = [
    "TURN_ON_DEVICE",
    "TURN_ON_DEVICE",
    "TURN_OFF_DEVICE",
    "TURN_OFF_DEVICE",
    "ADJUST_PARAMETER",
    "ADJUST_PARAMETER",
    "ADJUST_PARAMETER",
    "QUERY_STATUS",
    "QUERY_STATUS"
]

df = pd.DataFrame({'command': commands, 'intent': intents})
print(df)


                               command            intent
0          turn on the living room fan    TURN_ON_DEVICE
1            turn on the bedroom light    TURN_ON_DEVICE
2             turn off the kitchen fan   TURN_OFF_DEVICE
3           turn off the bedroom light   TURN_OFF_DEVICE
4         set thermostat to 22 degrees  ADJUST_PARAMETER
5       increase humidity by 5 percent  ADJUST_PARAMETER
6   decrease brightness in the hallway  ADJUST_PARAMETER
7        check the bedroom temperature      QUERY_STATUS
8  what is the current humidity level?      QUERY_STATUS


In [5]:
label_to_id = {label: i for i, label in enumerate(df['intent'].unique())}
id_to_label = {v: k for k, v in label_to_id.items()}
df['label'] = df['intent'].map(label_to_id)
print("Label mapping:", label_to_id)
print(df)


Label mapping: {'TURN_ON_DEVICE': 0, 'TURN_OFF_DEVICE': 1, 'ADJUST_PARAMETER': 2, 'QUERY_STATUS': 3}
                               command            intent  label
0          turn on the living room fan    TURN_ON_DEVICE      0
1            turn on the bedroom light    TURN_ON_DEVICE      0
2             turn off the kitchen fan   TURN_OFF_DEVICE      1
3           turn off the bedroom light   TURN_OFF_DEVICE      1
4         set thermostat to 22 degrees  ADJUST_PARAMETER      2
5       increase humidity by 5 percent  ADJUST_PARAMETER      2
6   decrease brightness in the hallway  ADJUST_PARAMETER      2
7        check the bedroom temperature      QUERY_STATUS      3
8  what is the current humidity level?      QUERY_STATUS      3


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Training data:")
print(train_df)
print("Test data:")
print(test_df)


Training data:
                               command            intent  label
5       increase humidity by 5 percent  ADJUST_PARAMETER      2
0          turn on the living room fan    TURN_ON_DEVICE      0
8  what is the current humidity level?      QUERY_STATUS      3
2             turn off the kitchen fan   TURN_OFF_DEVICE      1
4         set thermostat to 22 degrees  ADJUST_PARAMETER      2
3           turn off the bedroom light   TURN_OFF_DEVICE      1
6   decrease brightness in the hallway  ADJUST_PARAMETER      2
Test data:
                         command          intent  label
7  check the bedroom temperature    QUERY_STATUS      3
1      turn on the bedroom light  TURN_ON_DEVICE      0


In [7]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Keep only necessary columns for the model
train_dataset = train_dataset.remove_columns(["intent", "__index_level_0__"])
test_dataset = test_dataset.remove_columns(["intent", "__index_level_0__"])


In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    return tokenizer(example["command"], truncation=True, padding='max_length', max_length=32)

train_encoded = train_dataset.map(tokenize_function, batched=True)
test_encoded = test_dataset.map(tokenize_function, batched=True)

train_encoded = train_encoded.remove_columns(["command"])
test_encoded = test_encoded.remove_columns(["command"])

train_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=len(label_to_id))


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="steps",
    num_train_epochs=20,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=10,
    load_best_model_at_end=True,
    save_total_limit=1,
    save_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=test_encoded
)




In [11]:
trainer.train()
trainer.evaluate()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkapooramita[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
10,1.1468,1.650992
20,0.6059,1.992197
30,0.314,1.704657
40,0.2308,1.639256


{'eval_loss': 1.6392557621002197,
 'eval_runtime': 0.2583,
 'eval_samples_per_second': 7.742,
 'eval_steps_per_second': 3.871,
 'epoch': 20.0}

In [14]:
test_command = "turn on the fan"
inputs = tokenizer(test_command, return_tensors="pt", truncation=True, padding='max_length', max_length=32)
# Remove token_type_ids from inputs
inputs = {key: value for key, value in inputs.items() if key != "token_type_ids"}
with torch.no_grad():
    outputs = model(**inputs)
    predicted_label_id = outputs.logits.argmax(dim=-1).item()

predicted_intent = id_to_label[predicted_label_id]
print(f"Command: {test_command}")
print(f"Predicted Intent: {predicted_intent}")


Command: turn on the fan
Predicted Intent: TURN_ON_DEVICE
