In [12]:
import pandas as pd

dataset = pd.read_json(r"D:\Python\ML\ML-Projects\Vox Vission\datasets\intent.json")
print(f"Shape of the dataset: {dataset.shape}")
dataset.head()

Shape of the dataset: (100, 2)


Unnamed: 0,object_detection,non_object_detection
0,Describe what object I am holding in front of me.,Where is Nepal located?
1,Can you identify the object in my hand?,Tell me about Nepal.
2,What objects are in front of me?,What is the capital of Nepal?
3,What do you see in front of me?,Where is Kathmandu?
4,Describe the object I am holding.,Can you chat with me?


In [13]:
#Flatten the queries into a list of text and corresponding labels
data = []
for label, questions in dataset.items():
    for question in questions:
        data.append({"Text": question, "Label": label})

In [14]:
df = pd.DataFrame(data)
print(df.head())

                                                Text             Label
0  Describe what object I am holding in front of me.  object_detection
1            Can you identify the object in my hand?  object_detection
2                   What objects are in front of me?  object_detection
3                    What do you see in front of me?  object_detection
4                  Describe the object I am holding.  object_detection


In [15]:
import re

#Normalize text by removing punctuation and extra spaces
def normalize_text(text):
    return re.sub(r'[^\w\s]', '', text).strip().lower()

In [16]:
df["Text"] = df["Text"].apply(normalize_text)

print(f"Shape of the dataset: {df.shape}")
df.head()

Shape of the dataset: (200, 2)


Unnamed: 0,Text,Label
0,describe what object i am holding in front of me,object_detection
1,can you identify the object in my hand,object_detection
2,what objects are in front of me,object_detection
3,what do you see in front of me,object_detection
4,describe the object i am holding,object_detection


In [17]:
#Define a mapping for labels to numbers
label_mapping = {
    "object_detection": 0,
    "non_object_detection": 1,
}

#Apply the label mapping to convert labels to numbers
df['Label'] = df['Label'].map(label_mapping)
df['Label'] = df['Label'].fillna(0)
df.head()

Unnamed: 0,Text,Label
0,describe what object i am holding in front of me,0
1,can you identify the object in my hand,0
2,what objects are in front of me,0
3,what do you see in front of me,0
4,describe the object i am holding,0


In [18]:
from transformers import BertTokenizer, AutoModelForSequenceClassification
from torch import torch

#Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

num_labels = len(df['Label'].unique())  #Number of unique labels in your dataset
print(num_labels)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels) 

2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

#Tokenizing the dataset
def preprocess_function(examples):
    return tokenizer(examples["Text"], padding=True, truncation=True, max_length=128) 

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [20]:
from transformers import  AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset

#Creating PyTorch Dataset
class CustomDataset(TorchDataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.input_ids = dataset["input_ids"]
        self.attention_mask = dataset["attention_mask"]
        self.labels = dataset["Label"]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_dataset = CustomDataset(tokenized_dataset)

#Set up the DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

#Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

#Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
epochs = 5
for epoch in range(epochs):
    for batch in train_dataloader:
        
        batch = {key: value.to(device) for key, value in batch.items()}
        optimizer.zero_grad()

        #Forward pass
        outputs = model(**batch)
        
        #Explicit casting: Ensure logits are float and labels are long
        logits = outputs.logits.float()
        labels = batch['labels'].long() 

        #Compute the loss
        loss = torch.nn.functional.cross_entropy(logits.view(-1, model.config.num_labels), labels.view(-1))

        #Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")



Epoch 1, Loss: 0.10497608780860901
Epoch 2, Loss: 0.3159833252429962
Epoch 3, Loss: 0.011644667014479637
Epoch 4, Loss: 0.00385460932739079
Epoch 5, Loss: 0.0022353483363986015


In [21]:
#Saving the trained model
model.save_pretrained("./intent_classifier_model")
tokenizer.save_pretrained("./intent_classifier_model")

('./intent_classifier_model\\tokenizer_config.json',
 './intent_classifier_model\\special_tokens_map.json',
 './intent_classifier_model\\vocab.txt',
 './intent_classifier_model\\added_tokens.json')