In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import classification_report
from google.colab import drive


In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
#drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
#from google.colab import files
#uploaded = files.upload()  # Upload 'sofmattress_train.csv'
dataset = pd.read_csv('sofmattress_train.csv')   #Replace the file path with your file'spath


Saving sofmattress_train.csv to sofmattress_train.csv


In [None]:
train_data, temp_data = train_test_split(dataset, test_size=0.2, stratify=dataset['label'], random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)


In [None]:
label_map = {label: idx for idx, label in enumerate(dataset['label'].unique())}
train_data['label'] = train_data['label'].map(label_map)
val_data['label'] = val_data['label'].map(label_map)
test_data['label'] = test_data['label'].map(label_map)


In [None]:
class IntentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_dataset = IntentDataset(train_data['sentence'], train_data['label'], tokenizer)
val_dataset = IntentDataset(val_data['sentence'], val_data['label'], tokenizer)
test_dataset = IntentDataset(test_data['sentence'], test_data['label'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_map))


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 10  # 20 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
#Ran this cell 3 times , basically 3 iterations of 10 epochs
epochs = 10
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {loss.item()}")


Epoch 1/10 - Loss: 0.02803821675479412
Epoch 2/10 - Loss: 0.03133375570178032
Epoch 3/10 - Loss: 0.018457576632499695
Epoch 4/10 - Loss: 0.02144080400466919
Epoch 5/10 - Loss: 0.02545170485973358
Epoch 6/10 - Loss: 0.026313627138733864
Epoch 7/10 - Loss: 0.02386494167149067
Epoch 8/10 - Loss: 0.022651419043540955
Epoch 9/10 - Loss: 0.01689682900905609
Epoch 10/10 - Loss: 0.01685645431280136


In [None]:
def evaluate_model(loader, model):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

val_labels, val_preds = evaluate_model(val_loader, model)
print("Validation Results:")
print(classification_report(val_labels, val_preds, target_names=label_map.keys()))

test_labels, test_preds = evaluate_model(test_loader, model)
print("Test Results:")
print(classification_report(test_labels, test_preds, target_names=label_map.keys()))


Validation Results:
                       precision    recall  f1-score   support

                  EMI       1.00      1.00      1.00         3
                  COD       1.00      1.00      1.00         1
       ORTHO_FEATURES       1.00      1.00      1.00         2
        ERGO_FEATURES       0.00      0.00      0.00         1
           COMPARISON       1.00      1.00      1.00         1
             WARRANTY       1.00      1.00      1.00         1
100_NIGHT_TRIAL_OFFER       1.00      0.50      0.67         2
   SIZE_CUSTOMIZATION       0.33      1.00      0.50         1
   WHAT_SIZE_TO_ORDER       1.00      1.00      1.00         2
             LEAD_GEN       1.00      1.00      1.00         2
        CHECK_PINCODE       1.00      1.00      1.00         1
         DISTRIBUTORS       1.00      1.00      1.00         3
        MATTRESS_COST       1.00      1.00      1.00         2
     PRODUCT_VARIANTS       0.67      1.00      0.80         2
   ABOUT_SOF_MATTRESS       1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Results:
                       precision    recall  f1-score   support

                  EMI       1.00      1.00      1.00         2
                  COD       1.00      1.00      1.00         1
       ORTHO_FEATURES       1.00      1.00      1.00         2
        ERGO_FEATURES       1.00      1.00      1.00         1
           COMPARISON       1.00      1.00      1.00         1
             WARRANTY       1.00      1.00      1.00         1
100_NIGHT_TRIAL_OFFER       1.00      0.50      0.67         2
   SIZE_CUSTOMIZATION       1.00      1.00      1.00         1
   WHAT_SIZE_TO_ORDER       1.00      1.00      1.00         2
             LEAD_GEN       0.67      1.00      0.80         2
        CHECK_PINCODE       1.00      1.00      1.00         1
         DISTRIBUTORS       1.00      1.00      1.00         4
        MATTRESS_COST       1.00      1.00      1.00         3
     PRODUCT_VARIANTS       1.00      1.00      1.00         2
   ABOUT_SOF_MATTRESS       0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
