In [1]:
from datasets import load_dataset

# Download (or load from the local cache) the MultiWOZ 2.2 corpus and pick
# out the two splits used in this notebook.
dataset = load_dataset("multi_woz_v22")
train_data, val_data = dataset['train'], dataset['validation']

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def filter_and_preprocess(data):
    """
    Keep only the single-service dialogues about a restaurant or a hotel.

    Parameters:
    - data: iterable of dict-like entries, each exposing a "services" list.

    Returns:
    - List (in input order) of the entries whose "services" holds exactly one
      item and that item is either "restaurant" or "hotel".
    """
    allowed_services = {"restaurant", "hotel"}

    kept = []
    for entry in data:
        services = entry["services"]

        # Multi-domain (or empty) dialogues are dropped outright.
        if len(services) != 1:
            continue

        if allowed_services.issuperset(services):
            kept.append(entry)

    return kept


# Apply the single-service hotel/restaurant filter to both splits.
train_data_filtered = filter_and_preprocess(train_data)
val_data_filtered = filter_and_preprocess(val_data)

In [3]:
import pandas as pd

def toDataFrame(raw_data):
    """
    Flatten dialogues into one row per turn.

    Every utterance of a dialogue becomes its own row and is paired with that
    dialogue's ``services`` value, so all turns of one dialogue share the same
    label.

    Parameters:
    - raw_data: iterable of dialogues; each must provide
      ``dialogue['turns']['utterance']`` (a sequence of utterances) and
      ``dialogue['services']``.

    Returns:
    - (X, Y): two pandas Series named 'Utterance' and 'Service', aligned row
      by row. Both are empty (instead of raising KeyError) when ``raw_data``
      contains no turns.
    """
    # One record per turn, labelled with the dialogue-level services
    rows = [
        {'Utterance': utterance, 'Service': dialogue['services']}
        for dialogue in raw_data
        for utterance in dialogue['turns']['utterance']
    ]

    # Naming the columns explicitly keeps the lookups below valid even when
    # `rows` is empty; a frame built from [] alone would have no columns.
    df = pd.DataFrame(rows, columns=['Utterance', 'Service'])

    # Separate features and labels
    X = df['Utterance']
    Y = df['Service']

    return X, Y
        


In [4]:
# Flatten each filtered split into per-turn utterances (X) and the matching
# dialogue-level service labels (Y).
X_train, Y_train = toDataFrame(train_data_filtered)
X_val, Y_val = toDataFrame(val_data_filtered)

In [5]:
# Class balance of the training utterances (6928 + 11022 = 17950 rows):
#   - 6928 utterances are about hotels.
#   - 11022 utterances are about restaurants.

X_train

0        Guten Tag, I am staying overnight in Cambridge...
1        I have 4 different options for you. I have two...
2        No, but I'd really like to be on the south end...
3        Sure. Does price matter? We can narrow it down...
4        No I don't care about the price. Which one do ...
                               ...                        
17945    nandos serves portuguese food and in the cheap...
17946    I would like the address of Nandos restaurant,...
17947    Nandos is located in the south part of the cit...
17948                                  Thank you, goodbye.
17949                                  Thank you good bye.
Name: Utterance, Length: 17950, dtype: object

In [6]:
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertModel

In [7]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint,
# the same checkpoint name the classifier's encoder is loaded from.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [8]:
def pre_process_data(X, Y, max_length=128):
    """
    Tokenise utterances and turn their service labels into class ids.

    Uses the module-level ``tokenizer``.

    Parameters:
    - X: iterable of utterance strings.
    - Y: iterable aligned with X; each item is a sequence whose first element
      is the service name ('hotel' or 'restaurant').
    - max_length: every utterance is padded / truncated to this many tokens
      (default 128).

    Returns:
    - (input_ids, attention_masks, labels): three ``torch.long`` tensors.
      Labels are 0 for 'hotel' and 1 for 'restaurant'.

    Raises:
    - ValueError: if a label is neither 'hotel' nor 'restaurant'. Previously
      every non-hotel label was silently encoded as restaurant.
    """
    label_ids = {'hotel': 0, 'restaurant': 1}

    input_encoded = []
    attention_masks = []
    labels = []
    for x, y in zip(X, Y):
        service = y[0]
        if service not in label_ids:
            raise ValueError(
                f"Unsupported service label {service!r}; expected one of {sorted(label_ids)}")

        encoded = tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True)
        input_encoded.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(label_ids[service])

    # An explicit dtype keeps the tensors integer-typed even for empty input,
    # where torch.tensor([]) would otherwise default to float32.
    return (
        torch.tensor(input_encoded, dtype=torch.long),
        torch.tensor(attention_masks, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
    )

# Tokenise both splits into input-id, attention-mask and label tensors.
train_input, train_masks, train_labels = pre_process_data(X_train, Y_train)
val_input, val_masks, val_labels = pre_process_data(X_val, Y_val)

In [9]:
# Wrap the encoded tensors as datasets. They get their own names instead of
# rebinding `train_data` / `val_data`: those names hold the raw dataset splits,
# and overwriting them makes the filtering cell fail when it is re-run after
# this one.
train_dataset = TensorDataset(train_input, train_masks, train_labels)
val_dataset = TensorDataset(val_input, val_masks, val_labels)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, num_workers=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, num_workers=4)

In [10]:
class BertClassifier(nn.Module):
    """
    DistilBERT encoder followed by a single linear classification layer.

    Parameters:
    - model_name: name or path of the pretrained DistilBERT checkpoint to load
      (default 'distilbert-base-uncased').
    - num_labels: number of output classes (default 2).

    With the default arguments the module has the same layers and parameter
    names (``bert.*``, ``fc.*``) as before.
    """

    def __init__(self, model_name='distilbert-base-uncased', num_labels=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        # Take the encoder width from its config rather than hard-coding 768,
        # so the head still fits if a different checkpoint is loaded.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """
        Return the class logits computed from the first token's hidden state.

        Parameters:
        - input_ids: token-id tensor passed to the encoder.
        - attention_mask: mask tensor passed to the encoder.

        Returns:
        - Logits tensor produced by the linear head for each input sequence.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # selects the [CLS] token position.
        logits = self.fc(cls_output)
        return logits

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Show which device was selected.
print(device)

cuda


In [12]:
# Build the classifier, move it to the selected device and set up the
# optimizer and loss used for fine-tuning.
model = BertClassifier()
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Per-epoch average losses; one value is appended to each list per epoch
train_losses = []
val_losses = []

for epoch in range(3):
    model.train()

    # Initialize tqdm progress bar
    train_bar = tqdm(train_loader, desc=f'Training Epoch {epoch}')

    train_loss = 0.0
    for batch in train_bar:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update training loss
        batch_loss = loss.item()
        train_loss += batch_loss
        # Show the batch loss as-is: the criterion already averages over the
        # examples in the batch, and `batch` is the list of three tensors, so
        # dividing by len(batch) only scaled the displayed value by 1/3.
        train_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Average training loss over the batches of this epoch
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    val_loss = 0.0
    model.eval()

    # Initialize tqdm progress bar for validation
    val_bar = tqdm(val_loader, desc=f'Validation Epoch {epoch}')

    with torch.no_grad():
        for batch in val_bar:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Forward pass
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            batch_loss = loss.item()
            val_loss += batch_loss
            # Same as above: report the unscaled mean loss of the batch.
            val_bar.set_postfix({'validation_loss': '{:.3f}'.format(batch_loss)})

    # Average validation loss over the batches of this epoch
    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')


Training Epoch 0: 100%|██████████| 1122/1122 [13:21<00:00,  1.40it/s, training_loss=0.052]
Validation Epoch 0: 100%|██████████| 71/71 [00:28<00:00,  2.53it/s, validation_loss=0.055]


Epoch 0, Training Loss: 0.2594136831623713, Validation Loss: 0.26249390499482694


Training Epoch 1: 100%|██████████| 1122/1122 [17:50<00:00,  1.05it/s, training_loss=0.064]
Validation Epoch 1: 100%|██████████| 71/71 [00:27<00:00,  2.58it/s, validation_loss=0.129]


Epoch 1, Training Loss: 0.1993416667087485, Validation Loss: 0.28058904231014387


Training Epoch 2: 100%|██████████| 1122/1122 [17:49<00:00,  1.05it/s, training_loss=0.085]
Validation Epoch 2: 100%|██████████| 71/71 [00:27<00:00,  2.54it/s, validation_loss=0.060]

Epoch 2, Training Loss: 0.17834211940837297, Validation Loss: 0.27007889954871694





In [13]:
from sklearn.metrics import accuracy_score

# Accumulators for the predicted and the true class of every validation example
all_preds, all_labels = [], []

# Switch the model to evaluation mode before scoring
model.eval()

# Progress bar over the validation batches
val_bar = tqdm(val_loader, desc='Validation')

# Gradients are not needed while scoring
with torch.no_grad():
    for input_ids, attention_mask, labels in val_bar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        logits = model(input_ids, attention_mask)

        # Predicted class = index of the largest logit in each row,
        # brought back to the CPU as a NumPy array
        preds = logits.argmax(dim=1).cpu().numpy()
        labels = labels.cpu().numpy()

        # Accumulate this batch's predictions and labels
        all_preds.extend(preds)
        all_labels.extend(labels)

# Fraction of validation examples whose predicted class matches the label
accuracy = accuracy_score(all_labels, all_preds)
print(f'Validation Accuracy: {accuracy}')




Validation: 100%|██████████| 71/71 [00:27<00:00,  2.58it/s]

Validation Accuracy: 0.8528368794326241





In [14]:
# Persist only the model's state_dict (its parameters and buffers) to disk.
# NOTE(review): the file name says "4_epochs", but the training loop in this
# notebook iterates over range(3) — confirm which one is intended.
torch.save(model.state_dict(), "topic_classifier_state_dict_4_epochs.pth")

In [18]:
def predict(model, sentence, tokenizer, max_length=128):
    """
    Classify a single sentence as "hotel" or "restaurant".

    Parameters:
    - model: classifier called as ``model(input_ids, attention_mask)`` and
      returning one row of two logits per input sequence.
    - sentence: the text to classify.
    - tokenizer: tokenizer exposing ``encode_plus``.
    - max_length: the sentence is padded / truncated to this many tokens.
      Defaults to 128, the length the training data was encoded with.

    Returns:
    - "hotel" if class 0 has the largest logit, otherwise "restaurant".

    Side effects:
    - Puts ``model`` into evaluation mode.
    """
    model.eval()

    encoded = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True)

    # Run on whatever device the model itself lives on instead of relying on
    # a global `device`, so a model loaded elsewhere still gets matching inputs.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Leading list wraps the single sentence into a batch of one
    input_ids = torch.tensor([encoded['input_ids']], dtype=torch.long).to(target_device)
    attention_mask = torch.tensor([encoded['attention_mask']], dtype=torch.long).to(target_device)

    # Make a prediction
    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    # Decode the prediction to label
    _, preds = torch.max(logits, dim=1)
    label = "hotel" if preds.item() == 0 else "restaurant"

    return label


# Try the trained classifier on an utterance that names neither domain.
sentence = "I would like to reserve for October 12th at 8:00."
predicted_label = predict(model, sentence, tokenizer)
print("Predicted label:", predicted_label)


Predicted label: restaurant
