In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
import pandas as pd

In [5]:
seq_in_path = "/content/seq.in"
seq_out_path = "/content/seq.out"
labels_path = "/content/label.txt"


In [6]:
seq_in_data = pd.read_csv(seq_in_path, header=None, names=['seq_in'])
seq_out_data = pd.read_csv(seq_out_path, header=None, names=['seq_out'])
labels_data = pd.read_csv(labels_path, header=None, names=['labels'])

In [7]:

combined_data = pd.concat([seq_in_data, seq_out_data, labels_data], axis=1)


combined_data['combined'] = combined_data.apply(
    lambda row: ' '.join(f"{token}_{label}" for token, label in zip(row['seq_in'].split(), row['seq_out'].split())), axis=1
)

In [8]:
df = combined_data[['combined', 'labels']]

In [9]:
df

Unnamed: 0,combined,labels
0,listen_O to_O westbam_B-artist alumb_O allergi...,PlayMusic
1,add_O step_B-entity_name to_I-entity_name me_I...,AddToPlaylist
2,i_O give_O this_O current_B-object_select text...,RateBook
3,play_O the_O song_B-music_item little_B-track ...,PlayMusic
4,please_O add_O iris_B-artist dement_I-artist t...,AddToPlaylist
...,...,...
13079,i_O want_O to_O eat_O choucroute_B-served_dish...,BookRestaurant
13080,play_O funky_B-playlist heavy_I-playlist blues...,PlayMusic
13081,rate_O the_O current_B-object_select album_B-o...,RateBook
13082,go_O to_O the_O photograph_B-object_type the_B...,SearchCreativeWork


In [10]:
df.groupby('labels').describe()

Unnamed: 0_level_0,combined,combined,combined,combined
Unnamed: 0_level_1,count,unique,top,freq
labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AddToPlaylist,1818,1812,add_O this_O tune_B-music_item to_O my_B-playl...,2
BookRestaurant,1881,1876,book_O a_O spot_O for_O six_B-party_size_numbe...,2
GetWeather,1896,1885,what_O is_O the_O weather_O forecast_O for_O m...,3
PlayMusic,1914,1885,play_O spotify_B-service,3
RateBook,1876,1822,rate_O this_B-object_select essay_B-object_typ...,4
SearchCreativeWork,1847,1844,find_O the_B-object_name international_I-objec...,2
SearchScreeningEvent,1852,1736,find_O movie_B-object_type times_I-object_type,13


In [11]:
df_no_duplicates = df.drop_duplicates(subset='combined', keep='first')

In [12]:
df_no_duplicates.groupby('labels').describe()

Unnamed: 0_level_0,combined,combined,combined,combined
Unnamed: 0_level_1,count,unique,top,freq
labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AddToPlaylist,1812,1812,add_O gabrial_B-artist mcnair_I-artist to_O my...,1
BookRestaurant,1876,1876,i_O want_O to_O eat_O choucroute_B-served_dish...,1
GetWeather,1885,1885,tell_O me_O when_O it_O ll_O be_O chillier_B-c...,1
PlayMusic,1885,1885,play_O funky_B-playlist heavy_I-playlist blues...,1
RateBook,1822,1822,rate_O richard_B-object_name carvel_I-object_n...,1
SearchCreativeWork,1844,1844,go_O to_O the_O photograph_B-object_type the_B...,1
SearchScreeningEvent,1736,1736,can_O i_O get_O the_O movie_B-object_type sche...,1


In [13]:
df_SearchScreeningEvent = df_no_duplicates[df_no_duplicates['labels']=='SearchScreeningEvent']
df_SearchScreeningEvent.shape

(1736, 2)

In [14]:
df_AddToPlaylist = df_no_duplicates[df_no_duplicates['labels']=='AddToPlaylist']
df_RateBook = df_no_duplicates[df_no_duplicates['labels']=='RateBook']
df_SearchCreativeWork = df_no_duplicates[df_no_duplicates['labels']=='SearchCreativeWork']
df_BookRestaurant = df_no_duplicates[df_no_duplicates['labels']=='BookRestaurant']
df_GetWeather = df_no_duplicates[df_no_duplicates['labels']=='GetWeather']
df_PlayMusic = df_no_duplicates[df_no_duplicates['labels']=='PlayMusic']

In [15]:
df_AddToPlaylist_downsize = df_AddToPlaylist.sample(df_SearchScreeningEvent.shape[0])
df_RateBook_downsize = df_RateBook.sample(df_SearchScreeningEvent.shape[0])
df_SearchCreativeWork_downsize = df_SearchCreativeWork.sample(df_SearchScreeningEvent.shape[0])
df_BookRestaurant_downsize = df_BookRestaurant.sample(df_SearchScreeningEvent.shape[0])
df_GetWeather_downsize = df_GetWeather.sample(df_SearchScreeningEvent.shape[0])
df_PlayMusic_downsize = df_PlayMusic.sample(df_SearchScreeningEvent.shape[0])

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
df_balanced = pd.concat([df_SearchScreeningEvent, df_AddToPlaylist_downsize, df_RateBook_downsize, df_SearchCreativeWork_downsize, df_BookRestaurant_downsize, df_GetWeather_downsize, df_PlayMusic_downsize])
df_balanced['labels'].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
SearchScreeningEvent,1736
AddToPlaylist,1736
RateBook,1736
SearchCreativeWork,1736
BookRestaurant,1736
GetWeather,1736
PlayMusic,1736


In [19]:
train_data, val_data = train_test_split(df_balanced, test_size=0.2, random_state=42)

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [21]:
def tokenize_batch(batch):
    return tokenizer(batch['combined'].tolist(), padding=True, truncation=True, return_tensors='pt')

In [22]:
train_encodings = tokenize_batch(train_data)
val_encodings = tokenize_batch(val_data)

In [23]:
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['labels'])
val_labels = label_encoder.transform(val_data['labels'])

In [24]:

class IntentSlotDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IntentSlotDataset(train_encodings, train_labels)
val_dataset = IntentSlotDataset(val_encodings, val_labels)

In [25]:
from transformers import BertForSequenceClassification

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.dropout = torch.nn.Dropout(0.1)

num_labels = len(label_encoder.classes_)
model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 2

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {average_loss}, Accuracy: {accuracy}')


model.eval()
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())


predicted_labels = label_encoder.inverse_transform(all_preds)
true_labels = label_encoder.inverse_transform(all_labels)


accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')
print(classification_report(true_labels, predicted_labels))

Epoch 1/2, Loss: 0.08994914036233431, Accuracy: 0.15590292060880295
Epoch 2/2, Loss: 0.010481519951434549, Accuracy: 0.15590292060880295
Accuracy: 0.9983545865898807
                      precision    recall  f1-score   support

       AddToPlaylist       1.00      0.99      1.00       349
      BookRestaurant       0.99      1.00      1.00       357
          GetWeather       1.00      0.99      1.00       327
           PlayMusic       0.99      1.00      1.00       343
            RateBook       1.00      1.00      1.00       358
  SearchCreativeWork       1.00      1.00      1.00       336
SearchScreeningEvent       1.00      1.00      1.00       361

            accuracy                           1.00      2431
           macro avg       1.00      1.00      1.00      2431
        weighted avg       1.00      1.00      1.00      2431

