In [None]:
# specifying the package versions to ensure reproducibility 

requirements = [
    'torch',
    'transformers==4.10.2',
    'tokenizers==0.10.3',
    'numpy==1.21.2',
    'pandas==1.3.3',
    'tensorflow==2.6.0'
]

with open('requirements.txt', 'w') as file:
    file.write('\n'.join(requirements))

!pip install -r requirements.txt


In [3]:
# importing the necessary libraries

import pandas as pd
import numpy as np
import torch
import random

In [4]:
!pip install --upgrade torch torchvision torchaudio



In [5]:
import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [6]:
# setting seed throughout the notebook to make the codes reproducible.

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.set_num_threads(1)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [8]:
# installing the required libraries


IS_ON_COLAB = bool(os.getenv("COLAB_RELEASE_TAG"))

if IS_ON_COLAB :
  !pip install transformers==4.28.0
  !pip install tokenizers datasets sentencepiece huggingface_hub[cli] accelerate

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 tokenizers-0.13.3 transformers-4.28.0
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [9

In [9]:
from transformers import pipeline
from datasets import load_dataset

In [10]:
# I am working with the nlu_evaluation_data dataset from HuggingFace, which is a dataset containing short utterences from conversational domain annotated with their corrsponding scenarios and intents. It can basically be thought as the commands given to a virtual assistant technology

Loading the dataset

In [11]:

nlu = load_dataset('nlu_evaluation_data', name = 'split' )

Downloading builder script:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
# getting the idea of the dataset

nlu

DatasetDict({
    train: Dataset({
        features: ['text', 'scenario', 'label'],
        num_rows: 25715
    })
})

In [13]:
# to split the data into train, cross-validation and test set

from sklearn.model_selection import train_test_split

nlu_train = nlu['train'].train_test_split(test_size = 0.1, seed = 42)
nlu_cv = nlu_train['test'].train_test_split(test_size = 0.5, seed = 42)

In [14]:
# creating the train, cross-evaluation and test datasets

nlu_train_set = nlu_train['train']
nlu_cv_set = nlu_cv['train']
nlu_test_set = nlu_cv['test']

In [15]:
nlu_train_set

Dataset({
    features: ['text', 'scenario', 'label'],
    num_rows: 23143
})

In [16]:
nlu_cv_set

Dataset({
    features: ['text', 'scenario', 'label'],
    num_rows: 1286
})

In [17]:

nlu_test_set

Dataset({
    features: ['text', 'scenario', 'label'],
    num_rows: 1286
})

Feature Extraction using TF-IDF vectorizer

In [18]:
# feature extraction to solve the task of text classification by different sklearn models.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(nlu_train_set["text"])
X_cv_tfidf = vectorizer.transform(nlu_cv_set["text"])
X_test_tfidf = vectorizer.transform(nlu_test_set['text'])

y_train = nlu_train_set["label"]
y_CV = nlu_cv_set["label"]
y_test = nlu_test_set['label']

Logistic Regression

```

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score ,  make_scorer
from sklearn.linear_model import LogisticRegression


# Define the hyperparameters and their possible values
param_grid = {

    'C': [  0.1, 1,5 ],
    'max_iter': [500, 1000],
    'penalty': ['l1', 'l2',],
    'solver' : ['liblinear', 'lbfgs'],
    'fit_intercept' : [True, False],
    'random_state' : [42]


}

logistic_regression_model = LogisticRegression()


scorer = make_scorer(accuracy_score)
grid_search = GridSearchCV(logistic_regression_model, param_grid, cv=5, scoring = scorer)

grid_search.fit(X_train_tfidf, y_train)

print("Best Hyperparameters:", grid_search.best_params_)


In [19]:
# choosing the parameters that I got after running the GridSearch

logistic_regression_params = {'C': 5,
                              'fit_intercept': True,
                              'max_iter': 500,
                              'penalty': 'l2',
                              'random_state': 42,
                              'solver': 'lbfgs'}

In [20]:
# training my model on the hyperparameters that I got from from the GridSearch

from sklearn.linear_model import LogisticRegression

logistic_regression_model = LogisticRegression(**logistic_regression_params)
logistic_regression_model.fit(X_train_tfidf, y_train)



In [21]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

dataset_names = ['cv', 'test']
datasets = [X_cv_tfidf, X_test_tfidf]

for dataset_name, dataset in zip(dataset_names, datasets):


    y_pred = logistic_regression_model.predict(dataset)
    data_set_name = globals()[f'nlu_{dataset_name}_set']

    accuracy = accuracy_score(data_set_name['label'], y_pred)
    f1score = f1_score(data_set_name['label'], y_pred, average='weighted', zero_division = 1)


    print(f'Dataset: {dataset_name}')
    print(f'Accuracy of the {dataset_name} set for Logistic Regression: {accuracy}')
    print(f'F1 Score of the {dataset_name} set for Logistic Regression: {f1score}')


Dataset: cv
Accuracy of the cv set: 0.8343701399688958
F1 Score of the cv set: 0.8376127331852897
Dataset: test
Accuracy of the test set: 0.859253499222395
F1 Score of the test set: 0.860147877986756


In [22]:
# to check how the logistic regression model performed on custom text

custom_text = ['why do I not feel well today']

custom_text_tfidf = vectorizer.transform(custom_text)


In [23]:

intents = [
    "alarm_query",
    "alarm_remove",
    "alarm_set",
    "audio_volume_down",
    "audio_volume_mute",
    "audio_volume_other",
    "audio_volume_up",
    "calendar_query",
    "calendar_remove",
    "calendar_set",
    "cooking_query",
    "cooking_recipe",
    "datetime_convert",
    "datetime_query",
    "email_addcontact",
    "email_query",
    "email_querycontact",
    "email_sendemail",
    "general_affirm",
    "general_commandstop",
    "general_confirm",
    "general_dontcare",
    "general_explain",
    "general_greet",
    "general_joke",
    "general_negate",
    "general_praise",
    "general_quirky",
    "general_repeat",
    "iot_cleaning",
    "iot_coffee",
    "iot_hue_lightchange",
    "iot_hue_lightdim",
    "iot_hue_lightoff",
    "iot_hue_lighton",
    "iot_hue_lightup",
    "iot_wemo_off",
    "iot_wemo_on",
    "lists_createoradd",
    "lists_query",
    "lists_remove",
    "music_dislikeness",
    "music_likeness",
    "music_query",
    "music_settings",
    "news_query",
    "play_audiobook",
    "play_game",
    "play_music",
    "play_podcasts",
    "play_radio",
    "qa_currency",
    "qa_definition",
    "qa_factoid",
    "qa_maths",
    "qa_stock",
    "recommendation_events",
    "recommendation_locations",
    "recommendation_movies",
    "social_post",
    "social_query",
    "takeaway_order",
    "takeaway_query",
    "transport_query",
    "transport_taxi",
    "transport_ticket",
    "transport_traffic",
    "weather_query"]

In [24]:
pred = logistic_regression_model.predict(custom_text_tfidf)
predicted_label = [intents[i] for i in pred]
print(f'The predicted label of the custom text for Logistic Regression  is {predicted_label}')

The predicted label of the custom text is ['general_quirky']


Decision Tree model

In [25]:
# to find out the best parameters for my Decision Tree model by doing a GridSearch

```
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their possible values
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 5],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt', 'log2',  None],
    'random_state': [42],
}
decision_tree_model = DecisionTreeClassifier()

grid_search = GridSearchCV(decision_tree_model, param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train)

print("Best Hyperparameters:", grid_search.best_params_)


```

In [26]:
# choosing the parameters that I got after running the GridSearch

decision_tree_params = {'criterion': 'gini',
                        'max_depth': None,
                        'max_features': None,
                        'min_samples_leaf': 1,
                        'min_samples_split': 2,
                        'random_state': 42,
                        'splitter': 'random',
                        }

In [27]:
# to fit the decision tree model on my vectorized training data

from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(**decision_tree_params)
decision_tree_model.fit(X_train_tfidf, y_train)

In [28]:
# to evaluate the model performance on validation and test sets

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

dataset_names = ['cv', 'test']
datasets = [X_cv_tfidf, X_test_tfidf]

for dataset_name, dataset in zip(dataset_names, datasets):


    y_pred = decision_tree_model.predict(dataset)
    data_set_name = globals()[f'nlu_{dataset_name}_set']

    accuracy = accuracy_score(data_set_name['label'], y_pred)
    f1score = f1_score(data_set_name['label'], y_pred, average='weighted', zero_division = 1)


    print(f'Dataset: {dataset_name}')
    print(f'Accuracy of the {dataset_name} set for the decision tree model: {accuracy}')
    print(f'F1 Score of the {dataset_name} set for the decision tree model: {f1score}')



Dataset: cv
Accuracy of the cv set for the decision tree model: 0.744945567651633
F1 Score of the cv set for the decision tree model: 0.7466709120039313
Dataset: test
Accuracy of the test set for the decision tree model: 0.755054432348367
F1 Score of the test set for the decision tree model: 0.7579808794551058


In [29]:
# to check how the model performed on custom text

custom_text = ['why do I not feel well today']

custom_text_tfidf = vectorizer.transform(custom_text)

In [30]:
pred = decision_tree_model.predict(custom_text_tfidf)
predicted_label = [intents[i] for i in pred]
print(f'The predicted label of the custom text of the decision tree model is {predicted_label}')

The predicted label of the custom text of the decision tree model is ['alarm_remove']


Random Forest model


In [31]:
# to find out the best parameters for my Random Forest model by doing a GridSearch

```

from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2,5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt'],
    'random_state' : [42]
}
random_forest_model = RandomForestClassifier()

grid_search = GridSearchCV(random_forest_model, param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train)

print("Best Hyperparameters:", grid_search.best_params_)


In [32]:
 # choosing the parameters that I got after running the GridSearch

 random_forest_params = {'max_depth': None,
                         'max_features': 'auto',
                         'min_samples_leaf': 1,
                         'min_samples_split': 5,
                         'n_estimators': 200,
                         'random_state': 42,
                         }

In [33]:
from sklearn.ensemble import RandomForestClassifier


random_forest_model = RandomForestClassifier(**random_forest_params)
random_forest_model.fit(X_train_tfidf, y_train)

  warn(


In [34]:
from sklearn.metrics import accuracy_score, f1_score

dataset_names = ['cv', 'test']
datasets = [X_cv_tfidf, X_test_tfidf]

for dataset_name, dataset in zip(dataset_names, datasets):


    y_pred = random_forest_model.predict(dataset)
    data_set_name = globals()[f'nlu_{dataset_name}_set']

    accuracy = accuracy_score(data_set_name['label'], y_pred)
    f1score = f1_score(data_set_name['label'], y_pred, average='weighted', zero_division = 1)


    print(f'Dataset: {dataset_name}')
    print(f'Accuracy of the {dataset_name} set for the random forest model: {accuracy}')
    print(f'F1 Score of the {dataset_name} set for the random forest model: {f1score}')



Dataset: cv
Accuracy of the cv set for the random forest model: 0.80248833592535
F1 Score of the cv set for the random forest model: 0.8012025958478614
Dataset: test
Accuracy of the test set for the random forest model: 0.8211508553654744
F1 Score of the test set for the random forest model: 0.8206672262674121


In [35]:
# to check how the model performed on custom text

custom_text = ['why do I not feel well today']

custom_text_tfidf = vectorizer.transform(custom_text)


In [36]:
pred = random_forest_model.predict(custom_text_tfidf)
predicted_label = [intents[i] for i in pred]
print(f'The predicted label of the custom text of the random forest model is {predicted_label}')

The predicted label of the custom text of the random forest model is ['general_negate']


Gradient Boost model

In [37]:
# performing the Grid Search for hyperparameters for the Gradient Boost Model in the markdown

```

import xgboost as xgb

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4],
    'min_child_weight': [1, 2],
    'subsample': [ 0.9, 1.0],
    random_state = 42
    
}

xgb_model = xgb.XGBClassifier()

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train)


print("Best hyperparameters:", grid_search.best_params_)



In [38]:
# choosing the parameters that I got after running the GridSearch

xgboost_params =  {'n_estimators': 200,
                   'learning_rate': 0.1,
                   'max_depth': 4,
                   'min_child_weight': 1,
                   'subsample': 0.9,
                   'random_state': 42}

In [39]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(**xgboost_params)
xgb_model.fit(X_train_tfidf, y_train)

In [40]:
# to check the performance of the Gradient Boost model on the cv and test sets

dataset_names = ['cv', 'test']
datasets = [X_cv_tfidf, X_test_tfidf]

for dataset_name, dataset in zip(dataset_names, datasets):


    y_pred = xgb_model.predict(dataset)
    data_set_name = globals()[f'nlu_{dataset_name}_set']

    accuracy = accuracy_score(data_set_name['label'], y_pred)
    f1score = f1_score(data_set_name['label'], y_pred, average='weighted')


    print(f'Dataset: {dataset_name}')
    print(f'Accuracy of the {dataset_name} set for the Gradient Boost model: {accuracy}')
    print(f'F1 Score of the {dataset_name} set for the Gradient Boost model: {f1score}')

Dataset: cv
Accuracy of the cv set for the Gradient Boost model: 0.8118195956454122
F1 Score of the cv set for the Gradient Boost model: 0.8142044261641399
Dataset: test
Accuracy of the test set for the Gradient Boost model: 0.8351477449455676
F1 Score of the test set for the Gradient Boost model: 0.83924978189685


In [41]:
# to check the model on custom text

In [42]:
custom_text = ['why do I not feel well today']

custom_text_tfidf = vectorizer.transform(custom_text)

In [43]:
pred = xgb_model.predict(custom_text_tfidf)
predicted_label = [intents[i] for i in pred]
print(f'The predicted label of the custom textof the gradient boost model is {predicted_label}')

The predicted label of the custom textof the gradient boost model is ['general_negate']


BERT model

In [44]:
# Pre-processing of the data to be used.

# the train data is nlu_train_set
#the cross-validation data is nlu_cv_set
#the test data is nlu_test_set


from datasets import DatasetDict


NLU_final_data = DatasetDict({'train' : nlu_train_set,
                              'test' : nlu_test_set,
                              'cv' : nlu_cv_set})

In [45]:
y_train = nlu_train_set["label"]
y_CV = nlu_cv_set["label"]
y_test = nlu_test_set['label']

In [46]:
# doing the necessary imports

from transformers import logging
from sklearn.metrics import classification_report
from time import time
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW
import scipy
import torch


In [47]:
# importing the metrics for evaluation

from sklearn.metrics import f1_score, accuracy_score


def compute_metrics(pred):
    logits, labels = pred
    preds = logits.argmax(axis=-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [48]:
# encoding the train, cv and test sets to be used to train the BERT model

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


NLU_final_data_encoded = NLU_final_data.map(tokenize, batched=True, batch_size=None)
NLU_final_data_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
NLU_final_data_encoded.set_format("torch")
NLU_final_data_encoded

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/23143 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'scenario', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23143
    })
    test: Dataset({
        features: ['text', 'scenario', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1286
    })
    cv: Dataset({
        features: ['text', 'scenario', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1286
    })
})

In [49]:
# to do the computation in GPU if it is available otherwise CPU

from transformers import AutoModelForSequenceClassification

device = 'cuda' if torch.cuda.is_available() else 'cpu'

num_labels = 68
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=68
).to(device )



Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [50]:
# setting up the hyperparameter search to get the best performing model

from transformers import TrainingArguments
from transformers import Trainer

best_accuracy = 0   #initializing the best accuracy
best_hyperparameters = {}

learning_rates = [2e-5, 3e-5]
batch_sizes = [64, 128]
num_epochs = [3,4]

# looping through all combinations of hyperparameters
for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in num_epochs:
            print(f"Training with hyperparameters for the BERT model: learning_rate={lr}, batch_size={batch_size}, epochs={epochs}")


            training_args = TrainingArguments(
                output_dir="results",
                optim="adamw_torch",
                learning_rate=lr,
                weight_decay=0.01,
                per_device_train_batch_size=batch_size,
                num_train_epochs=epochs,
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                evaluation_strategy="epoch",
                save_strategy="epoch",
                disable_tqdm=False,
                logging_steps=len(NLU_final_data_encoded['train']) // batch_size,
                seed = 42
            )


            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=NLU_final_data_encoded["train"],
                eval_dataset=NLU_final_data_encoded["cv"],
            )


            trainer.train()

            eval_result = trainer.evaluate()
            current_accuracy = eval_result['eval_accuracy']
            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                best_hyperparameters = {
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'epochs': epochs
                }




Training with hyperparameters for the BERT model: learning_rater=2e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.57,1.29349,0.765163,0.727636
2,1.0268,0.807628,0.849922,0.83595
3,0.7181,0.714993,0.866252,0.855215


Training with hyperparameters for the BERT model: learning_rater=2e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4981,0.508359,0.88647,0.88181
2,0.2717,0.446562,0.895023,0.893405
3,0.2033,0.423817,0.907465,0.906253
4,0.1696,0.422216,0.904355,0.903223


Training with hyperparameters for the BERT model: learning_rater=2e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1789,0.430854,0.902022,0.900429
2,0.1288,0.417067,0.908243,0.908202
3,0.099,0.425676,0.902799,0.902319


Training with hyperparameters for the BERT model: learning_rater=2e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0762,0.447875,0.90902,0.908665
2,0.0575,0.464219,0.907465,0.907492
3,0.0599,0.458019,0.906687,0.90649
4,0.0467,0.451649,0.908243,0.90797


Training with hyperparameters for the BERT model: learning_rater=3e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0734,0.507789,0.893468,0.893282
2,0.0441,0.472523,0.909798,0.909519
3,0.027,0.471864,0.907465,0.907258


Training with hyperparameters for the BERT model: learning_rater=3e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0313,0.528732,0.896579,0.897088
2,0.0229,0.530356,0.902799,0.904227
3,0.0226,0.536208,0.905132,0.905385
4,0.0202,0.518417,0.906687,0.906597


Training with hyperparameters for the BERT model: learning_rater=3e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0156,0.580062,0.905132,0.905378
2,0.0186,0.573441,0.902022,0.901916
3,0.0144,0.56497,0.908243,0.908328


Training with hyperparameters for the BERT model: learning_rater=3e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0093,0.680673,0.902022,0.901911
2,0.0132,0.63256,0.900467,0.899979
3,0.0142,0.594094,0.913686,0.914734
4,0.0108,0.593797,0.912908,0.914182


In [51]:
# to compute the probability each label will take

NLU_test_data = NLU_final_data['test']['text']
input_ids = tokenizer(NLU_test_data, padding = True, truncation = True, return_tensors = 'pt')

with torch.no_grad() :
  inputs = input_ids.to('cuda')
  outputs = model(**inputs)
  logits = outputs.logits

probs = torch.softmax(logits, dim=1).cpu().numpy()
probs

array([[2.9288361e-05, 8.2082908e-05, 4.9363560e-05, ..., 5.6012468e-05,
        2.5974246e-05, 6.6935550e-05],
       [7.5163349e-08, 3.8875319e-07, 1.3829178e-07, ..., 1.9397735e-08,
        1.5253739e-06, 8.7174004e-07],
       [9.1109655e-07, 8.5147036e-07, 1.4299515e-07, ..., 7.4451430e-07,
        3.5354483e-06, 8.1604339e-06],
       ...,
       [2.9364865e-06, 4.2304539e-05, 3.7195237e-05, ..., 4.1376631e-04,
        1.2891278e-04, 1.2446867e-04],
       [5.3619736e-08, 6.0721044e-07, 7.8242183e-06, ..., 8.1965197e-08,
        2.6184972e-07, 8.3752684e-07],
       [1.2728000e-04, 2.1934109e-05, 1.1491073e-04, ..., 9.7399294e-05,
        3.4255052e-05, 2.3728065e-04]], dtype=float32)

In [52]:
# to take the labels which has given the highest probabibility

predicted_labels = np.argmax(probs, axis = 1)

In [53]:
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_test, predicted_labels)
f1score = f1_score(y_test, predicted_labels, average='weighted')

print(f'Accuracy of the test set of the hypertuned BERT model: {accuracy}')
print(f'F1 Score of the test set of the hypertuned BERT model: {f1score}')


Accuracy of the test set of the hypertuned BERT model: 0.9082426127527217
F1 Score of the test set of the hypertuned BERT model: 0.9081904792376323


In [54]:
# my label to intent dictionary

label_to_intent = {
    0: 'alarm_query',
    1: 'alarm_remove',
    2: 'alarm_set',
    3: 'audio_volume_down',
    4: 'audio_volume_mute',
    5: 'audio_volume_other',
    6: 'audio_volume_up',
    7: 'calendar_query',
    8: 'calendar_remove',
    9: 'calendar_set',
    10: 'cooking_query',
    11: 'cooking_recipe',
    12: 'datetime_convert',
    13: 'datetime_query',
    14: 'email_addcontact',
    15: 'email_query',
    16: 'email_querycontact',
    17: 'email_sendemail',
    18: 'general_affirm',
    19: 'general_commandstop',
    20: 'general_confirm',
    21: 'general_dontcare',
    22: 'general_explain',
    23: 'general_greet',
    24: 'general_joke',
    25: 'general_negate',
    26: 'general_praise',
    27: 'general_quirky',
    28: 'general_repeat',
    29: 'iot_cleaning',
    30: 'iot_coffee',
    31: 'iot_hue_lightchange',
    32: 'iot_hue_lightdim',
    33: 'iot_hue_lightoff',
    34: 'iot_hue_lighton',
    35: 'iot_hue_lightup',
    36: 'iot_wemo_off',
    37: 'iot_wemo_on',
    38: 'lists_createoradd',
    39: 'lists_query',
    40: 'lists_remove',
    41: 'music_dislikeness',
    42: 'music_likeness',
    43: 'music_query',
    44: 'music_settings',
    45: 'news_query',
    46: 'play_audiobook',
    47: 'play_game',
    48: 'play_music',
    49: 'play_podcasts',
    50: 'play_radio',
    51: 'qa_currency',
    52: 'qa_definition',
    53: 'qa_factoid',
    54: 'qa_maths',
    55: 'qa_stock',
    56: 'recommendation_events',
    57: 'recommendation_locations',
    58: 'recommendation_movies',
    59: 'social_post',
    60: 'social_query',
    61: 'takeaway_order',
    62: 'takeaway_query',
    63: 'transport_query',
    64: 'transport_taxi',
    65: 'transport_ticket',
    66: 'transport_traffic',
    67: 'weather_query'
}


In [55]:
# checking the model performance on custom text

custom_text = 'why do I not feel well today'


custom_text_input_tensor = tokenizer.encode(custom_text, return_tensors="pt").to(device)
with torch.no_grad():
    logits_custom_text = model(custom_text_input_tensor).logits.cpu()

prob_custom_text = scipy.special.softmax(logits_custom_text.flatten())

prob_labels = np.argmax(prob_custom_text)

custom_text_intent = label_to_intent[prob_labels]
print(f'The intent of the custom text of the BERT model is {custom_text_intent}')



The intent of the custom text of the BERT model is general_quirky


In [56]:
# to train the dataset on the ROBERTA model

ROBERTA model

In [57]:

model_name_roberta = 'roberta-base'

tokenizer = AutoTokenizer.from_pretrained(model_name_roberta)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


NLU_final_data_encoded = NLU_final_data.map(tokenize, batched=True, batch_size=None)
NLU_final_data_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
NLU_final_data_encoded.set_format("torch")
NLU_final_data_encoded

num_labels = 68



Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/23143 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

In [58]:
from transformers import AutoModelForSequenceClassification

num_labels = 68
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForSequenceClassification.from_pretrained(
    model_name_roberta, num_labels=68
).to(device )

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [59]:
# setting up the hyperparameter search to get the best performing model

from transformers import TrainingArguments
from transformers import Trainer

best_accuracy = 0   #initializing the best accuracy
best_hyperparameters = {}

learning_rates = [2e-5, 3e-5]
batch_sizes = [64, 128]
num_epochs = [3,4]

# looping through all combinations of hyperparameters
for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in num_epochs:
            print(f"Training with hyperparameters for the ROBERTA model: learning_rate={lr}, batch_size={batch_size}, epochs={epochs}")


            training_args = TrainingArguments(
                output_dir="results",
                optim="adamw_torch",
                learning_rate=lr,
                weight_decay=0.01,
                per_device_train_batch_size=batch_size,
                num_train_epochs=epochs,
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                evaluation_strategy="epoch",
                save_strategy="epoch",
                disable_tqdm=False,
                logging_steps=len(NLU_final_data_encoded['train']) // batch_size,
                seed = 42
            )


            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=NLU_final_data_encoded["train"],
                eval_dataset=NLU_final_data_encoded["cv"],
            )


            trainer.train()

            eval_result = trainer.evaluate()
            current_accuracy = eval_result['eval_accuracy']
            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                best_hyperparameters = {
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'epochs': epochs
                }

Training with hyperparameters for the ROBERTA model: learning_rater=2e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.1636,0.934428,0.810264,0.787348
2,0.8035,0.638881,0.86703,0.861303
3,0.5805,0.573454,0.878694,0.872752


Training with hyperparameters for the ROBERTA model: learning_rater=2e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4321,0.495748,0.880249,0.878274
2,0.2648,0.46617,0.890358,0.890631
3,0.2204,0.417205,0.904355,0.90312
4,0.1937,0.421296,0.907465,0.907263


Training with hyperparameters for the ROBERTA model: learning_rater=2e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2078,0.458161,0.895801,0.896521
2,0.1555,0.442827,0.900467,0.900282
3,0.1267,0.432137,0.905132,0.904635


Training with hyperparameters for the ROBERTA model: learning_rater=2e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0923,0.505414,0.899689,0.900394
2,0.0753,0.49263,0.901244,0.901447
3,0.073,0.474802,0.902799,0.902524
4,0.0703,0.486216,0.902799,0.902269


Training with hyperparameters for the ROBERTA model: learning_rater=3e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0986,0.543857,0.895801,0.896029
2,0.0633,0.555241,0.893468,0.892605
3,0.0421,0.558265,0.898134,0.897153


Training with hyperparameters for the ROBERTA model: learning_rater=3e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0521,0.620354,0.895023,0.893578
2,0.0432,0.657987,0.893468,0.893515
3,0.0325,0.65533,0.900467,0.899706
4,0.0329,0.665904,0.901244,0.900526


Training with hyperparameters for the ROBERTA model: learning_rater=3e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0345,0.64419,0.896579,0.895407
2,0.0273,0.632702,0.904355,0.903667
3,0.0176,0.638825,0.907465,0.906716


Training with hyperparameters for the ROBERTA model: learning_rater=3e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0259,0.649273,0.899689,0.898937
2,0.0197,0.663512,0.902022,0.900546
3,0.0147,0.68317,0.901244,0.900323
4,0.023,0.661597,0.903577,0.902552


In [60]:
# evaluating the model performance on the test set

NLU_test_data = NLU_final_data['test']['text']
input_ids = tokenizer(NLU_test_data, padding = True, truncation = True, return_tensors = 'pt')

with torch.no_grad() :
  inputs = input_ids.to('cuda')
  outputs = model(**inputs)
  logits = outputs.logits

probs_roberta = torch.softmax(logits, dim=1).cpu().numpy()
probs_roberta

array([[7.7173229e-07, 3.9820261e-06, 6.7714416e-07, ..., 1.1139392e-05,
        1.1969215e-06, 9.1098855e-07],
       [1.1910334e-07, 2.8949248e-07, 6.7507915e-07, ..., 6.3895465e-07,
        8.2183021e-07, 3.8579124e-06],
       [5.4293105e-07, 1.5658902e-07, 1.1185957e-06, ..., 8.7216910e-07,
        7.9133463e-07, 3.7681386e-06],
       ...,
       [4.8760812e-05, 7.2194953e-06, 4.2862564e-05, ..., 2.8436115e-02,
        2.2021051e-04, 6.2475556e-05],
       [1.1930064e-06, 2.4697977e-07, 3.8071132e-06, ..., 1.0062316e-06,
        8.7512677e-07, 1.4321280e-06],
       [1.3330356e-03, 9.1929157e-04, 3.3979974e-04, ..., 4.2970278e-04,
        2.3144262e-03, 1.2881192e-04]], dtype=float32)

In [61]:
predicted_labels_roberta = np.argmax(probs_roberta, axis=1)

In [62]:
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_test, predicted_labels_roberta)
f1score = f1_score(y_test, predicted_labels_roberta, average='weighted')

print(f'Accuracy of the test set of the ROBERTA model: {accuracy}')
print(f'F1 Score of the test set of the ROBERTA model: {f1score}')

Accuracy of the test set of the ROBERTA model: 0.9004665629860031
F1 Score of the test set of the ROBERTA model: 0.9017549959877856


In [63]:
# checking the model performance on custom text

custom_text = 'why do i not feel well today'


custom_text_input_tensor = tokenizer.encode(custom_text, return_tensors="pt").to(device)
with torch.no_grad():
    logits_custom_text = model(custom_text_input_tensor).logits.cpu()

prob_custom_text = scipy.special.softmax(logits_custom_text.flatten())

prob_labels = np.argmax(prob_custom_text)

custom_text_intent = label_to_intent[prob_labels]

print(f'The intent of the custom text of the ROBERTA model is {custom_text_intent}')


The intent of the custom text of the ROBERTA model is general_quirky


In [64]:
# to train the dataset on XLNet base model

Distilbert model


In [65]:

model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


NLU_final_data_encoded = NLU_final_data.map(tokenize, batched=True, batch_size=None)
NLU_final_data_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
NLU_final_data_encoded.set_format("torch")
NLU_final_data_encoded

num_labels = 68



Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/23143 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

In [66]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=68
).to(device )

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [67]:
# setting up the hyperparameter search to get the best performing model

from transformers import TrainingArguments
from transformers import Trainer

best_accuracy = 0   #initializing the best accuracy
best_hyperparameters = {}

learning_rates = [2e-5, 3e-5]
batch_sizes = [64, 128]
num_epochs = [3,4]

# looping through all combinations of hyperparameters
for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in num_epochs:
            print(f"Training with hyperparameters for the DISTILBERT model: learning_rater={lr}, batch_size={batch_size}, epochs={epochs}")


            training_args = TrainingArguments(
                output_dir="results",
                optim="adamw_torch",
                learning_rate=lr,
                weight_decay=0.01,
                per_device_train_batch_size=batch_size,
                num_train_epochs=epochs,
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                evaluation_strategy="epoch",
                save_strategy="epoch",
                disable_tqdm=False,
                logging_steps=len(NLU_final_data_encoded['train']) // batch_size,
                seed = 42
            )


            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=NLU_final_data_encoded["train"],
                eval_dataset=NLU_final_data_encoded["cv"],
            )


            trainer.train()

            eval_result = trainer.evaluate()
            current_accuracy = eval_result['eval_accuracy']
            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                best_hyperparameters = {
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'epochs': epochs
                }

Training with hyperparameters for the DISTILBERT model: learning_rater=2e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.4651,1.237634,0.761275,0.723777
2,1.0018,0.775184,0.838258,0.821267
3,0.7056,0.683117,0.853033,0.841578


Training with hyperparameters for the DISTILBERT model: learning_rater=2e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5007,0.488247,0.871695,0.868471
2,0.3005,0.425858,0.884914,0.884101
3,0.2268,0.395016,0.900467,0.899682
4,0.1916,0.398922,0.898134,0.89768


Training with hyperparameters for the DISTILBERT model: learning_rater=2e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2019,0.409314,0.895023,0.893776
2,0.1485,0.399746,0.901244,0.901697
3,0.1186,0.401432,0.903577,0.903806


Training with hyperparameters for the DISTILBERT model: learning_rater=2e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0911,0.429932,0.898134,0.898608
2,0.0686,0.426076,0.904355,0.905068
3,0.0568,0.427224,0.902799,0.903126
4,0.0531,0.433284,0.90591,0.906385


Training with hyperparameters for the DISTILBERT model: learning_rater=3e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0615,0.489609,0.894246,0.895165
2,0.0377,0.494943,0.901244,0.901547
3,0.0222,0.502222,0.900467,0.901211


Training with hyperparameters for the DISTILBERT model: learning_rater=3e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0263,0.579242,0.891913,0.89275
2,0.0173,0.564032,0.897356,0.89841
3,0.0175,0.564743,0.903577,0.904345
4,0.0159,0.557454,0.904355,0.905312


Training with hyperparameters for the DISTILBERT model: learning_rater=3e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0143,0.602701,0.902799,0.903382
2,0.0106,0.604896,0.899689,0.900044
3,0.0073,0.573181,0.903577,0.904258


Training with hyperparameters for the DISTILBERT model: learning_rater=3e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0076,0.674107,0.902022,0.902286
2,0.0075,0.642919,0.902022,0.902024
3,0.005,0.645105,0.905132,0.906189
4,0.0062,0.631079,0.909798,0.910306


In [68]:
# evaluating the model performance on the test set

NLU_test_data = NLU_final_data['test']['text']
input_ids = tokenizer(NLU_test_data, padding = True, truncation = True, return_tensors = 'pt')

with torch.no_grad() :
  inputs = input_ids.to('cuda')
  outputs = model(**inputs)
  logits = outputs.logits

probs = torch.softmax(logits, dim=1).cpu().numpy()

predicted_labels = np.argmax(probs, axis=1)

In [69]:
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_test, predicted_labels)
f1score = f1_score(y_test, predicted_labels, average='weighted')

print(f'Accuracy of the test set for the Distillbert model: {accuracy}')
print(f'F1 Score of the test set for the Distillbert model: {f1score}')

Accuracy of the test set for the Distillbert model: 0.9105754276827371
F1 Score of the test set for the Distillbert model: 0.9117453024964791


In [70]:
# checking the model performance on custom text

custom_text = 'why do i not feel well today'


custom_text_input_tensor = tokenizer.encode(custom_text, return_tensors="pt").to(device)
with torch.no_grad():
    logits_custom_text = model(custom_text_input_tensor).logits.cpu()

prob_custom_text = scipy.special.softmax(logits_custom_text.flatten())

prob_labels = np.argmax(prob_custom_text)

custom_text_intent = label_to_intent[prob_labels]

print(f'The intent of the custom text of the DISTILBERT model is {custom_text_intent}')


The intent of the custom text of the DISTILBERT model is general_quirky


XLNet model

In [71]:

model_name = 'XLNet-base-cased'

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


NLU_final_data_encoded = NLU_final_data.map(tokenize, batched=True, batch_size=None)
NLU_final_data_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
NLU_final_data_encoded.set_format("torch")
NLU_final_data_encoded

num_labels = 68

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Map:   0%|          | 0/23143 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

In [72]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=68
).to(device )

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at XLNet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at XLNet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [73]:
# setting up the hyperparameter search to get the best performing model for the XLNet model

from transformers import TrainingArguments
from transformers import Trainer

best_accuracy = 0   #initializing the best accuracy
best_hyperparameters = {}

learning_rates = [2e-5, 3e-5]
batch_sizes = [64, 128]
num_epochs = [3,4]

# looping through all combinations of hyperparameters
for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in num_epochs:
            print(f"Training with hyperparameters for the XLNet model: learning_rater={lr}, batch_size={batch_size}, epochs={epochs}")


            training_args = TrainingArguments(
                output_dir="results",
                optim="adamw_torch",
                learning_rate=lr,
                weight_decay=0.01,
                per_device_train_batch_size=batch_size,
                num_train_epochs=epochs,
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                evaluation_strategy="epoch",
                save_strategy="epoch",
                disable_tqdm=False,
                logging_steps=len(NLU_final_data_encoded['train']) // batch_size,
                seed = 42
            )


            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=NLU_final_data_encoded["train"],
                eval_dataset=NLU_final_data_encoded["cv"],
            )


            trainer.train()

            eval_result = trainer.evaluate()
            current_accuracy = eval_result['eval_accuracy']
            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                best_hyperparameters = {
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'epochs': epochs
                }

Training with hyperparameters for the XLNet model: learning_rater=2e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.3147,0.868951,0.783048,0.757704
2,0.7393,0.584589,0.857698,0.851386
3,0.5243,0.534224,0.87014,0.864734


Training with hyperparameters for the XLNet model: learning_rater=2e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4197,0.499519,0.877916,0.876263
2,0.2438,0.497733,0.884914,0.883613
3,0.2195,0.467451,0.891135,0.888842
4,0.2143,0.460618,0.897356,0.895736


Training with hyperparameters for the XLNet model: learning_rater=2e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2313,0.460146,0.899689,0.899833
2,0.1758,0.460772,0.893468,0.892479
3,0.1434,0.450092,0.898911,0.897733


Training with hyperparameters for the XLNet model: learning_rater=2e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0912,0.500191,0.896579,0.896797
2,0.1592,0.475927,0.894246,0.894134
3,0.1238,0.477505,0.899689,0.898285
4,0.1003,0.478965,0.898911,0.897244


Training with hyperparameters for the XLNet model: learning_rater=3e-05, batch_size=64, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0883,0.567693,0.893468,0.89392
2,0.0565,0.574793,0.895023,0.895223
3,0.0393,0.588452,0.897356,0.89604


Training with hyperparameters for the XLNet model: learning_rater=3e-05, batch_size=64, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.046,0.668768,0.891913,0.891432
2,0.0294,0.663145,0.895023,0.895373
3,0.034,0.668325,0.897356,0.896045
4,0.0492,0.662185,0.900467,0.899159


Training with hyperparameters for the XLNet model: learning_rater=3e-05, batch_size=128, epochs=3


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0303,0.695028,0.891913,0.891641
2,0.0375,0.660779,0.891913,0.890382
3,0.0296,0.647256,0.901244,0.899691


Training with hyperparameters for the XLNet model: learning_rater=3e-05, batch_size=128, epochs=4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0201,0.808743,0.879471,0.878715
2,0.0259,0.708855,0.895801,0.894854
3,0.0264,0.656722,0.902022,0.902174
4,0.0355,0.656486,0.902799,0.901192


In [74]:
# evaluating the model performance on the test set

NLU_test_data = NLU_final_data['test']['text']
input_ids = tokenizer(NLU_test_data, padding = True, truncation = True, return_tensors = 'pt')

with torch.no_grad() :
  inputs = input_ids.to('cuda')
  outputs = model(**inputs)
  logits = outputs.logits

probs = torch.softmax(logits, dim=1).cpu().numpy()

predicted_labels = np.argmax(probs, axis=1)

In [75]:
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_test, predicted_labels)
f1score = f1_score(y_test, predicted_labels, average='weighted')

print(f'Accuracy of the test set for the XLNet model: {accuracy}')
print(f'F1 Score of the test set for the XLNet model: {f1score}')

Accuracy of the test set for the XLNet model: 0.9020217729393468
F1 Score of the test set for the XLNet model: 0.90253669144688


In [76]:
# checking the model performance on custom text

custom_text = 'why do i not feel well today'


custom_text_input_tensor = tokenizer.encode(custom_text, return_tensors="pt").to(device)
with torch.no_grad():
    logits_custom_text = model(custom_text_input_tensor).logits.cpu()

prob_custom_text = scipy.special.softmax(logits_custom_text.flatten())

prob_labels = np.argmax(prob_custom_text)

custom_text_intent = label_to_intent[prob_labels]

print(f'The intent of the custom text of the XLNet model is {custom_text_intent}')


The intent of the custom text of the XLNet model is general_quirky
