#Import Libraries

In [33]:
!pip install datasets



In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import classification_report, confusion_matrix

# Load Dataset

In [35]:
df = pd.read_csv('/content/intent recognition.csv')

df

Unnamed: 0,text,intent
0,add sabrina salerno to the grime instrumentals...,AddToPlaylist
1,i want to bring four people to a place that s ...,BookRestaurant
2,put lindsey cardinale into my hillary clinton ...,AddToPlaylist
3,will it snow in mt on june 13 2038,GetWeather
4,play signe anderson chant music that is newest,PlayMusic
...,...,...
695,find a tv series called armageddon summer,SearchCreativeWork
696,find politicsnation with al sharpton,SearchCreativeWork
697,rate this album 0 points out of 6,RateBook
698,add leah kauffman to my uncharted 4 nathan dra...,AddToPlaylist


In [36]:
df['intent'].value_counts()

Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
AddToPlaylist,124
SearchCreativeWork,107
SearchScreeningEvent,107
GetWeather,104
BookRestaurant,92
PlayMusic,86
RateBook,80


In [37]:
df = df.sample(n=3000,  replace=True)

In [38]:
df['intent'].value_counts()

Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
AddToPlaylist,537
SearchCreativeWork,459
GetWeather,454
SearchScreeningEvent,452
BookRestaurant,377
PlayMusic,373
RateBook,348


In [39]:
df.isnull().sum()

Unnamed: 0,0
text,0
intent,0


# Cleaning

In [40]:
df['intent'] = df['intent'].str.lower()
df

Unnamed: 0,text,intent
102,can i get the butterfly crush showings,searchscreeningevent
435,i m looking for dead at 21 the tv series,searchcreativework
270,play music on youtube,playmusic
106,for my crossfit playlist add the soul sessions...,addtoplaylist
71,play some steve boyett chant music,playmusic
...,...,...
119,add turk to the deep house playlist,addtoplaylist
436,can you make reservations at a tea house that ...,bookrestaurant
675,play the album vibrations by marion elise raven,playmusic
464,i rate the chronicle son of the tree with fou...,ratebook


In [41]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):

  # convert to lowercase
  text = text.lower()

  # remove stopwords
  text = ' '.join([word for word in text.split() if word not in stop_words])

  return text

df['clean'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
df

Unnamed: 0,text,intent,clean
102,can i get the butterfly crush showings,searchscreeningevent,get butterfly crush showings
435,i m looking for dead at 21 the tv series,searchcreativework,looking dead 21 tv series
270,play music on youtube,playmusic,play music youtube
106,for my crossfit playlist add the soul sessions...,addtoplaylist,crossfit playlist add soul sessions volume 2
71,play some steve boyett chant music,playmusic,play steve boyett chant music
...,...,...,...
119,add turk to the deep house playlist,addtoplaylist,add turk deep house playlist
436,can you make reservations at a tea house that ...,bookrestaurant,make reservations tea house serves fettucine
675,play the album vibrations by marion elise raven,playmusic,play album vibrations marion elise raven
464,i rate the chronicle son of the tree with fou...,ratebook,rate chronicle son tree four 6 points


In [43]:
df.drop(['text'], axis=1, inplace=True)

# Encode Intent

In [44]:
label = df['intent'].unique()

{label: i for i, label in enumerate(label)}


{'searchscreeningevent': 0,
 'searchcreativework': 1,
 'playmusic': 2,
 'addtoplaylist': 3,
 'ratebook': 4,
 'getweather': 5,
 'bookrestaurant': 6}

In [45]:
label = df['intent'].unique()

label_id = {label: i for i, label in enumerate(label)}

df['label'] = df['intent'].map(label_id)

In [46]:
df.head()

Unnamed: 0,intent,clean,label
102,searchscreeningevent,get butterfly crush showings,0
435,searchcreativework,looking dead 21 tv series,1
270,playmusic,play music youtube,2
106,addtoplaylist,crossfit playlist add soul sessions volume 2,3
71,playmusic,play steve boyett chant music,2


In [47]:
df.drop(['intent'], axis=1, inplace=True)

# Tokenization

In [48]:
# load model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


In [49]:
max_length = max([len(tokenizer.encode(text)) for text in df['clean']])
max_length

19

In [50]:

def tokenize_function(dataset):
  token_input = tokenizer(dataset['clean'], padding='max_length', truncation=True, max_length=max_length)
  token_input['label'] = dataset['label']
  return token_input

dataset = Dataset.from_pandas(df[['clean', 'label']]) # Convert the dataframe into a HuggingFace Dataset

dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [51]:
dataset[1:10]

{'clean': ['looking dead 21 tv series',
  'play music youtube',
  'crossfit playlist add soul sessions volume 2',
  'play steve boyett chant music',
  'rate current novel four 6 stars',
  'textbook gets two',
  'check forecast current spot future oct 19 2037',
  'cloud coverage verdery myanmar',
  'forecast belize around meal time'],
 'label': [1, 2, 3, 2, 4, 4, 5, 5, 5],
 '__index_level_0__': [435, 270, 106, 71, 20, 614, 121, 466, 214],
 'input_ids': [[101,
   2559,
   2757,
   2538,
   2694,
   2186,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101, 2377, 2189, 7858, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [101,
   2892,
   8873,
   2102,
   2377,
   9863,
   5587,
   3969,
   6521,
   3872,
   1016,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   2377,
   3889,
   2879,
   6582,
   16883,
   2189,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   3446,
   2783,
   3117,
   2176,
   102

In [52]:
dataset[0]

{'clean': 'get butterfly crush showings',
 'label': 0,
 '__index_level_0__': 102,
 'input_ids': [101,
  2131,
  9112,
  10188,
  4760,
  2015,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

# Fine Tunning

In [53]:
# Initialize DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=7)

# Move model to GPU
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [54]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,

)

# Trainer setup
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset,
    eval_dataset = dataset,

)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1056,0.071888
2,0.0603,0.012297
3,0.0117,0.00824


TrainOutput(global_step=564, training_loss=0.2560895457005141, metrics={'train_runtime': 64.4124, 'train_samples_per_second': 139.725, 'train_steps_per_second': 8.756, 'total_flos': 44245986318000.0, 'train_loss': 0.2560895457005141, 'epoch': 3.0})

# *Evaluation*

In [55]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate on test dataset
predictions,  labels, _ = trainer.predict(dataset)

# Convert prediction to lable
predictions_lables = predictions.argmax(axis=-1)

# Generate classification report and confusion matrix
print('Classification Report:')
print(classification_report(labels, predictions_lables))

print('Confusion Matrix:')
print(confusion_matrix(labels, predictions_lables))

# give ( ,_ )if error

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       452
           1       1.00      1.00      1.00       459
           2       1.00      1.00      1.00       373
           3       1.00      1.00      1.00       537
           4       1.00      1.00      1.00       348
           5       1.00      1.00      1.00       454
           6       1.00      1.00      1.00       377

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Confusion Matrix:
[[452   0   0   0   0   0   0]
 [  0 459   0   0   0   0   0]
 [  0   0 373   0   0   0   0]
 [  0   0   0 537   0   0   0]
 [  0   0   0   0 348   0   0]
 [  0   0   0   0   0 454   0]
 [  0   0   0   0   0   0 377]]


# Save Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [56]:
model.save_pretrained('/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/model')
tokenizer.save_pretrained('/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token')


('/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token/tokenizer_config.json',
 '/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token/special_tokens_map.json',
 '/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token/vocab.txt',
 '/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token/added_tokens.json',
 '/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token/tokenizer.json')

#Prediction System

In [57]:
# Load the saved model + token
model = DistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/model')
tokenizer = DistilBertTokenizerFast.from_pretrained('/content/drive/MyDrive/GenAI Project(1-99)/Intent-Detection system || Fine-Tuning-DistilBERT/token')

In [64]:
def predict(text, model, tokenizer, max_length=22):

  #lower case and remove stopwords
  text = preprocess_text(text)

  #tokenize
  inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

  with torch.no_grad():
    outputs = model(**inputs)
    outputs = outputs.logits

  predicted_label = torch.argmax(outputs, dim=1).item()

  return predicted_label


In [66]:
classes = {0 : 'searchscreeningevent',
 1 : 'searchcreativework',
 2 :'playmusic',
 3 : 'addtoplaylist',
 4 : 'ratebook',
 5 : 'getweather',
 6 : 'bookrestaurant'}

input = "What time does the movie start tonight?"

# Call function
predicted_label = predict(input, model, tokenizer)
# Call Classes
predicted_intent = classes.get(predicted_label)
print('Message:', input)
print('Predict_intent:', predicted_intent)

Message: What time does the movie start tonight?
Predict_intent: searchscreeningevent
