<a href="https://colab.research.google.com/github/PratikHazarika/AI-ML-Projects/blob/main/NLP/Intent%20Classification/src/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading packages

In [3]:
! pip install pickle-mixin.
! pip install wget

[31mERROR: Invalid requirement: 'pickle-mixin.'[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Importing required libraries

In [4]:
import wget, json, keras, pickle

import tensorflow as tf
import pandas as pd
import numpy as np

from pprint import pprint as pp

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from keras.models import Sequential, load_model
# No trailing comma here: an unparenthesised import list ending in ',' is a SyntaxError.
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, Embedding

# Mount Google Drive; the notebook later saves/loads model artefacts below this mount point.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# Silence all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

Mounted at /content/gdrive/


# Data preprocessing

## Loading CLINC 150 dataset

In [5]:
import os  # only needed here, for the "already downloaded?" check

url = 'https://raw.githubusercontent.com/clinc/oos-eval/master/data/data_full.json'
glove ='https://www.dropbox.com/s/a247ju2qsczh0be/glove.6B.100d.txt?dl=1'

# Only fetch a file when it is not on disk yet, so re-running this cell
# does not download the dataset and the GloVe vectors again.
if not os.path.exists('data_full.json'):
  wget.download(url)

if not os.path.exists('glove.6B.100d.txt'):
  wget.download(glove)

# Parse the dataset JSON straight from the file handle.
with open('data_full.json') as file:
  sample_data = json.load(file)

## Exploring the dataset

In [6]:
# Iterating the loaded JSON object yields its top-level keys, one per dataset split.
for split_name in sample_data.keys():
  print(split_name)

oos_val
val
train
oos_test
test
oos_train


### Data Sizes

In [7]:
# Number of examples in each split.
print(f"test size: {len(sample_data['test'])}")
print(f"train size: {len(sample_data['train'])}")
print(f"oos train size: {len(sample_data['oos_train'])}")
# This line reports the oos *test* split; it used to be mislabelled "oos train size".
print(f"oos test size: {len(sample_data['oos_test'])}")
print(f"val: {len(sample_data['val'])}")

test size: 4500
train size: 15000
oos train size: 100
oos train size: 1000
val: 3000


### Intents 

In [12]:
# The second field of every training example is its intent label.
intents = [example[1] for example in sample_data['train']]

# Duplicates collapse in the set, leaving the number of distinct intents.
print(f'Total intents in the dataset = {len(set(intents))} \n')

Total intents in the dataset = 150 



In [None]:
# Print every distinct intent label on its own line.
unique_intents = set(intents)
for label in unique_intents:
  print(label)

### Train sentences

In [None]:
# The first field of each training example is the sentence text.
for example in sample_data['train']:
  sentence_text = example[0]
  print(sentence_text)

### Test sentences

In [None]:
# Print the sentence text of every *test* example.
# (This cell used to iterate the 'train' split, repeating the previous section.)
for test_sentence in sample_data['test']:
  print(test_sentence[0])

### OOS data

In [None]:
# Shallow copy of the out-of-scope training examples.
oos_train = list(sample_data['oos_train'])

print(f'number of oos train values = {len((oos_train))}')

# Show each example on its own line.
for example in oos_train:
  print(example)

In [None]:
# Shallow copy of the out-of-scope test examples.
oos_test = list(sample_data['oos_test'])

print(f'number of oos test values = {len((oos_test))}')

# Show each example on its own line.
for example in oos_test:
  print(example)

### Val data

In [None]:
# Print every validation example.
for example in sample_data['val']:
  print(example)

### Loading out of scope and other data
Most supervised machine learning tasks assume a dataset with a well-defined set of target intents. But what happens when a trained model meets the real world, where inputs might not belong to that well-defined set? This dataset offers a way to evaluate intent classification models on "out-of-scope" inputs.

"Out-of-scope" inputs are those that do not belong to the set of "in-scope" target intents. You may have heard other ways of referring to out-of-scope, including "out-of-domain" or "out-of-distribution".

In [19]:
data = sample_data

# Out-of-scope examples, one array per split.
val_oos, train_oos, test_oos = (
    np.array(data[split]) for split in ('oos_val', 'oos_train', 'oos_test')
)

# Remaining (non-oos) examples, one array per split.
val_others, train_others, test_others = (
    np.array(data[split]) for split in ('val', 'train', 'test')
)

## Merging oos and other data

In [20]:
# Prepend the out-of-scope examples to each split.
train = np.concatenate((train_oos, train_others))
test = np.concatenate((test_oos, test_others))
val = np.concatenate((val_oos, val_others))

# Stack all splits into one array, then transpose it so the first axis
# selects a field (index 0 / index 1) instead of an example.
data = np.concatenate((train, test, val)).T

## Splitting training and test data

In [21]:
text = data[0]
intents = data[1]

# Hold out 30% of the examples for testing. A fixed random_state makes the
# split (and everything derived from it) identical on every run.
train_txt, test_txt, train_intent, test_intents = train_test_split(
    text, intents, test_size = 0.3, random_state = 42)

# Data processing
## Tokenizing and converting words to integers

In [None]:
# Vocabulary-size cap handed to the tokenizer.
max_num_words = 4000

# Fit the tokenizer on the training sentences only.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train_txt)

# Word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index

# Distinct intent labels across the whole dataset.
classes = np.unique(intents)

## Padding the phrases

In [None]:
# Word count of every training sentence (whitespace split).
ls = list(map(lambda sentence: len(sentence.split()), train_txt))

# Sequence length = 98th percentile of the training sentence lengths.
max_len = int(np.percentile(ls, 98))

# Turn sentences into integer sequences and zero-pad them at the end to max_len.
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_txt), maxlen=max_len, padding='post')

test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_txt), maxlen=max_len, padding='post')

## Encoding

In [None]:
# Map every intent label to an integer id.
intent_encoder = LabelEncoder()
integer_encoded = intent_encoder.fit_transform(classes)

# Dense one-hot encoder over those integer ids. scikit-learn renamed the
# `sparse` keyword to `sparse_output` and newer releases reject the old name,
# so try the new spelling first and fall back for older installations.
try:
  onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
  onehot_encoder = OneHotEncoder(sparse=False)

integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder.fit(integer_encoded)

# NOTE: train_intent / test_intents are overwritten with their one-hot form
# below, so this cell must not be re-run without re-running the split cell.
train_intent_encoded = intent_encoder.transform(train_intent)
train_intent_encoded = train_intent_encoded.reshape(len(train_intent_encoded), 1)
train_intent = onehot_encoder.transform(train_intent_encoded)

test_intents_encoded = intent_encoder.transform(test_intents)
test_intents_encoded = test_intents_encoded.reshape(len(test_intents_encoded), 1)
test_intents = onehot_encoder.transform(test_intents_encoded)

## Embedding
### Preparing GloVe

In [None]:
# word -> float32 vector, filled from the GloVe text file
embeddings_index = dict()

with open('glove.6B.100d.txt', encoding='utf8') as glove_file:
    for row in glove_file:
        # First whitespace-separated field is the word, the rest are its coefficients.
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

## GloVe embedding

In [None]:
# np.stack needs a real sequence; a dict view is not one, so materialise it first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()

num_words = min(max_num_words, len(word_index)) + 1
# Read the vector width from the stacked array instead of relying on the word 'the' being present.
embedding_dim = all_embs.shape[1]

# Start from random vectors drawn with the same mean/std as the GloVe vectors,
# so words without a pre-trained vector still get a plausible embedding.
embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, embedding_dim))

for word, i in word_index.items():
    # Indices at or above the vocabulary cap get no row of their own.
    if i >= max_num_words:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Model
## Making Model

In [None]:
model = Sequential()

# Frozen embedding layer initialised from embedding_matrix; its width comes
# from embedding_dim rather than a hard-coded 100 so the two cannot drift apart.
model.add(Embedding(num_words, embedding_dim, trainable=False, input_length=train_sequences.shape[1], weights=[embedding_matrix]))
# Bidirectional LSTM whose forward/backward outputs are concatenated.
model.add(Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.1, dropout=0.1), merge_mode='concat'))
model.add(Dropout(0.3))
# Second LSTM returns only its final output.
model.add(LSTM(256, return_sequences=False, recurrent_dropout=0.1, dropout=0.1))
model.add(Dropout(0.3))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
# One softmax output per intent class.
model.add(Dense(classes.shape[0], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

## Training Model

In [None]:
# Train for one epoch and report loss/accuracy on the held-out test split.
# validation_data is passed as an (inputs, targets) tuple, the documented form.
history = model.fit(train_sequences, train_intent, epochs = 1,
          batch_size = 128, shuffle=True,
          validation_data=(test_sequences, test_intents))

## Saving Model

In [None]:
# Paths are relative to the working directory and go through the 'gdrive'
# mount point, matching the paths the loading cell reads from
# (the previous 'drive/...' prefix did not match the mount).
model.save('gdrive/MyDrive/Colab Notebooks/models/intents.h5')

with open('gdrive/MyDrive/Colab Notebooks/utils/classes.pkl','wb') as file:
   pickle.dump(classes,file)

with open('gdrive/MyDrive/Colab Notebooks/utils/tokenizer.pkl','wb') as file:
   pickle.dump(tokenizer,file)

# The fitted label encoder is named `intent_encoder` in this notebook;
# `label_encoder` only exists after the loading cell has run.
with open('gdrive/MyDrive/Colab Notebooks/utils/label_encoder.pkl','wb') as file:
   pickle.dump(intent_encoder,file)

## Loading the model

In [None]:
# Restore the trained network.
model = load_model('gdrive/MyDrive/Colab Notebooks/models/intents.h5')

# Restore the pickled helper objects saved next to it.
with open('gdrive/MyDrive/Colab Notebooks/utils/classes.pkl','rb') as classes_fh:
  classes = pickle.load(classes_fh)

with open('gdrive/MyDrive/Colab Notebooks/utils/tokenizer.pkl','rb') as tokenizer_fh:
  tokenizer = pickle.load(tokenizer_fh)

with open('gdrive/MyDrive/Colab Notebooks/utils/label_encoder.pkl','rb') as encoder_fh:
  label_encoder = pickle.load(encoder_fh)

# Testing

In [None]:
class IntentClassifier:
    """Bundle a trained intent model with its tokenizer and label encoder.

    Args:
        classes: the intent labels; stored on the instance but not otherwise used here.
        model: object with ``predict(padded_sequences)`` returning one row of
            class scores per input row.
        tokenizer: object with ``texts_to_sequences(list_of_str)``.
        label_encoder: object with ``inverse_transform(indices)`` mapping
            class indices back to intent labels.
        max_len: length each tokenised sentence is padded/truncated to.
            Defaults to 16, the value that used to be hard-coded.
    """

    def __init__(self,classes,model,tokenizer,label_encoder,max_len=16):
        self.classes = classes
        self.classifier = model
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_len = max_len

    def _predict(self, text):
        """Tokenise, pad and score one sentence; return its 1-D score vector.

        The intermediate values are kept on the instance (``text``,
        ``test_keras``, ``test_keras_sequence``, ``pred``) exactly as the
        public methods used to set them.
        """
        self.text = [text] # the model expects a batch, so wrap the single sentence
        self.test_keras = self.tokenizer.texts_to_sequences(self.text)
        self.test_keras_sequence = pad_sequences(self.test_keras, maxlen=self.max_len, padding='post')
        self.pred = self.classifier.predict(self.test_keras_sequence)

        return self.pred[0]

    def get_intent(self,text):
        """Return the label of the highest-scoring intent for ``text``."""
        scores = self._predict(text)
        best_index = np.argmax(scores)

        return self.label_encoder.inverse_transform([best_index])[0]

    def get_probability(self, text):
        """Return the three highest scores for ``text``, largest first.

        Fewer than three values are returned if the model has fewer than
        three outputs.
        """
        scores = self._predict(text)
        descending = np.sort(scores)[::-1]

        # Previously the second-highest score was returned twice.
        return list(descending[:3])

    def get_intents(self, text):
        """Return the labels of the three highest-scoring intents, best first.

        Fewer than three labels are returned if the model has fewer than
        three outputs.
        """
        scores = self._predict(text)

        # Stable sort on the negated scores: descending order, ties broken by
        # the lower class index (the same winner np.argmax picks).
        top_indices = np.argsort(-scores, kind='stable')[:3]

        return list(self.label_encoder.inverse_transform(top_indices))



In [None]:
nlu = IntentClassifier(classes,model,tokenizer,label_encoder)
val = sample_data['val']
# Minimum top score required to accept a prediction as in-scope.
threshold = 0.64

for sentence in val[:500]:
  sent = sentence[0]

  # Report what the classifier predicts for the sentence (the loop used to
  # echo the dataset's own label instead). Predictions whose top score does
  # not exceed the threshold are reported as out of scope.
  if nlu.get_probability(sent)[0] > threshold:
    intent = nlu.get_intent(sent)
  else:
    intent = "out of scope"

  print(f"Sentence: {sent}")
  print(f"Intent: {intent}\n\n")