<a href="https://colab.research.google.com/github/RehanKhn/Boosting-Buddy-A-Conversational-Agent-for-Improvement-of-Mental-Health/blob/main/IntentClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

In [2]:
# Authenticate the current Colab user, then build a PyDrive client that uses
# the application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Look up the Drive file with this ID and save its content locally as Dataset.csv.
downloaded = drive.CreateFile({'id':'1zimqy02if1n1DwDTPGMkzZyHsJsYIeqI'}) 
downloaded.GetContentFile('Dataset.csv')

In [4]:
def load_dataset(filename):
  """Read the two-column intent dataset from a CSV file.

  The file is read as latin1 without a header row; its two columns are named
  "Sentence" and "Intent".

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A tuple (intent, unique_intent, sentences):
      intent: pandas Series holding the "Intent" column.
      unique_intent: list of the distinct intents, in order of first appearance.
      sentences: list of the values of the "Sentence" column.
  """
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  intent = df["Intent"]
  # dict.fromkeys de-duplicates while keeping first-seen order. list(set(...))
  # gives an order that can differ between Python processes (string hash
  # randomisation), which would change which label each class index refers to.
  unique_intent = list(dict.fromkeys(intent))
  sentences = list(df["Sentence"])
  return (intent, unique_intent, sentences)

In [5]:
# Load the per-row intents, the list of distinct intents and the raw sentences.
intent, unique_intent, sentences = load_dataset("Dataset.csv")

In [6]:
# NOTE(review): this stemmer is not referenced by any other cell visible here.
stemmer = LancasterStemmer()

In [7]:
def cleaning(sentences):
  """Turn each sentence into a list of lowercase word tokens.

  Every character that is not an ASCII letter, a digit or a space is replaced
  by a space, the result is split with word_tokenize, and each token is
  lowercased. No stemming is applied.

  Args:
    sentences: Iterable of sentence strings.

  Returns:
    A list with one list of lowercase tokens per input sentence.
  """
  def _tokens(sentence):
    stripped = re.sub(r'[^ a-z A-Z 0-9]', " ", sentence)
    return [token.lower() for token in word_tokenize(stripped)]

  return [_tokens(sentence) for sentence in sentences]

In [8]:
# One list of lowercase tokens per training sentence.
cleaned_words = cleaning(sentences)

In [9]:
def create_tokenizer(words, filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  """Create a Tokenizer with the given filters and fit it on `words`.

  Args:
    words: Texts to fit on, passed unchanged to Tokenizer.fit_on_texts.
    filters: String of characters handed to Tokenizer as its `filters`
      argument. The default is written as a raw string so the backslash in it
      is a literal character rather than an invalid "\\]" escape sequence; the
      value itself is unchanged.

  Returns:
    The fitted Tokenizer.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [10]:
def max_length(words):
  """Return the length of the longest item in `words`.

  Args:
    words: Iterable of sized items (for example lists of tokens).

  Returns:
    The largest len() found, or 0 when `words` is empty (instead of raising
    ValueError from max() on an empty sequence).
  """
  return max(map(len, words), default = 0)

In [11]:
# Fit the word tokenizer on the cleaned corpus.
word_tokenizer = create_tokenizer(cleaned_words)
# NOTE(review): the +1 presumably leaves room for index 0, which the padded
# sequences use as filler -- confirm against the Tokenizer's indexing.
vocab_size = len(word_tokenizer.word_index) + 1
# Computed inline rather than through the max_length() helper: the result is
# stored under the helper's own name, so calling the helper here would fail
# with "'int' object is not callable" as soon as this cell is run a second time.
max_length = len(max(cleaned_words, key = len))

In [12]:
def encoding_doc(token, words):
  """Encode `words` with a fitted tokenizer.

  Args:
    token: Object exposing texts_to_sequences (a fitted Tokenizer).
    words: Texts to encode, passed through unchanged.

  Returns:
    Whatever token.texts_to_sequences(words) returns.
  """
  sequences = token.texts_to_sequences(words)
  return sequences

In [13]:
# Encode every cleaned sentence with the fitted word tokenizer.
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [14]:
def padding_doc(encoded_doc, max_length):
  """Pad the encoded sequences to a common length.

  Args:
    encoded_doc: Sequences to pad.
    max_length: Target length, forwarded as `maxlen`.

  Returns:
    The result of pad_sequences with padding = "post".
  """
  padded = pad_sequences(encoded_doc, maxlen = max_length, padding = "post")
  return padded

In [15]:
# Pad every encoded sentence to the length of the longest cleaned sentence.
padded_doc = padding_doc(encoded_doc, max_length)

In [16]:
# Separate tokenizer for the intent labels. Compared with create_tokenizer's
# default filter set, '.' and '_' are left out here, so labels containing those
# characters are not split on them. The filter string is a raw string so its
# backslash is a literal character instead of an invalid "\]" escape sequence.
output_tokenizer = create_tokenizer(unique_intent, filters = r'!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [17]:
# Encode the intent label of every row with the label tokenizer.
encoded_output = encoding_doc(output_tokenizer, intent)

In [18]:
# Reshape into a single column, one row per sample.
# NOTE(review): this assumes every label was encoded to exactly one id -- confirm.
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [19]:
def one_hot(encode):
  """One-hot encode `encode` and return a dense (non-sparse) result.

  Args:
    encode: 2-D array-like of category ids, one row per sample.

  Returns:
    The dense output of OneHotEncoder.fit_transform(encode).
  """
  try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output = False)
  except TypeError:
    # Older releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse = False)
  return encoder.fit_transform(encode)

In [20]:
# One-hot encode the label ids to use as training targets.
output_one_hot = one_hot(encoded_output)

In [21]:
# Hold out 20% of the samples for validation. random_state is fixed so that
# Restart & Run All reproduces the same split.
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2, random_state = 42)

In [22]:
def create_model(vocab_size, max_length, num_classes = 21):
  """Build the (uncompiled) bidirectional-LSTM intent classifier.

  Args:
    vocab_size: First argument of the Embedding layer (vocabulary size).
    max_length: Passed to the Embedding layer as input_length.
    num_classes: Number of units of the softmax output layer. Defaults to 21,
      the value that was previously hard-coded.

  Returns:
    The Sequential model; the caller is responsible for compiling it.
  """
  model = Sequential()
  # NOTE(review): trainable = False freezes the embedding weights, and this
  # function loads no pretrained weights into the layer -- confirm that a
  # frozen embedding is intended.
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes, activation = "softmax"))

  return model

In [23]:
# Build the classifier for the fitted vocabulary and padded sequence length.
model = create_model(vocab_size, max_length)

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [24]:
# Save the model to model.h5 whenever the validation loss reaches a new minimum.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Train for 100 epochs, validating on the held-out split after each epoch.
hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint]
)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.87731, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.87731 to 2.84760, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.84760 to 2.79055, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.79055 to 2.74766, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.74766 to 2.59301, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 2.59301 to 2.46743, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 2.46743 to 2.40444, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 2.40444 to 2.35601, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 2.35601 to 2.32324, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss did not improve from 2.32324
Epoch 11/100

Epoch 00011: val_loss improved from 2.32324 to 2.18328, saving model to model.h

In [25]:
# Evaluate the in-memory model (as left by fit) on both splits.
# NOTE(review): the model was compiled with metrics = ["accuracy"], so evaluate
# presumably returns [loss, accuracy] rather than a bare accuracy -- confirm
# before treating these variables as accuracies.
train_acc = model.evaluate(train_X, train_Y, verbose=1)
val_acc = model.evaluate(val_X, val_Y, verbose=1)



In [26]:
# Replace the in-memory model with the one stored in model.h5, the file the
# training checkpoint callback writes to.
model = load_model("model.h5")
def predictions(text):
  """Return the model's output for a single input sentence.

  The sentence is preprocessed with cleaning(), i.e. exactly the steps applied
  to the training sentences, then encoded with the module-level word_tokenizer,
  padded to max_length and passed to model.predict. The cleaned tokens are
  printed.

  Args:
    text: Raw input sentence.

  Returns:
    Whatever model.predict returns for the single padded sequence.
  """
  test_word = cleaning([text])[0]
  print(test_word)

  # Encode the whole sentence as one sequence, in the same list-of-token-lists
  # form used for the training corpus, instead of encoding word by word and
  # reshaping (which only works if every word maps to exactly one id).
  # NOTE(review): words missing from the fitted vocabulary are expected to be
  # skipped by texts_to_sequences -- confirm the tokenizer has no OOV token.
  test_ls = word_tokenizer.texts_to_sequences([test_word])

  x = padding_doc(test_ls, max_length)

  return model.predict(x)

In [27]:
def get_final_output(pred, classes):
  """Print the label of the highest-scoring class.

  Args:
    pred: 2-D array-like of scores; only the first row, pred[0], is used.
    classes: Sequence of labels, where classes[i] is the label for score i.

  Returns:
    None. The result is printed as "Class:  <label>". If several scores tie
    for the maximum, the label with the lowest index is printed.
  """
  scores = np.asarray(pred)[0]
  labels = np.asarray(classes)
  # argmax picks the top class directly; it also works when there is only one
  # class, where indexing the second-best score would raise IndexError.
  outputClass = labels[int(np.argmax(scores))]
  print("Class: ",outputClass)

In [28]:
# Read one sentence from the user, score it, and print the top-scoring intent.
text = input("Enter Sentence: ")
pred = predictions(text)
get_final_output(pred, unique_intent)

Enter Sentence: I need help.
['i', 'need', 'help']
Class:  assist
