<a href="https://colab.research.google.com/github/RehanKhn/Boosting-Buddy-A-Conversational-Agent-for-Improvement-of-Mental-Health/blob/main/IntentClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
nltk.download("stopwords")
nltk.download("punkt")
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Authenticate the Colab user, then build a PyDrive client from the
# application-default Google credentials; `drive` is used by the next cell
# to fetch the dataset file.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Reference the Drive file by its hard-coded id and save its content locally
# as 'Dataset.csv' (the path read by load_dataset below).
downloaded = drive.CreateFile({'id':'1zimqy02if1n1DwDTPGMkzZyHsJsYIeqI'}) 
downloaded.GetContentFile('Dataset.csv')

In [None]:
def load_dataset(filename):
  """Read the two-column intent CSV and split it into labels and sentences.

  The file is decoded as latin1 and its columns are named "Sentence" and
  "Intent". Because the names are supplied explicitly, the first row of the
  file is treated as data, not as a header.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A tuple (intent, unique_intent, sentences) where
      intent is the "Intent" column as a pandas Series (one label per row),
      unique_intent is the list of distinct labels in first-appearance order,
      sentences is the "Sentence" column as a plain list.
  """
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  intent = df["Intent"]
  # dict.fromkeys de-duplicates while keeping first-appearance order.
  # list(set(...)) gave an order that can differ between interpreter sessions
  # (string hashing is randomized), silently re-numbering the classes and
  # invalidating any model reloaded from disk in a later session.
  unique_intent = list(dict.fromkeys(intent))
  sentences = list(df["Sentence"])
  return (intent, unique_intent, sentences)

In [None]:
# Load the per-row labels, the distinct label list and the raw sentences
# from the downloaded CSV.
intent, unique_intent, sentences = load_dataset("Dataset.csv")

In [None]:
# NOTE(review): no code visible in this notebook references `stemmer`;
# cleaning() lower-cases tokens but does not stem them. Confirm whether
# stemming was meant to be applied.
stemmer = LancasterStemmer()

In [None]:
def cleaning(sentences):
  """Tokenize each sentence into lower-cased alphanumeric words.

  Every character other than an ASCII letter, a digit or a space is replaced
  by a space before the sentence is handed to word_tokenize.

  Args:
    sentences: Iterable of sentence strings.

  Returns:
    A list holding one list of lower-cased tokens per input sentence.
  """
  def _tokens(sentence):
    # Strip punctuation / symbols, then split into words.
    stripped = re.sub(r'[^ a-z A-Z 0-9]', " ", sentence)
    return [token.lower() for token in word_tokenize(stripped)]

  # No stemming is applied to the tokens.
  return [_tokens(sentence) for sentence in sentences]

In [None]:
# One list of lower-cased tokens per sentence in the dataset.
cleaned_words = cleaning(sentences)

In [None]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'):
  """Create a Keras Tokenizer and fit it on `words`.

  Args:
    words: Texts (strings or lists of tokens) to fit the vocabulary on.
    filters: Characters the Tokenizer strips from string inputs. The default
      value is unchanged; the backslash is now written as an explicit escape
      because the bare "\\]" form is an invalid escape sequence that newer
      Python versions warn about.

  Returns:
    The fitted Tokenizer.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  """Return the length of the longest entry in `words`.

  Args:
    words: Non-empty iterable of sized items (e.g. lists of tokens).

  Returns:
    The largest len() found among the entries.
  """
  return max(len(entry) for entry in words)

In [None]:
# Fit the input tokenizer on the cleaned corpus. The +1 accounts for index 0,
# which the tokenizer never assigns to a word and which padding uses.
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
# Computed inline instead of calling the max_length() helper: this assignment
# rebinds the name `max_length` to an int, so a call to max_length(...) here
# raised "TypeError: 'int' object is not callable" whenever the cell was re-run.
max_length = len(max(cleaned_words, key = len))

In [None]:
def encoding_doc(token, words):
  """Encode texts as integer sequences with a fitted tokenizer.

  Args:
    token: Object exposing texts_to_sequences (a fitted Tokenizer).
    words: Texts to encode.

  Returns:
    Whatever token.texts_to_sequences(words) returns.
  """
  sequences = token.texts_to_sequences(words)
  return sequences

In [None]:
# Integer-encode every cleaned sentence with the input tokenizer.
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [None]:
def padding_doc(encoded_doc, max_length):
  """Pad integer sequences at the end up to a common length.

  Args:
    encoded_doc: Sequences of integer ids.
    max_length: Target length passed to pad_sequences as `maxlen`.

  Returns:
    The result of pad_sequences with padding = "post".
  """
  pad_options = {"maxlen": max_length, "padding": "post"}
  return pad_sequences(encoded_doc, **pad_options)

In [None]:
# Post-pad every encoded sentence to the length of the longest one.
padded_doc = padding_doc(encoded_doc, max_length)

In [None]:
# Label tokenizer. Compared with create_tokenizer's default filter set, "."
# and "_" are left out here, so they are not stripped from labels
# (presumably so labels containing them stay single tokens; verify against
# the dataset). The backslash is written as an explicit escape; the value is
# unchanged.
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\\]^`{|}~')

In [None]:
# Encode the label of every row with the label tokenizer.
encoded_output = encoding_doc(output_tokenizer, intent)

In [None]:
# Reshape to a single column (n_rows, 1) before one-hot encoding.
# NOTE(review): this assumes every label encodes to exactly one id; a label
# that tokenizes to zero or several ids would make the reshape fail.
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [None]:
def one_hot(encode):
  """One-hot encode integer class ids.

  Args:
    encode: 2-D array-like of class ids (the notebook passes one column).

  Returns:
    A dense NumPy array with one row per sample and one column per distinct
    value found in `encode`.
  """
  # The `sparse` keyword was renamed to `sparse_output` in newer scikit-learn
  # releases and the old spelling was later removed, so OneHotEncoder(sparse =
  # False) fails there. Using the default (sparse) encoder and densifying the
  # result gives the same array on every version.
  encoder = OneHotEncoder()
  return encoder.fit_transform(encode).toarray()

In [None]:
# One-hot matrix of the labels: one row per sentence.
output_one_hot = one_hot(encoded_output)

In [None]:
# 80/20 shuffled train/validation split. random_state is fixed so the split
# (and therefore the reported metrics) is reproducible across runs.
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2, random_state = 42)

In [None]:
def create_model(vocab_size, max_length, num_classes = 21):
  """Build the (uncompiled) bidirectional-LSTM intent classifier.

  Layer stack: Embedding(vocab_size, 128) -> Bidirectional(LSTM(128)) ->
  Dense(32, relu) -> Dropout(0.5) -> Dense(num_classes, softmax).

  Args:
    vocab_size: Size of the embedding vocabulary.
    max_length: Length of the padded input sequences.
    num_classes: Number of output classes. Defaults to 21, the value that
      was previously hard-coded in the output layer.

  Returns:
    The Sequential model (not compiled).
  """
  model = Sequential()
  # NOTE(review): the embedding is frozen (trainable = False) and no
  # pre-trained weights are passed in, so it keeps its initial values during
  # training. Confirm this is intended.
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes, activation = "softmax"))

  return model

In [None]:
# Build the classifier for the fitted vocabulary / padded length, then
# compile it for multi-class training on one-hot targets.
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [None]:
# Save the model to disk only when the validation loss improves, so
# 'model.h5' always holds the best epoch seen so far.
filename = 'model.h5'
checkpoint = ModelCheckpoint(
  filename,
  monitor='val_loss',
  verbose=1,
  save_best_only=True,
  mode='min',
)

# Train for 100 epochs, validating on the held-out split after each epoch.
hist = model.fit(
  train_X,
  train_Y,
  epochs = 100,
  batch_size = 32,
  validation_data = (val_X, val_Y),
  callbacks = [checkpoint],
)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.85028, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.85028 to 2.75603, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.75603 to 2.69385, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.69385 to 2.57277, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.57277 to 2.46831, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 2.46831 to 2.40370, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 2.40370 to 2.29871, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 2.29871 to 2.10369, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 2.10369 to 2.03839, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss improved from 2.03839 to 1.89878, saving model to model.h5
Epoch 11/100

Epoch 00011: val_loss improved from 1.89878 to 1.

In [None]:
# The model was compiled with a loss and one "accuracy" metric, so each
# evaluate() call returns [loss, accuracy]; despite their names, train_acc
# and val_acc hold both values.
train_acc = model.evaluate(train_X, train_Y, verbose=1)
val_acc = model.evaluate(val_X, val_Y, verbose=1)
print(train_acc)
print(val_acc)

[0.3277706503868103, 0.9134831428527832]
[0.8776749968528748, 0.7847533822059631]


In [None]:
# Replace the in-memory model with the checkpoint saved in "model.h5" (the
# epoch with the lowest validation loss, as written by ModelCheckpoint).
model = load_model("model.h5")
def predictions(text):
  """Predict intent-class scores for a single sentence.

  The sentence goes through the same pipeline as the training data:
  cleaning() for tokenization, the global `word_tokenizer` for integer
  encoding, post-padding to the global `max_length`, then the global `model`.

  Args:
    text: Raw input sentence.

  Returns:
    The output of model.predict for the single padded sequence.
  """
  # Reuse the training-time cleaning so both paths stay in sync.
  test_word = cleaning([text])[0]
  print(test_word)

  # Encode the token list as ONE document, exactly as the training corpus was
  # encoded (a list of token lists). The tokenizer skips words that are not
  # in its vocabulary, so the per-word encoding, empty-list filtering and
  # manual reshape used previously are not needed.
  test_ls = encoding_doc(word_tokenizer, [test_word])

  x = padding_doc(test_ls, max_length)

  return model.predict(x)

In [None]:
def get_final_output(pred, classes):
  """Print and return the label with the highest predicted score.

  Args:
    pred: 2-D prediction array; only the first row (pred[0]) is used.
    classes: Sequence of labels where classes[i] names output column i.

  Returns:
    The entry of `classes` at the position of the largest score in pred[0].
    (Previously nothing was returned; the label was only printed.)
  """
  scores = np.asarray(pred[0])
  # argmax picks the top class directly. The earlier sort / scan / np.where
  # sequence computed the same thing but started from predictions[1], which
  # raised IndexError when the model had a single output.
  outputClass = classes[int(np.argmax(scores))]
  print("Class: ",outputClass)
  return outputClass

In [None]:
# Interactive demo: read one sentence from the user, score it and print the
# predicted intent. Blocks on input(), so it stops an unattended Run All.
text = input("Enter Sentence: ")
pred = predictions(text)
get_final_output(pred, unique_intent)

In [None]:
# NOTE(review): `a` is not defined anywhere in the visible notebook, so this
# cell looks like leftover scratch input and would raise NameError on a fresh
# Run All. Confirm and delete the cell.
a