<a href="https://colab.research.google.com/github/SupriaBasak99/Multi-intent-classification-in-chatbots/blob/main/1_Naive_bayes_for_FYP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize 
import re
import sklearn
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
# Download the NLTK data packages this notebook relies on.
# "punkt" is presumably what word_tokenize needs at runtime.
# NOTE(review): no stopword filtering is visible in the cells shown — confirm
# the "stopwords" corpus is still required.
nltk.download("stopwords")
nltk.download("punkt")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def load_dataset(filename):
  """Read a two-column intent CSV and return its labels and sentences.

  The file is read as latin1 with the columns named "Intent" and "Sentence".
  If the file carries its own header line (a first row that spells out the
  column names, compared case-insensitively), that row is dropped so it is
  not treated as a training sample. Files without a header are left as-is.

  Args:
    filename: path (or any source accepted by pandas.read_csv) of the CSV.

  Returns:
    A tuple (intent, unique_intent, sentences) where intent is the "Intent"
    column as a pandas Series, unique_intent is a list of the distinct
    labels in order of first appearance, and sentences is the "Sentence"
    column as a plain list.
  """
  columns = ["Intent", "Sentence"]
  df = pd.read_csv(filename, encoding = "latin1", names = columns)

  # Passing names= makes pandas treat every line as data, so an existing
  # header line would otherwise show up as sample 0 with label "intent".
  if len(df) > 0:
    first_row = [str(value).strip().lower() for value in df.iloc[0]]
    if first_row == [name.lower() for name in columns]:
      df = df.iloc[1:].reset_index(drop = True)

  print(df.head())
  intent = df["Intent"]
  # dict.fromkeys de-duplicates while keeping a stable, reproducible order
  # (set iteration order for strings varies between interpreter runs).
  unique_intent = list(dict.fromkeys(intent))
  sentences = list(df["Sentence"])

  return (intent, unique_intent, sentences)
  

In [None]:
# Load the ATIS training intents from the mounted Drive folder.
intent, unique_intent, sentences = load_dataset("/content/drive/MyDrive/atis_intents_train.csv") 

             Intent                                           Sentence
0            intent                                           sentence
1       atis_flight   what flights are available from pittsburgh to...
2  atis_flight_time   what is the arrival time in san francisco for...
3      atis_airfare            cheapest airfare from tacoma to orlando
4      atis_airfare   round trip fares from pittsburgh to philadelp...


In [None]:
# Peek at the first five distinct intent labels.
print(unique_intent[:5])

['atis_flight', 'atis_quantity', 'atis_aircraft', 'atis_ground_service', 'atis_flight_time']


In [None]:
def cleaning(sentences):
  """Turn raw sentences into lists of lower-cased word tokens.

  Every character that is not an ASCII letter, a digit or a space is first
  replaced by a space; the result is split with NLTK's word_tokenize and
  each token is lower-cased. No stemming or stopword removal is applied.

  Args:
    sentences: iterable of strings.

  Returns:
    A list with one list of tokens per input sentence, in input order.
  """
  def _tokens(sentence):
    # Blank out punctuation and other symbols before tokenising.
    stripped = re.sub(r'[^ a-z A-Z 0-9]', " ", sentence)
    return [token.lower() for token in word_tokenize(stripped)]

  return [_tokens(sentence) for sentence in sentences]

In [None]:
# Tokenise every sentence, then show how many were processed and the first two
# token lists as a quick sanity check.
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2]) 

4834
[['sentence'], ['what', 'flights', 'are', 'available', 'from', 'pittsburgh', 'to', 'baltimore', 'on', 'thursday', 'morning']]


In [None]:
# Hold out 10% of the samples for validation. The seed is fixed so the split,
# and therefore every score reported below, is reproducible across re-runs.
train_X, val_X, train_Y, val_Y = train_test_split(sentences, intent, shuffle = True, test_size = 0.1, random_state = 42)

In [None]:
def create_tokenizer(words, filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  """Build a Keras Tokenizer and fit it on ``words``.

  Args:
    words: the texts passed to Tokenizer.fit_on_texts.
    filters: characters the tokenizer strips from the input. The default is
      written as a raw string so the backslash is a literal character rather
      than the start of an (invalid) ``\\]`` escape sequence.

  Returns:
    The fitted Tokenizer instance.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  """Return the length of the longest element of ``words``.

  Args:
    words: iterable of sized items (e.g. token lists or strings).

  Returns:
    The largest ``len(item)`` found, or 0 when ``words`` is empty (instead
    of raising ValueError as a bare ``max`` would).
  """
  return max((len(item) for item in words), default = 0)

In [None]:
# train_Y is a pandas Series that keeps its original row labels after the
# shuffle, so train_Y[0] would look up the row *labelled* 0 (which may not even
# be in the training split). Use positional access to pair it with train_X[0].
print("Data: ", train_X[0])
print("Target: ", train_Y.iloc[0])

Data:   i want to fly from boston to san francisco
Target:  intent


In [None]:
def create_model_binomial_nb():
  """Train a Bernoulli naive Bayes classifier and print validation scores.

  Uses the notebook-level train_X / train_Y for fitting and val_X / val_Y for
  evaluation. Sentences are converted to bag-of-words counts with a
  CountVectorizer fitted on the training sentences only.

  Prints accuracy plus macro, micro and weighted F1 on the validation split.

  Returns:
    The fitted BernoulliNB model. The vectorizer is local to this function
    and is not returned.
  """
  vectorizer = CountVectorizer()
  train_counts = vectorizer.fit_transform(train_X)

  model = BernoulliNB().fit(train_counts, train_Y)
  preds = model.predict(vectorizer.transform(val_X))

  print("Accuracy: ", accuracy_score(val_Y, preds))
  for label, mode in (("Macro F1 Score:", 'macro'), ("Micro F1 Score:", 'micro')):
    print(label, f1_score(val_Y, preds, average=mode))
  # The last metric is followed by a blank line to separate model reports.
  print("Weighted F1 Score:", f1_score(val_Y, preds, average='weighted'), "\n")
  return model

In [None]:
def create_model_multinomial_nb():
  """Train a multinomial naive Bayes classifier and print validation scores.

  Uses the notebook-level train_X / train_Y for fitting and val_X / val_Y for
  evaluation. Sentences are converted to bag-of-words counts with a
  CountVectorizer fitted on the training sentences only.

  Prints accuracy plus macro, micro and weighted F1 on the validation split.

  Returns:
    The fitted MultinomialNB model. The vectorizer is local to this function
    and is not returned.
  """
  vectorizer = CountVectorizer()
  train_counts = vectorizer.fit_transform(train_X)

  model = MultinomialNB().fit(train_counts, train_Y)
  preds = model.predict(vectorizer.transform(val_X))

  print("Accuracy: ", accuracy_score(val_Y, preds))
  for label, mode in (("Macro F1 Score:", 'macro'), ("Micro F1 Score:", 'micro')):
    print(label, f1_score(val_Y, preds, average=mode))
  # The last metric is followed by a blank line to separate model reports.
  print("Weighted F1 Score:", f1_score(val_Y, preds, average='weighted'), "\n")
  return model

In [None]:
def create_model_gaussian_nb():
  """Train a Gaussian naive Bayes classifier and print validation scores.

  Uses the notebook-level train_X / train_Y for fitting and val_X / val_Y for
  evaluation. Sentences are converted to bag-of-words counts with a
  CountVectorizer fitted on the training sentences only; the count matrices
  are converted to dense arrays before being handed to GaussianNB.

  Prints accuracy plus macro, micro and weighted F1 on the validation split.

  Returns:
    The fitted GaussianNB model. The vectorizer is local to this function
    and is not returned.
  """
  vectorizer = CountVectorizer()
  train_dense = vectorizer.fit_transform(train_X).toarray()

  model = GaussianNB().fit(train_dense, train_Y)
  val_dense = vectorizer.transform(val_X).toarray()
  preds = model.predict(val_dense)

  print("Accuracy: ", accuracy_score(val_Y, preds))
  for label, mode in (("Macro F1 Score:", 'macro'), ("Micro F1 Score:", 'micro')):
    print(label, f1_score(val_Y, preds, average=mode))
  # The last metric is followed by a blank line to separate model reports.
  print("Weighted F1 Score:", f1_score(val_Y, preds, average='weighted'), "\n")
  return model

In [None]:
# Train and evaluate the three naive Bayes variants in turn. `model` is
# rebound after each call, so only the Gaussian model remains bound afterwards.
#
# NOTE(review): the disabled predict calls below pass a raw sentence to
# model.predict, but the models are fitted on CountVectorizer output and the
# create_* functions do not return their vectorizer, so these calls cannot work
# as written.
print("Binomial Naive Bayes\n")
model = create_model_binomial_nb()
# Predicting the first entry (disabled)
#print(model.predict(np.reshape(val_X[0], (1, len(val_X[0])))))
print("Multinomial Naive Bayes\n")
model = create_model_multinomial_nb()
# Predicting the first entry (disabled)
#print(model.predict(np.reshape(val_X[0], (1, len(val_X[0])))))
print("Gaussian Naive Bayes\n")
model = create_model_gaussian_nb()
# Predicting the first entry (disabled)
#print(model.predict(np.reshape(val_X[0], (1, len(val_X[0])))))


Binomial Naive Bayes

Accuracy:  0.9070247933884298
Macro F1 Score: 0.6572863189469681
Micro F1 Score: 0.9070247933884298
Weighted F1 Score: 0.8995287824270262 

Multinomial Naive Bayes

Accuracy:  0.9111570247933884
Macro F1 Score: 0.683132178667893
Micro F1 Score: 0.9111570247933884
Weighted F1 Score: 0.9041409905193848 

Gaussian Naive Bayes

Accuracy:  0.4669421487603306
Macro F1 Score: 0.4159144167761917
Micro F1 Score: 0.4669421487603306
Weighted F1 Score: 0.5292731465841312 

