# Importing the dataset and libraries



In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import spacy

from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Getting Overview of the dataset

In [3]:
df = pd.read_csv("bitext.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8175 entries, 0 to 8174
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   flags      8175 non-null   object
 1   utterance  8175 non-null   object
 2   category   8175 non-null   object
 3   intent     8175 non-null   object
dtypes: object(4)
memory usage: 255.6+ KB


> 8175 Rows and 4 columns with No Null values

In [5]:
df.head()

Unnamed: 0,flags,utterance,category,intent
0,BM,I have problems with canceling an order,ORDER,cancel_order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order
2,B,I need help with canceling the last order,ORDER,cancel_order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order
4,B,problem with cancelling an order I made,ORDER,cancel_order


# Data Cleaning and Preparation

In [6]:
# Function for cleaning the sentence and removing the stop words from the sentences

def preprocess_text(text):
  lemmatizer = WordNetLemmatizer()
  stop_words = set(stopwords.words('english'))

  text = text.lower()
  text = re.sub(r'[^a-zA-Z\s]', '', text)

  words = word_tokenize(text)

  words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

  return ' '.join(words)

In [7]:
# Adding a new column 'processed_utterance' that will contain the clean sentences

df['processed_utterance'] = df['utterance'].apply(preprocess_text)

In [8]:
df.head()

Unnamed: 0,flags,utterance,category,intent,processed_utterance
0,BM,I have problems with canceling an order,ORDER,cancel_order,problem canceling order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order,find information canceling order
2,B,I need help with canceling the last order,ORDER,cancel_order,need help canceling last order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order,could help cancelling last order made
4,B,problem with cancelling an order I made,ORDER,cancel_order,problem cancelling order made


In [9]:
# Removing unwanted columns from the dataset
df = df[['processed_utterance', 'intent']]

In [10]:
df.head()

Unnamed: 0,processed_utterance,intent
0,problem canceling order,cancel_order
1,find information canceling order,cancel_order
2,need help canceling last order,cancel_order
3,could help cancelling last order made,cancel_order
4,problem cancelling order made,cancel_order


> After cleaning the dataset, we have to encode our labels

In [11]:
# Using Label Encoder

le = LabelEncoder()
df['intent_encoded'] = le.fit_transform(df['intent'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['intent_encoded'] = le.fit_transform(df['intent'])


In [12]:
# Creating Training and Testing datasets.

X_train, X_test, y_train, y_test = train_test_split(df['processed_utterance'],
df['intent_encoded'], test_size=0.2, random_state=42)

# Model Selection

### 1. Linear Regression

In [13]:
# Creating a pipeline for Logistic Regression

model_lr = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression())
])

In [14]:
# Fitting the training data in model

model_lr.fit(X_train, y_train)

In [15]:
# Creating the prediction variable for Logistic Regression model

y_pred_lr = model_lr.predict(X_test)

In [16]:
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy: 0.9932721712538226
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        62
           1       1.00      1.00      1.00        70
           2       1.00      0.98      0.99        60
           3       1.00      0.98      0.99        66
           4       1.00      1.00      1.00        63
           5       1.00      0.97      0.99        68
           6       0.98      1.00      0.99        59
           7       1.00      1.00      1.00        52
           8       0.98      1.00      0.99        61
           9       0.98      1.00      0.99        57
          10       0.98      0.94      0.96        62
          11       0.96      1.00      0.98        53
          12       0.96      1.00      0.98        55
          13       0.98      1.00      0.99        49
          14       1.00      1.00      1.00        69
          15       0.99      1.00      0.99        72
          16       1.00     

> The logistic regression model achieved an impressive accuracy of `99.33%`, with near-perfect precision, recall, and F1-scores across all classes, demonstrating strong generalization and minimal misclassification.

### 2. Support Vector Machine

In [17]:
# Creating a pipeline for SVM

model_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear'))
])

In [18]:
# Fitting the training data in SVM model

model_svm.fit(X_train, y_train)

In [19]:
# Creating the prediction variable for SVM model

y_pred_svm = model_svm.predict(X_test)

In [20]:
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy: 0.9957186544342508
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        62
           1       1.00      1.00      1.00        70
           2       1.00      1.00      1.00        60
           3       1.00      0.98      0.99        66
           4       1.00      1.00      1.00        63
           5       0.99      0.99      0.99        68
           6       1.00      1.00      1.00        59
           7       1.00      1.00      1.00        52
           8       0.98      1.00      0.99        61
           9       0.98      1.00      0.99        57
          10       1.00      0.95      0.98        62
          11       0.98      1.00      0.99        53
          12       0.98      1.00      0.99        55
          13       1.00      1.00      1.00        49
          14       1.00      1.00      1.00        69
          15       1.00      1.00      1.00        72
          16       0.98     

> The model achieved `99.57%` accuracy with perfect precision and recall for most classes, indicating exceptional performance across all categories.

### 3. Naive Bayes

In [21]:
# Creating a pipeline for Naive Bayes

model_nb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [22]:
# Fitting the training data in Naive Bayes model

model_nb.fit(X_train, y_train)

In [23]:
# Creating the prediction variable for Naive Bayes model

y_pred_nb = model_nb.predict(X_test)

In [24]:
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Accuracy: 0.9920489296636086
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        62
           1       1.00      1.00      1.00        70
           2       1.00      0.98      0.99        60
           3       0.98      0.98      0.98        66
           4       1.00      0.97      0.98        63
           5       1.00      0.96      0.98        68
           6       0.98      0.98      0.98        59
           7       1.00      1.00      1.00        52
           8       0.97      1.00      0.98        61
           9       1.00      1.00      1.00        57
          10       1.00      0.95      0.98        62
          11       1.00      1.00      1.00        53
          12       0.96      1.00      0.98        55
          13       1.00      1.00      1.00        49
          14       1.00      1.00      1.00        69
          15       0.97      1.00      0.99        72
          16       1.00     

> The Naive Bayes model achieved `99.20` accuracy** with high precision and recall across most classes, indicating exceptional classification performance.

### 4. Decision Tree Classifier

In [25]:
# Creating a pipeline for Decision Tree Classifier

model_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier(random_state=42))
])

In [26]:
# Fitting the training data in Decision Tree model

model_dt.fit(X_train, y_train)

In [27]:
# Creating the prediction variable for Decision Tree model

y_pred_dt = model_dt.predict(X_test)

In [28]:
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

Accuracy: 0.980428134556575
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        62
           1       1.00      0.99      0.99        70
           2       1.00      0.98      0.99        60
           3       1.00      0.95      0.98        66
           4       1.00      1.00      1.00        63
           5       1.00      0.96      0.98        68
           6       0.98      0.95      0.97        59
           7       1.00      1.00      1.00        52
           8       0.98      1.00      0.99        61
           9       0.85      1.00      0.92        57
          10       1.00      0.84      0.91        62
          11       0.90      1.00      0.95        53
          12       0.96      1.00      0.98        55
          13       0.98      1.00      0.99        49
          14       0.96      0.97      0.96        69
          15       1.00      1.00      1.00        72
          16       0.94      

> The Decision Tree model achieved `98.04%` accuracy, with strong precision and recall across most classes, indicating effective classification performance.

### 5. Random Forest

In [29]:
# Creating a pipeline for Random Forest Classifier

model_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

In [30]:
# Fitting the training data in Random Forest model

model_rf.fit(X_train, y_train)

In [31]:
# Creating the prediction variable for Random Forest model

y_pred_rf = model_rf.predict(X_test)

In [32]:
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.9902140672782874
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        62
           1       1.00      0.99      0.99        70
           2       1.00      1.00      1.00        60
           3       1.00      1.00      1.00        66
           4       1.00      1.00      1.00        63
           5       1.00      0.97      0.99        68
           6       0.98      1.00      0.99        59
           7       1.00      1.00      1.00        52
           8       0.98      1.00      0.99        61
           9       0.97      1.00      0.98        57
          10       1.00      0.87      0.93        62
          11       0.95      1.00      0.97        53
          12       0.96      1.00      0.98        55
          13       1.00      1.00      1.00        49
          14       0.99      1.00      0.99        69
          15       1.00      1.00      1.00        72
          16       0.98     

> The Random Forest model achieved `99.02%` accuracy, demonstrating strong classification performance with high precision and recall across most classes.

### 6. K Nearest Neighbor

In [33]:
# Creating a pipeline for KNN Classifier

model_knn = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

In [34]:
# Fitting the training data in KNN model

model_knn.fit(X_train, y_train)

In [35]:
# Creating the prediction variable for KNN model

y_pred_knn = model_knn.predict(X_test)

In [36]:
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))

Accuracy: 0.9773700305810398
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98        62
           1       0.99      0.99      0.99        70
           2       0.98      0.95      0.97        60
           3       1.00      0.98      0.99        66
           4       0.95      0.86      0.90        63
           5       1.00      0.94      0.97        68
           6       0.97      0.98      0.97        59
           7       1.00      1.00      1.00        52
           8       1.00      0.97      0.98        61
           9       0.97      1.00      0.98        57
          10       0.98      0.97      0.98        62
          11       1.00      0.98      0.99        53
          12       0.93      0.96      0.95        55
          13       0.94      0.98      0.96        49
          14       0.95      1.00      0.97        69
          15       0.88      0.96      0.92        72
          16       0.98     

> The K-Nearest Neighbors model achieved an accuracy of `97.74%`, demonstrating strong performance with high precision and recall across most classes.

**We will go ahead with The Support Vector Machine as it is the best performing model in our dataset**

# Building ChatBot API

In [41]:
class ChatBot:

    def __init__(self):
        self.activeConcern = ""
        self.requiredEntity = ""

        self.responses = {
            'cancel_order': "I see you're having trouble canceling your order. Let me assist you with that.",
            'complaint': "I'm sorry to hear that you have a complaint. Can you provide more details?",
            'contact_customer_service': "You can reach our customer service at [contact number]. How else can I assist you?",
            'contact_human_agent': "I will connect you with a human agent shortly. Please hold on.",
            'create_account': "To create an account, please provide your email and a password.",
            'change_order': "What would you like to change in your order?",
            'change_shipping_address': "Please provide your new shipping address so I can update it.",
            'check_cancellation_fee': "Let me check the cancellation fee for your order. Please hold on.",
            'check_invoices': "You can view your invoices in your account section. Would you like help finding it?",
            'check_payment_methods': "We accept various payment methods including credit cards, PayPal, and more.",
            'check_refund_policy': "Our refund policy allows you to request a refund within 30 days of purchase.",
            'delete_account': "Are you sure you want to delete your account? This action cannot be undone.",
            'delivery_options': "We offer several delivery options. Would you like to know more about them?",
            'delivery_period': "The delivery period depends on your location and the shipping method selected.",
            'edit_account': "To edit your account, please log in and go to the account settings page.",
            'get_invoice': "I can help you get your invoice. Please provide the order number.",
            'get_refund': "To initiate a refund, please provide your order details.",
            'newsletter_subscription': "You can subscribe or unsubscribe to our newsletter in your account settings.",
            'payment_issue': "I’m here to help with payment issues. Can you provide more details?",
            'place_order': "I can help you place an order. What would you like to purchase?",
            'recover_password': "Please provide your email to receive instructions for recovering your password.",
            'registration_problems': "I'm here to help with registration problems. What issue are you experiencing?",
            'review': "Would you like to leave a review for a product? Please provide the details.",
            'set_up_shipping_address': "I can assist you in setting up your shipping address. Please provide the address.",
            'switch_account': "To switch accounts, please log out and log in with the other account.",
            'track_order': "Please provide your order number, and I'll help you track your order.",
            'track_refund': "I can help you track your refund. Please provide the refund request number."
        }

    def extract_entities(self, text):
        doc = nlp(text)
        entities = {ent.label_: ent.text for ent in doc.ents}
        return entities

    def chatbot_response(self):

        # Preprocess the user query
        text = input("You: ")
        processed_query = preprocess_text(text)

        # Predict intent
        intent_encoded = model_svm.predict([processed_query])[0]
        intent = le.inverse_transform([intent_encoded])[0]
        self.activeConcern = intent

        if self.requiredEntity == "":
          print(f'Bot: {self.responses[self.activeConcern]}')
          print('Bot: Kindly provide me the details.')
          self.requiredEntity = input("You: ")
          print('Bot: The job is done. Let me know if I can help you with something else.')
          self.requiredEntity = ""
        else:
          print('Bot: The job is done. Let me know if I can help you with something else.')
          self.requiredEntity = ""


        # Fetch response based on the intent
        response = self.responses.get(intent, "Bot: Sorry, I didn't understand that. Can you please clarify?")




In [42]:
AIMODEL = ChatBot()

In [43]:
AIMODEL.chatbot_response()

You: Could you change my shipping address?
Bot: Please provide your new shipping address so I can update it.
Bot: Kindly provide me the details.
You: B34 Medows CHS
Bot: The job is done. Let me know if I can help you with something else.
