In [1]:

import os

import joblib
import nltk
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Check that the NLTK "punkt" tokenizer data can be found locally;
# download it only when the lookup fails.
_punkt_resource = 'tokenizers/punkt'
try:
    nltk.data.find(_punkt_resource)
except LookupError:
    # Resource not found on this machine: fetch it from the NLTK index.
    nltk.download('punkt')

In [2]:
# Location of the training CSV.
# Set the CHATBOT_DATA_CSV environment variable to point at your own copy of the
# dataset; when it is unset, the original author's local path is used as a fallback
# so existing behaviour is unchanged.
DATA_CSV_PATH = os.environ.get(
    "CHATBOT_DATA_CSV",
    r"C:\Users\hp\Downloads\data for projects\Sample_Customer_Service_Training_Dataset_chatbot.csv",
)
df = pd.read_csv(DATA_CSV_PATH)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,flags,utterance,category,intent
0,BM,I have problems with canceling an order,ORDER,cancel_order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order
2,B,I need help with canceling the last order,ORDER,cancel_order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order
4,B,problem with cancelling an order I made,ORDER,cancel_order


In [4]:
# Number of distinct intent labels in the dataset (missing values are not counted).
df["intent"].nunique()

27

In [5]:
# Learn the intent vocabulary first, then store each row's integer code
# in a new `label_num` column alongside the original string label.
label_intent = preprocessing.LabelEncoder()
label_intent.fit(df["intent"])
df["label_num"] = label_intent.transform(df["intent"])
df.head(n=5)

Unnamed: 0,flags,utterance,category,intent,label_num
0,BM,I have problems with canceling an order,ORDER,cancel_order,0
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order,0
2,B,I need help with canceling the last order,ORDER,cancel_order,0
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order,0
4,B,problem with cancelling an order I made,ORDER,cancel_order,0


In [6]:
# Show the fitted encoder's attributes; `classes_` holds the learned intent names.
vars(label_intent)

{'classes_': array(['cancel_order', 'change_order', 'change_shipping_address',
        'check_cancellation_fee', 'check_invoice', 'check_payment_methods',
        'check_refund_policy', 'complaint', 'contact_customer_service',
        'contact_human_agent', 'create_account', 'delete_account',
        'delivery_options', 'delivery_period', 'edit_account',
        'get_invoice', 'get_refund', 'newsletter_subscription',
        'payment_issue', 'place_order', 'recover_password',
        'registration_problems', 'review', 'set_up_shipping_address',
        'switch_account', 'track_order', 'track_refund'], dtype=object)}

In [7]:
# Hold out 20% of the utterances for evaluation, stratifying on the encoded
# intent and fixing the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["utterance"],
    df["label_num"],
    test_size=0.2,
    stratify=df["label_num"],
    random_state=2022,
)
for caption, split_part in (("Shape of X_train: ", X_train), ("Shape of X_test: ", X_test)):
    print(caption, split_part.shape)

Shape of X_train:  (6540,)
Shape of X_test:  (1635,)


In [8]:
# TF-IDF features feeding an L2-penalised logistic regression.
tfidf_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('LogisticRegress', LogisticRegression(penalty='l2', C=1.0, max_iter=100)),
]
clf_tfid = Pipeline(steps=tfidf_steps)

# Fit on the training utterances, predict the held-out ones, and print
# per-class precision / recall / F1 (classes appear as their label_num codes).
y_pred = clf_tfid.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97        61
           1       1.00      0.95      0.97        61
           2       1.00      1.00      1.00        59
           3       0.98      1.00      0.99        60
           4       1.00      1.00      1.00        65
           5       1.00      0.98      0.99        59
           6       0.98      0.98      0.98        60
           7       1.00      1.00      1.00        60
           8       0.98      1.00      0.99        60
           9       1.00      1.00      1.00        59
          10       0.97      0.93      0.95        60
          11       0.98      1.00      0.99        60
          12       0.98      1.00      0.99        60
          13       1.00      1.00      1.00        60
          14       1.00      0.97      0.98        59
          15       1.00      1.00      1.00        65
          16       1.00      0.97      0.98        59
          17       1.00    

In [9]:
# Bag-of-words variant: same logistic-regression settings as the TF-IDF
# pipeline, but with raw token counts as features.
clf_bow = Pipeline([
    # NOTE(review): the step key says "tfidf" although this is a CountVectorizer.
    # It is kept unchanged in case other cells look the step up by name;
    # consider renaming it once all usages are checked.
    ('vectorizer_tfidf', CountVectorizer()),
    ('LogisticRegress', LogisticRegression(C=1.0, penalty='l2', max_iter=100)),
])
clf_bow.fit(X_train, y_train)

# Overwrites `y_pred` with this model's predictions, then prints the
# per-class report (classes appear as their label_num codes).
y_pred = clf_bow.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97        61
           1       1.00      0.93      0.97        61
           2       1.00      1.00      1.00        59
           3       0.97      1.00      0.98        60
           4       1.00      1.00      1.00        65
           5       1.00      1.00      1.00        59
           6       1.00      0.98      0.99        60
           7       1.00      1.00      1.00        60
           8       0.98      1.00      0.99        60
           9       1.00      1.00      1.00        59
          10       0.97      0.95      0.96        60
          11       1.00      1.00      1.00        60
          12       1.00      1.00      1.00        60
          13       1.00      1.00      1.00        60
          14       1.00      1.00      1.00        59
          15       1.00      1.00      1.00        65
          16       1.00      0.98      0.99        59
          17       1.00    