In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

import nltk
try:
    nltk.data.find('tokenizers/punkt')
except:
    nltk.download('punkt')

In [2]:
df = pd.read_csv("/kaggle/input/training-dataset-for-chatbotsvirtual-assistants/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv")
df.head()

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account


In [3]:
len(df.intent.value_counts())

27

# Without preprocessing

In [4]:
label_intent = preprocessing.LabelEncoder()
df['label_num'] = label_intent.fit_transform(df.intent)
df.head()

Unnamed: 0,flags,utterance,category,intent,label_num
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account,10
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account,10
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account,10
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account,10
4,BLQC,"i want an online account, create one",ACCOUNT,create_account,10


In [5]:
label_intent.__dict__

{'classes_': array(['cancel_order', 'change_order', 'change_shipping_address',
        'check_cancellation_fee', 'check_invoices',
        'check_payment_methods', 'check_refund_policy', 'complaint',
        'contact_customer_service', 'contact_human_agent',
        'create_account', 'delete_account', 'delivery_options',
        'delivery_period', 'edit_account', 'get_invoice', 'get_refund',
        'newsletter_subscription', 'payment_issue', 'place_order',
        'recover_password', 'registration_problems', 'review',
        'set_up_shipping_address', 'switch_account', 'track_order',
        'track_refund'], dtype=object)}

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.utterance, df.label_num, test_size = 0.2, random_state= 2022, stratify = df.label_num)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (17227,)
Shape of X_test:  (4307,)


In [7]:
clf_tfid = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('LogisticRegress', LogisticRegression(C=1.0, penalty='l2', max_iter=5))         
])
clf_tfid.fit(X_train, y_train)


y_pred = clf_tfid.predict(X_test)


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.91      0.78      0.84       185
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00        72
           4       0.85      0.69      0.76       203
           5       0.00      0.00      0.00        54
           6       0.00      0.00      0.00        96
           7       1.00      0.86      0.92       149
           8       0.80      1.00      0.89       411
           9       1.00      1.00      1.00       205
          10       0.80      1.00      0.89       424
          11       1.00      0.89      0.94       183
          12       0.00      0.00      0.00        72
          13       0.00      0.00      0.00        28
          14       0.00      0.00      0.00        27
          15       0.53      1.00      0.70       286
          16       0.70      0.98      0.81       230
          17       0.00    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.feature_extraction.text import CountVectorizer


clf_bow = Pipeline([
     ('vectorizer_tfidf',CountVectorizer()),    
     ('LogisticRegress', LogisticRegression(C=1.0, penalty='l2', max_iter=5))         
])
clf_bow.fit(X_train, y_train)


y_pred = clf_bow.predict(X_test)


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.80      0.98      0.88       185
           2       0.00      0.00      0.00        22
           3       1.00      0.74      0.85        72
           4       0.84      0.98      0.91       203
           5       0.00      0.00      0.00        54
           6       1.00      0.96      0.98        96
           7       1.00      0.99      0.99       149
           8       0.94      1.00      0.97       411
           9       0.95      0.99      0.97       205
          10       0.86      1.00      0.92       424
          11       0.86      1.00      0.92       183
          12       0.92      0.97      0.95        72
          13       0.00      0.00      0.00        28
          14       0.00      0.00      0.00        27
          15       0.91      1.00      0.95       286
          16       0.94      0.97      0.96       230
          17       0.00    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
