# DSE Dissertation

## Topic Modelling

#### Load Packages

In [59]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import seaborn as sn

#### Load Dataset

In [60]:
"""

Loading dataset

"""

class UtteranceExample:
    def __init__(self, text, label, do_lower_case):
        self.original_text = text
        self.text = text
        self.label = label

        if do_lower_case:
            self.text = self.text.lower()

    def to_dict(self):
      return {
          'utterance': self.original_text,
          'intent': self.label,
      }
        
def load_utterance_examples(file_path, do_lower_case=True):
    examples = []

    with open('{}/seq.in'.format(file_path), 'r', encoding="utf-8") as f_text, open('{}/label'.format(file_path), 'r', encoding="utf-8") as f_label:
        for text, label in zip(f_text, f_label):
            e = UtteranceExample(text.strip(), label.strip(), do_lower_case)
            examples.append(e)

    return examples



In [61]:
data = load_utterance_examples("/content/sample_data")

utterances= pd.DataFrame.from_records([d.to_dict() for d in data])

print(utterances)

                                              utterance           intent
0                        i am still waiting on my card?     card_arrival
1     what can i do if my card still hasn't arrived ...     card_arrival
2     i have been waiting over a week. is the card s...     card_arrival
3     can i track my card while it is in the process...     card_arrival
4     how do i know if i will get my card, or if it ...     card_arrival
...                                                 ...              ...
8617             you provide support in what countries?  country_support
8618                 what countries are you supporting?  country_support
8619                what countries are getting support?  country_support
8620                     are cards available in the eu?  country_support
8621                   which countries are represented?  country_support

[8622 rows x 2 columns]


### Exploratory Data Analysis

In [62]:
utterances.shape

(8622, 2)

In [63]:
utterances.dropna(axis=0,inplace=True)#dropping na
utterances.shape

(8622, 2)

In [64]:
utterances.drop_duplicates(subset=['utterance'],inplace=True)#dropping duplicates
utterances.shape

(8618, 2)

In [65]:
type_counts_train = utterances['intent'].value_counts()

print(type_counts_train)

"""

THIS SHOWS THE SENTENCES PER INTENT ARE IMBALANCED.

TOTAL OF 77 INTENT CLASSES

"""

card_payment_fee_charged                            167
direct_debit_payment_not_recognised                 162
balance_not_updated_after_cheque_or_cash_deposit    162
wrong_amount_of_cash_received                       160
transaction_charged_twice                           158
                                                   ... 
lost_or_stolen_card                                  66
card_acceptance                                      47
card_swallowed                                       45
virtual_card_not_working                             32
contactless_not_working                              30
Name: intent, Length: 77, dtype: int64


'\n\nTHIS SHOWS THE SENTENCES PER INTENT ARE IMBALANCED.\n\nTOTAL OF 77 INTENT CLASSES\n\n'

In [66]:
utterances.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8618 entries, 0 to 8621
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   utterance  8618 non-null   object
 1   intent     8618 non-null   object
dtypes: object(2)
memory usage: 202.0+ KB


In [67]:
utterances.describe()

Unnamed: 0,utterance,intent
count,8618,8618
unique,8618,77
top,i am still waiting on my card?,card_payment_fee_charged
freq,1,167


In [68]:
"""

Few samples of the dataset.

"""

utterances.head()

Unnamed: 0,utterance,intent
0,i am still waiting on my card?,card_arrival
1,what can i do if my card still hasn't arrived ...,card_arrival
2,i have been waiting over a week. is the card s...,card_arrival
3,can i track my card while it is in the process...,card_arrival
4,"how do i know if i will get my card, or if it ...",card_arrival


In [69]:
utterances.groupby('intent').describe()

Unnamed: 0_level_0,utterance,utterance,utterance,utterance
Unnamed: 0_level_1,count,unique,top,freq
intent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Refund_not_showing_up,143,143,i don't see my refund money yet in my account....,1
activate_my_card,141,141,please help me with my card. it won't activate.,1
age_limit,92,92,how old do you need to be to use the banks ser...,1
apple_pay_or_google_pay,108,108,"hi, i have an apple watch. how do i use it to ...",1
atm_support,68,68,i can use this card at which atms?,1
...,...,...,...,...
virtual_card_not_working,32,32,why isn't my disposable virtual card working?,1
visa_or_mastercard,115,115,is there a way i can have a mastercard?,1
why_verify_identity,102,102,why do you have an identity check?,1
wrong_amount_of_cash_received,160,160,why did i only receive a partial amount of wha...,1


In [70]:
#Look for missing intents
utterances['intent'].isnull().sum()

0

### Data Pre-processing.
- Spelling correction.
- Remove Punctuation.
- Remove stop words.
- Tokenization.
- Lemmatization.


In [71]:
! pip install -U symspellpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [72]:
"""

Spelling correction using Symspell.


"""
from symspellpy import SymSpell
import pkg_resources
from itertools import islice

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# Print out first 5 elements to demonstrate that dictionary is
# successfully loaded
print(list(islice(sym_spell.bigrams.items(), 5)))
print(list(islice(sym_spell.words.items(), 5)))

# lookup suggestions for multi-word input strings (supports compound
# splitting & merging)
input_term = (
    "how do i loacte my card?"
)
# max edit distance per lookup (per single word, not per whole input string)
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# display suggestion term, edit distance, and term frequency
for suggestion in suggestions:
    print(suggestion)

def correction(input_term, max_edit_distance=2):
  suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
  #for suggestion in suggestions:
  #  print(suggestion.term)
  return suggestions[0].term

[('abcs of', 10956800), ('aaron and', 10721728), ('abbott and', 7861376), ('abbreviations and', 13518272), ('aberdeen and', 7347776)]
[('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698)]
how do i locate my card, 2, 0


In [73]:
# call the function

corrected_utterances = []

for t in utterances['utterance']:
    corrected_utterances.append(correction(t)) 

corrected_utterances[:5]
utterances['utterance'] = corrected_utterances
utterances.head()

Unnamed: 0,utterance,intent
0,i am still waiting on my card,card_arrival
1,what can i do if my card still hasn't arrived ...,card_arrival
2,i have been waiting over a week is the card st...,card_arrival
3,can i track my card while it is in the process...,card_arrival
4,how do i know if i will get my card or if it i...,card_arrival


In [74]:
from nltk.corpus import stopwords
import re

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [76]:
stop_words = set(stopwords.words('english')) 

def text_cleaner(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)   # removing special characters
    newString = re.sub('"','', newString)              # removing '"' 
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    # replacing slangs
    newString = re.sub(r"'s\b","",newString)    # replacing 's with "" '
    newString = re.sub("[^a-z]", " ", newString)    #only keeping letters
    newString = re.sub('[m]{2,}', 'mm', newString)    # "mmmmmm" -> "mm"
        
    tokens = [w for w in newString.split() if not w in stop_words]     # removing the stopwords

    long_words=[]
    for i in tokens:
        if len(i)>1:                               #removing short word
            long_words.append(i)   
    return (" ".join(long_words)).strip()

In [77]:
#call the function
cleaned_text = []
for t in utterances['utterance']:
    cleaned_text.append(text_cleaner(t)) 

cleaned_text[:5]  

['still waiting card',
 'card still arrived weeks',
 'waiting week card still coming',
 'track card process delivery',
 'know get card lost']

In [78]:
utterances['cleaned_utterance'] = cleaned_text
utterances.head()

Unnamed: 0,utterance,intent,cleaned_utterance
0,i am still waiting on my card,card_arrival,still waiting card
1,what can i do if my card still hasn't arrived ...,card_arrival,card still arrived weeks
2,i have been waiting over a week is the card st...,card_arrival,waiting week card still coming
3,can i track my card while it is in the process...,card_arrival,track card process delivery
4,how do i know if i will get my card or if it i...,card_arrival,know get card lost


In [79]:
#Drop empty rows

utterances.replace('', np.nan, inplace=True)
utterances.dropna(axis=0,inplace=True)

utterances.head()

Unnamed: 0,utterance,intent,cleaned_utterance
0,i am still waiting on my card,card_arrival,still waiting card
1,what can i do if my card still hasn't arrived ...,card_arrival,card still arrived weeks
2,i have been waiting over a week is the card st...,card_arrival,waiting week card still coming
3,can i track my card while it is in the process...,card_arrival,track card process delivery
4,how do i know if i will get my card or if it i...,card_arrival,know get card lost


In [80]:
#Lemmatization

nltk.download("wordnet")
nltk.download('omw-1.4')
wordnetLemmatizer = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [81]:
def lemma(tokenized_text):
  text = [wordnetLemmatizer.lemmatize(word) for word in tokenized_text]
  return (" ".join(text)).strip()

utterances['lemmatized_utterance'] = utterances['cleaned_utterance'].apply(lambda x: lemma(x.split(" ")))
utterances.head()

Unnamed: 0,utterance,intent,cleaned_utterance,lemmatized_utterance
0,i am still waiting on my card,card_arrival,still waiting card,still waiting card
1,what can i do if my card still hasn't arrived ...,card_arrival,card still arrived weeks,card still arrived week
2,i have been waiting over a week is the card st...,card_arrival,waiting week card still coming,waiting week card still coming
3,can i track my card while it is in the process...,card_arrival,track card process delivery,track card process delivery
4,how do i know if i will get my card or if it i...,card_arrival,know get card lost,know get card lost


### Converting each utterance into a Vector

In [82]:
"""
Now convert each message into a vector that machine learning models can understand.

"""

#CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer().fit(utterances['lemmatized_utterance'])

print(len(vectorizer.vocabulary_))


1811


In [83]:
# n-gram vectorizer
ngram_vectorizer = CountVectorizer(ngram_range=(2,2)).fit(utterances['lemmatized_utterance'])
print(len(ngram_vectorizer.vocabulary_))


14750


###Model Building using tf-idf (term frequency - inverse document frequency) features


In [84]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Training TfidfVectorizer

tfidfVectorizer = TfidfVectorizer()
X = tfidfVectorizer.fit_transform(utterances['lemmatized_utterance'])
X = np.array(X.todense())

y = utterances['intent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)


In [85]:
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Using MultiNomialNB (Naive Bayes) Model for Classification

In [86]:
from sklearn.naive_bayes import MultinomialNB

intent_classification_model = MultinomialNB().fit(X_train, y_train)

### Model Testing

In [87]:
y_pred = intent_classification_model.predict(X_test)

y_pred

array(['pending_transfer', 'pending_transfer', 'compromised_card', ...,
       'declined_cash_withdrawal', 'request_refund',
       'direct_debit_payment_not_recognised'], dtype='<U48')

### Accuracy of the model

In [88]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average="macro"))

print (classification_report(y_test, y_pred))

Accuracy: 0.7848027842227379
F1 score: 0.7497713128916921
                                                  precision    recall  f1-score   support

                           Refund_not_showing_up       0.93      0.90      0.91        29
                                activate_my_card       0.70      0.93      0.80        28
                                       age_limit       1.00      0.83      0.91        18
                         apple_pay_or_google_pay       0.85      1.00      0.92        22
                                     atm_support       1.00      0.36      0.53        14
                                automatic_top_up       0.88      0.95      0.91        22
         balance_not_updated_after_bank_transfer       0.59      0.77      0.67        31
balance_not_updated_after_cheque_or_cash_deposit       0.71      0.97      0.82        33
                         beneficiary_not_allowed       0.73      0.81      0.77        27
                                 cancel_t

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
