In [31]:
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
import gensim
from gensim.models import Word2Vec
from xgboost import XGBClassifier
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nanth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nanth\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nanth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
df_train= pd.read_csv(r'..\data\BankFAQs.csv')


In [33]:
x=df_train['Class'].value_counts()
print(x)


Class
insurance        469
cards            403
loans            375
accounts         306
investments      140
security          57
fundstransfer     14
Name: count, dtype: int64


In [34]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [35]:
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
df_train['clean_text'] = df_train['Question'].apply(lambda x: finalpreprocess(x))
df_train.head()

Unnamed: 0,Question,Answer,Class,clean_text
0,Do I need to enter ‘#’ after keying in my Card...,Please listen to the recorded message and foll...,security,need enter key card number card expiry date cv...
1,What details are required when I want to perfo...,"To perform a secure IVR transaction, you will ...",security,detail require want perform secure ivr transac...
2,How should I get the IVR Password if I hold a...,An IVR password can be requested only from the...,security,get ivr password hold add card
3,How do I register my Mobile number for IVR Pas...,Please call our Customer Service Centre and en...,security,register mobile number ivr password
4,How can I obtain an IVR Password,By Sending SMS request: Send an SMS 'PWD<space...,security,obtain ivr password


In [36]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_train['Class_encoded'] =  le.fit_transform(df_train['Class'])
df_train['Class_encoded'].value_counts()

Class_encoded
3    469
1    403
5    375
0    306
4    140
6     57
2     14
Name: count, dtype: int64

In [37]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(df_train["clean_text"],df_train["Class_encoded"],test_size=0.2,shuffle=True)
#Word2Vec
# Word2Vec runs on tokenized sentences
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

In [38]:
#Tf-Idf
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

pickle.dump(tfidf_vectorizer, open(r"..\model_artifacts\tfidf1.pkl", "wb"))

#building Word2Vec model
# class MeanEmbeddingVectorizer(object):
#     def __init__(self, word2vec):
#         self.word2vec = word2vec
#         # if a text is empty we should return a vector of zeros
#         # with the same dimensionality as all the other vectors
#         self.dim = len(next(iter(word2vec.values())))
# def fit(self, X, y):
#         return self
# def transform(self, X):
#         return np.array([
#             np.mean([self.word2vec[w] for w in words if w in self.word2vec]
#                     or [np.zeros(self.dim)], axis=0)
#             for words in X
#         ])
# df_train['clean_text_tok']=[nltk.word_tokenize(i) for i in df_train['clean_text']]
# model = Word2Vec(df_train['clean_text_tok'],min_count=1)     
#modelw = MeanEmbeddingVectorizer(model)
# converting text to numerical data using Word2Vec
# X_train_vectors_w2v = model.fit_transform(X_train_tok)
# X_val_vectors_w2v = model.transform(X_test_tok)

In [39]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
print('f1_score :', f1_score(y_test,y_predict,average='weighted'))
pickle.dump(lr_tfidf,open('..\model_artifacts\logistic_reg.pkl','wb')) 

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        57
           1       0.98      0.95      0.96        83
           2       1.00      0.25      0.40         4
           3       0.94      0.99      0.96        97
           4       0.91      0.80      0.85        25
           5       0.95      0.97      0.96        75
           6       0.77      0.83      0.80        12

    accuracy                           0.94       353
   macro avg       0.92      0.82      0.84       353
weighted avg       0.94      0.94      0.94       353

Confusion Matrix: [[52  0  0  1  2  1  1]
 [ 0 79  0  1  0  1  2]
 [ 2  0  1  0  0  1  0]
 [ 0  0  0 96  0  1  0]
 [ 2  1  0  2 20  0  0]
 [ 1  0  0  1  0 73  0]
 [ 0  1  0  1  0  0 10]]
f1_score : 0.9350351947918704


In [40]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)

lr_tfidf=XGBClassifier()
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
print('f1_score :', f1_score(y_test,y_predict,average='weighted'))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90        57
           1       0.95      0.92      0.93        83
           2       1.00      0.25      0.40         4
           3       0.82      0.95      0.88        97
           4       0.78      0.56      0.65        25
           5       0.95      0.93      0.94        75
           6       0.75      0.75      0.75        12

    accuracy                           0.89       353
   macro avg       0.88      0.75      0.78       353
weighted avg       0.89      0.89      0.88       353

Confusion Matrix: [[51  1  0  3  1  0  1]
 [ 0 76  0  4  1  0  2]
 [ 2  0  1  0  0  1  0]
 [ 1  0  0 92  1  3  0]
 [ 2  2  0  7 14  0  0]
 [ 0  0  0  4  1 70  0]
 [ 0  1  0  2  0  0  9]]
f1_score : 0.8827084930638133


In [41]:
class_label = df_train[['Class_encoded','Class',]].drop_duplicates().reset_index(drop=True)
class_label_dict = {}
for i in range(class_label.shape[0]):
    class_label_dict[class_label.iat[i,0]] = class_label.iat[i,1]

with open('..\model_artifacts\class_label_dict.pkl', 'wb') as f:
    pickle.dump(class_label_dict, f)

# Inference_code


In [42]:
import sys
sys.path.append(r"../")

from utils.preprocess import finalpreprocess
import pickle

with open('..\model_artifacts\class_label_dict.pkl', 'rb') as f:
    class_label_dict = pickle.load(f)

question = 'What details are required when I want to perform a secure IVR transaction'

question_pp = finalpreprocess(question)
tfidf_vectorizer = pickle.load(open(r'..\model_artifacts\tfidf1.pkl','rb'))
question_vec = tfidf_vectorizer.transform([question_pp])
lr_tfidf = pickle.load(open('..\model_artifacts\logistic_reg.pkl','rb')) 
pred = lr_tfidf.predict(question_vec)
print(class_label_dict[pred[0]])


security
