In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import json
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.linear_model import SGDClassifier
from collections import Counter
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [2]:
faq_file = open("./HDFC_Faq.txt").read()
data = json.loads(faq_file)
df = pd.json_normalize(data)

In [3]:
df.head()

Unnamed: 0,question,answer,found_duplicate
0,How do I change my password?,"After you have logged in, you can change your ...",False
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...,False
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ...",False
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...,False
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us...",False


In [4]:
df.drop(["found_duplicate"],axis=1,inplace=True)

In [5]:
df.isnull().sum()

question    0
answer      0
dtype: int64

In [6]:
df.head()

Unnamed: 0,question,answer
0,How do I change my password?,"After you have logged in, you can change your ..."
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ..."
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us..."


In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
stop_words=stopwords.words('english')

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_data(text):
    text=text.lower() #lower the text
    text = re.sub(r'[^\w\s]', '', text) #remove irrelevant characters    
    text = text.split() #convert sentence to tokens
    text = [lemmatizer.lemmatize(word) for word in text] #lemmatization
    text = " ".join(text) #converting tokens to sentence
    return text

In [10]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
import nlpaug.augmenter.word as naw

In [13]:
import os
os.getcwd()

'C:\\Users\\sovan\\Documents\\faq-chatbot-main'

In [14]:
aug = naw.SynonymAug(aug_src='wordnet')

In [15]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
import nltk
print(nltk.data.path)


['C:\\Users\\sovan/nltk_data', 'C:\\Users\\sovan\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data', 'C:\\Users\\sovan\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data', 'C:\\Users\\sovan\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data', 'C:\\Users\\sovan\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [17]:
nltk.data.path.append(r"C:\Users\sovan\AppData\Roaming\nltk_data")

In [18]:
from tqdm import tqdm

In [19]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
import nltk
nltk.pos_tag(['test'], lang='eng')

[('test', 'NN')]

In [21]:
aug_data = {}

for ques, ans in tqdm(zip(df['question'], df['answer'])):
    for i in range(4):
        augmented = aug.augment(ques)
        if isinstance(augmented, list):
            augmented = augmented[0]
        aug_data[augmented] = ans

2236it [00:05, 391.54it/s]


In [22]:
aug_df = pd.DataFrame(aug_data.items(),columns=['question','answer'])
aug_df.head()

Unnamed: 0,question,answer
0,How come One change my password?,"After you have logged in, you can change your ..."
1,How do Single alter my password?,"After you have logged in, you can change your ..."
2,How do Ace alter my password?,"After you have logged in, you can change your ..."
3,How set Ane modify my password?,"After you have logged in, you can change your ..."
4,When leave I receive my change ATM PIN?,You will receive your new ATM PIN by post with...


In [23]:
final_df = pd.concat([df,aug_df])

In [25]:
final_df.to_csv("argument.csv",index=False)

In [26]:
final_df = pd.read_csv("argument.csv")

In [27]:
X = final_df['question']
y = final_df['answer']

In [28]:
le = LabelEncoder()

In [29]:
y = le.fit_transform(y)

In [31]:
import pandas as pd
pd.Series(y).value_counts()

1877    30
669     25
1564    25
1793    20
1618    20
        ..
681      3
697      3
1460     3
404      3
1013     2
Name: count, Length: 2160, dtype: int64

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.2
)

In [33]:
tf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=1,
    stop_words='english'
)

In [34]:
tf = TfidfVectorizer(
    ngram_range=(1,3),
    min_df=1,
    stop_words='english'
)

X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)

In [35]:
model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
model.fit(X_train_tf,y_train)

0,1,2
,loss,'modified_huber'
,penalty,'l2'
,alpha,0.0005
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [36]:
y_pred = model.predict(X_test_tf)

In [37]:
labels = np.unique(y_test)
ytest_prob = label_binarize(y_test, classes=labels)
ypred_prob = label_binarize(y_pred, classes=labels)

  y_type = type_of_target(y)
  y_type = type_of_target(y)


In [38]:
print("Accuracy Score:",accuracy_score(y_test,y_pred))
print("Precision Score:",precision_score(y_test,y_pred,average='micro'))
print("Recall Score:",recall_score(y_test,y_pred,average='micro'))
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)


Accuracy Score: 0.676164631388512
Precision Score: 0.676164631388512
Recall Score: 0.676164631388512
ROC-AUC Score: 0.8380169141332774


In [39]:
idx = 2 
print("Question:", X_test.iloc[idx])
print("Predicted Answer:", le.inverse_transform([model.predict(X_test_tf[idx])[0]])[0])
print("Actual Answer:", le.inverse_transform([y_test[idx]])[0])

Question: Can the OTP comprise generated prior to the transaction?
Predicted Answer: No, OTP can be generated only during the transactions.
Actual Answer: No, OTP can be generated only during the transactions.


In [40]:
question = "How do I check the IFSC of the receiving branch?"


clean_q = clean_data(question)
vec_q = tf.transform([clean_q])

prediction = model.predict(vec_q)
confidence = np.max(model.predict_proba(vec_q))


print(f"Question: {question}")
print(f"Confidence: {round(confidence, 3)}")


THRESHOLD = 0.25

if confidence >= THRESHOLD:
    answer = le.inverse_transform(prediction)[0]
    print("\nPredicted Answer:\n", answer)
else:
    print("\nPredicted Answer:")
    print("Sorry, I am not confident about this question.")
    print("You may rephrase your question.")

print("-" * 80)

Question: How do I check the IFSC of the receiving branch?
Confidence: 0.605

Predicted Answer:
 Just ask your beneficiary to get the IFSC from his or her branch. They can also find it in their cheque book - the IFSC is in the cheque leaf.
--------------------------------------------------------------------------------


In [42]:
test_questions = [
    "How do I cancel my auto pay instruction?",
    "What information can I view under account information with the Online Credit Card facility?",
    "how to generate green pin",
    "How can I view my monthly card statement?",
]

CONFIDENCE_THRESHOLD = 0.25   # adjust if needed

for question in test_questions:
    
    clean_q = clean_data(question)
    vec_q = tf.transform([clean_q])
    
    prediction = model.predict(vec_q)
    confidence = np.max(model.predict_proba(vec_q))
    
    print("Question:", question)
    print("Confidence:", round(confidence, 3))
    
    if confidence < CONFIDENCE_THRESHOLD:
        print("Predicted Answer: Sorry, this question is out of my knowledge base.")
    else:
        print("Predicted Answer:", le.inverse_transform(prediction)[0])
    
    print("-" * 80)

Question: How do I cancel my auto pay instruction?
Confidence: 0.413
Predicted Answer: We request you to send across a duly signed letter for cancellation of the auto debit facility incorporated on your card account.The letter may be sent to the following address:HDFC Bank Cards DivisionPO BOX # 8654Thiruvanmiyur P.O. Chennai 600
--------------------------------------------------------------------------------
Question: What information can I view under account information with the Online Credit Card facility?
Confidence: 0.474
Predicted Answer: Under your account information you can view the following
--------------------------------------------------------------------------------
Question: how to generate green pin
Confidence: 0.267
Predicted Answer: Through NetBankingLogon to NetBanking and click on credit card tabStep 1: Click on the "Request"Step 2: Select your Credit Card ATM PIN option on the left hand side MenuStep 3: Select your Credit Card number from the drop down box and cli