# Chatbot

## Setup

In [1]:
import glob
import re
import unicodedata
import codecs

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

In [2]:
STOPWORDS = set(stopwords.words('english'))

In [3]:
in_files = glob.glob('../data/processed/*')

In [4]:
in_files

['../data/processed/ChequeManagement.txt',
 '../data/processed/ChequesManagement_full.txt',
 '../data/processed/ChequeStatus.txt',
 '../data/processed/IssueOfCheques.txt',
 '../data/processed/StoppedCheque.txt',
 '../data/processed/SweepConditionExample.txt',
 '../data/processed/SweepInstructions.txt',
 '../data/processed/SweepInstructions_full.txt']

In [5]:
tmp_list = []

for filepath in in_files:
    file = filepath.rsplit('/', 2)[-1]
    filename = file.split('.')[0]
    with codecs.open(filepath, 'r', encoding='utf-8', errors='ignore') as file_obj:
        content = file_obj.read()
    tmp_list.append({'FAQ_answer': content, 'FAQ_question': filename})    
    print(f'Reading {file} completed...')

Reading ChequeManagement.txt completed...
Reading ChequesManagement_full.txt completed...
Reading ChequeStatus.txt completed...
Reading IssueOfCheques.txt completed...
Reading StoppedCheque.txt completed...
Reading SweepConditionExample.txt completed...
Reading SweepInstructions.txt completed...
Reading SweepInstructions_full.txt completed...


In [6]:
input_df = pd.DataFrame(tmp_list)

In [7]:
input_df

Unnamed: 0,FAQ_answer,FAQ_question
0,Cheque Management\n---------------------------...,ChequeManagement
1,Cheque Management\r\n-------------------------...,ChequesManagement_full
2,Cheque Status\n-----------------\nYou can enqu...,ChequeStatus
3,Issue of Cheques\n-------------------------\nY...,IssueOfCheques
4,Stopped Cheque\n-------------------\nYou can s...,StoppedCheque
5,Sweep condition examples:\n(These are the thre...,SweepConditionExample
6,Sweep Instructions\n------------------\nStandi...,SweepInstructions
7,Sweep Instructions\r\n\r\nStanding instruction...,SweepInstructions_full


In [8]:
def clean(faq):
    faq = re.sub(r'[^A-Za-z]', " ", faq)
    faq = re.sub(r"\s{2,}", " ", faq)
    faq = faq.split()
    ps = nltk.PorterStemmer()
    faq = [ps.stem(word) for word in faq]
    faq = [word for word in faq if word not in STOPWORDS]
    faq = ' '.join(faq)
    return faq

In [9]:
input_df['FAQ_answer_cleaned'] = input_df['FAQ_answer'].apply(clean)

In [10]:
for row in input_df.itertuples():
    print(f'{row.FAQ_question}:-\n{row.FAQ_answer_cleaned}\n')

ChequeManagement:-
chequ manag bankfair allow issu chequ custom transact account chequ cannot issu fix instal account debit made chequ transact mechan ensur chequ number pertain concern account alreadi paid payment chequ ha stop

ChequesManagement_full:-
chequ manag bankfair allow issu chequ custom transact account chequ cannot issu fix instal account debit made chequ transact mechan ensur chequ number pertain concern account alreadi paid payment chequ ha stop issu chequ issu singl chequ chequ book transact account defin number leav chequ book contain defin sever chequ book differ number leav chequ number ha numer issu chequ book enough indic begin number chequ book system creat remain number take account number leav chequ book contain chequ number uniqu specifi charg ha collect per chequ leaf system calcul charg total number chequ issu chequ statu enquir statu particular chequ chequ issu account differ statu unpaid paid return stop issu chequ ha unpaid statu thi statu last till chequ 

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(input_df['FAQ_answer_cleaned'])

In [12]:
vectorizer.get_feature_names()

['account',
 'account alreadi',
 'account anoth',
 'account applic',
 'account chequ',
 'account debit',
 'account defin',
 'account differ',
 'account exceed',
 'account id',
 'account number',
 'account receiv',
 'account still',
 'account stop',
 'account sweep',
 'acknowledg',
 'acknowledg custom',
 'allow',
 'allow issu',
 'alreadi',
 'alreadi paid',
 'amount',
 'amount date',
 'amount debit',
 'amount excess',
 'amount specif',
 'amount transfer',
 'an',
 'an end',
 'ani',
 'ani restrict',
 'anoth',
 'anoth balanc',
 'applic',
 'applic instal',
 'avail',
 'avail balanc',
 'balanc',
 'balanc account',
 'balanc avail',
 'balanc bzd',
 'balanc consid',
 'balanc dip',
 'balanc exce',
 'balanc exceed',
 'balanc go',
 'balanc goe',
 'bankfair',
 'bankfair allow',
 'becom',
 'becom bzd',
 'begin',
 'begin date',
 'begin number',
 'belong',
 'belong indic',
 'benefuciari',
 'benefuciari name',
 'book',
 'book balanc',
 'book contain',
 'book differ',
 'book enough',
 'book system',
 'boo

In [13]:
def similarity(test):
    test = clean(test)
    test = vectorizer.transform([test])
    similarity_score = cosine_similarity(X, test)
    faq_idx = np.argmax(similarity_score)
    return similarity_score, faq_idx

In [14]:
test1 = 'How to issue check?'
test2 = 'What is Sweep?'
test3 = 'Can you give some examples for sweep conditions'
test4 = 'What are different statuses of cheques? '

In [17]:
similarity_scores, faq_idx = similarity(test2)
print(input_df.iloc[faq_idx]['FAQ_answer'])

Sweep Instructions
------------------
Standing instructions cater to specific amounts and specific dates. But certain customers may want amounts to be transferred from one account to another if the balance in the accounts exceed a particular level or when the balances go below a certain level. Such instructions are referred to as Sweep Instructions.
Sweep Instructions involve Transaction accounts. They are not applicable to Installment or Fixed Deposits accounts.

Sweep instructions (SO) have -
A Begin date for execution
An End date for execution
Condition for sweep - clear balance exceeding a level or balance dipping below a level
(book balance and available balance are not considered for Standing Instructions or Sweep Instructions).
Accounts that receives credit and debit
Frequency of Execution - daily, wekly, monthly,quarterly, half yearly and yearly
Next due date for execution
Sweep Instruction charges  
Counters that indicates the number of times the execution has taken place and 

In [18]:
similarity_scores

array([[0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.03159785],
       [0.30499316],
       [0.19255778]])

In [19]:
print(input_df.iloc[5]['FAQ_answer'])

Sweep condition examples:
(These are the three variants of SO)
1. If (Clear) Balance exceeds BZD 50000 in Account Id 123, transfer amounts in excess of BZD 50000 to Account Id 345.

2. If balance goes below BZD 10000 in Account Id 345, transfer money from Account Id657.
(Amount transferred will be BZD10000 - Clear Balance in Account Id 345).
This amount is debited to Account Id 657 and Credited to account Id 345; balance in account Id 345 now becomes BZD10000.

3. If negative balance exceeds BZD20000 in Account Id345,  transfer money from Account Id657.
(Amount transferred will be negative Clear Balance - BZD10000).
This amount is debited to Account Id 657 and Credited to account Id 345; balance in account Id 345 now becomes BZD10000.

Every SO has an Id.
This Id is mentioned in the transaction narration.
