# Final function

In [None]:
# Mount Google Drive into the runtime at /content/drive (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import re, string, nltk, spacy, pickle
import random as rn
from bs4 import BeautifulSoup
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")
import nltk.translate.bleu_score as bleu

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import Embedding, LSTM, Dense, Softmax
from tensorflow.keras.layers import Bidirectional, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Spelling Corrections

# Install package
!pip install symspellpy

# Using symspell to correct spelling
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

Collecting symspellpy
[?25l  Downloading https://files.pythonhosted.org/packages/99/af/e71fcca6a42b6a63f518b0c1627e1f67822815cb0cf71e6af05acbd75c78/symspellpy-6.7.0-py3-none-any.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 4.9MB/s 
Installing collected packages: symspellpy
Successfully installed symspellpy-6.7.0


True

In [None]:
# Loading functions from modules
from TextCleaning import clean_text  # Cleaning
from Preprocessing import preprocess  # Preprocess
from Interference import beam_predict, predict  # Prediction
from GetModel import get_model  # To get the compiled model

## Loading files

In [None]:
#-------------------------------------
# Parameters
#-------------------------------------

# Seed every RNG in use so that runs are repeatable.
# NumPy global seed
np.random.seed(42)
# TensorFlow global seed
tf.random.set_seed(32)
# Python `random` module seed
rn.seed(12)

# Sequence length handed to get_model and, as seq_len, to the final functions.
# NOTE(review): an earlier note said "maximum words 38" while the value is 39 —
# presumably one slot is reserved for a start/end token; confirm in Preprocessing.
MAXLEN = 39
# Vocabulary sizes for the question and answer sides — presumably matching the
# pickled tokenizers; verify if the tokenizers are ever regenerated.
QSN_VOCAB_SIZE = 46789
ANS_VOCAB_SIZE = 25445
EMBEDDING_SIZE = 300
LSTM_UNITS = 128
# Passed to get_model only; final_fun_2 uses its own batch_size argument when predicting.
BATCH_SIZE = 480
# Score-function name passed to get_model (presumably the attention scoring variant — TODO confirm)
SCORE_FUN = "concat"

# Directory prefix for the pickled tokenizers, embedding matrices and model checkpoint
FILEPATH = './preprocessed/'

In [None]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project.

# Raw data file
with open('data_conv.pkl', 'rb') as raw_file:
    raw_data = pickle.load(raw_file)

# Loading data
#train, validation = pickle.load(open(FILEPATH+'spelldata_train_val.pkl', 'rb'))

# Load tokenizers (question/encoder tokenizer first, answer/decoder tokenizer second)
with open(FILEPATH+'spelltokenizer_obj.pkl', 'rb') as tokenizer_file:
    enc_tokenizer, dec_tokenizer = pickle.load(tokenizer_file)

# Load embedding matrix (question side first, answer side second)
with open(FILEPATH+'spellembedding_matrix.pkl', 'rb') as embedding_file:
    qsn_embedding_matrix, ans_embedding_matrix = pickle.load(embedding_file)

# Model's weight
MODEL_WEIGHT_PATH = FILEPATH + 'checkpoint'

## Model

In [None]:
# Build the compiled seq2seq model through the GetModel module
model = get_model(
    QSN_VOCAB_SIZE,
    ANS_VOCAB_SIZE,
    EMBEDDING_SIZE,
    LSTM_UNITS,
    MAXLEN,
    BATCH_SIZE,
    SCORE_FUN,
    qsn_embedding_matrix,
    ans_embedding_matrix,
)

In [None]:
# Restore the best model's weights from the checkpoint prefix MODEL_WEIGHT_PATH
model.load_weights(MODEL_WEIGHT_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f914c544438>

## Final func 1

In [None]:
def final_fun_1(X, model, enc_tokenizer, dec_tokenizer, seq_len, sym_spell):
    """
    End-to-end inference: raw text in, predicted answers out.

    The raw input is cleaned with clean_text, run through preprocess
    (spell correction included) and each resulting sentence is decoded with
    beam_predict, which is expected to tokenize and pad the sentence itself.

    Arguments:
        X: Raw data; either one string or a collection of strings
        model: Best model, used for prediction
        enc_tokenizer: Question Tokenizer object
        dec_tokenizer: Answer Tokenizer object
        seq_len: Maximum length of any sequence
        sym_spell: Symspell object after loading with proper vocab

    Returns:
        List with one prediction per preprocessed input sentence
        (a one-element list when X was a single string).
    """
    # A lone string is treated as a batch of one
    questions = [X] if isinstance(X, str) else X

    # Clean first, then preprocess / spell-correct
    cleaned = clean_text(questions)
    prepared = preprocess(cleaned, sym_spell, seq_len)

    # Beam-search decoding (module Interference), one sentence at a time
    return [
        beam_predict(question, model, enc_tokenizer, dec_tokenizer, seq_len)
        for question in prepared
    ]

In [None]:
# Draw 10 random raw rows, keeping only the question and answer columns
sample = raw_data.sample(10)[['question', 'answer']]

In [None]:
sample

Unnamed: 0,question,answer
628361,@ChaseSupport Face ID works and then after a f...,@638968 Thank you for reaching out to us. We w...
582187,@Ask_Spectrum why is kyle Texas internet down ...,@603070 Thank you for reaching out. I am sorry...
450276,@Delta I like to call out good customer servic...,"@498254 Hey, so happy to know we're exceeding ..."
697111,Pitiful chicken and avocado sarnie from @sains...,"@248554 Sorry Joy, can you tell me the barcode..."
476400,Estoy muy molesto con @116875... me mandaron m...,"@193933 Por favor, envíanos la información uti..."
727293,Shout out to @SouthwestAir for giving me my fl...,"@289561 We hope the surgery goes well, Jade! H..."
529275,@115879 do you guys have Lyft passes??? I lite...,@222938 If you did not receive it this time ar...
461094,@O2 could you tell me expected delivery times ...,"@123248 Hi, the expected delivery time was mor..."
260938,@hulu_support When casting the World Series to...,@347763 Yikes! Are you streaming from iOS or A...
550114,My iPhone before iOS11: Running smoothly &amp;...,@577753 Let's work together to resolve the beh...


In [None]:
# Input is a single data point: one raw question string, which final_fun_1 wraps in a list
final_fun_1(sample['question'].values[1], model, enc_tokenizer, dec_tokenizer, MAXLEN, sym_spell)

['i apologize for the inconvenience i will be glad to assist you can you do me your name and acct or phone job']

In [None]:
# Input is a set of data points: all sampled raw questions; returns a list of predicted answers
final_fun_1(sample['question'].values, model, enc_tokenizer, dec_tokenizer, MAXLEN, sym_spell)

['we would like to help you with your iphone please do us and we will get started',
 'i apologize for the inconvenience i will be glad to assist you can you do me your name and acct or phone job',
 'we are so glad to hear that you had a great flight with us we are glad to hear that you had a great flight',
 'i there sorry about this could you send me a pic of the bar code please which store did you buy these from robbie',
 'hold lament mos al inconvenience for favor sig enos in twitter para is instructions in message director',
 'hey there we would like to look into this please do us the phone number on your account',
 'i am sorry for the delay we are working hard to get your order working as quickly as possible as possible please do not provide your order details as we consider them to be personal information our page',
 'sorry for the trouble we are actively working to resolve these issues now we appreciate your patience',
 'we would like to help with your battery issue please do us 

In [None]:
# Interactive chat: keep answering until the user enters 'q' (any letter case)

while True:
    user_inp = input("User: ")
    if user_inp.lower() == 'q':
        break
    # final_fun_1 returns a one-element list for a single string
    print("Bot :", final_fun_1(user_inp, model, enc_tokenizer, dec_tokenizer, MAXLEN, sym_spell)[0])

User: Hello sir need help
Bot : we are here to help please do us your email address and more details so we can follow up
User: I want to cancel my booking
Bot : i there we are sorry you are having problems booking please do us your booking reference full name and email address we will take a look
User: that's good. Thank you :)
Bot : we are so glad you enjoyed your flight with us today
User: q


## Final func 2

In [None]:
def final_fun_2(X, Y, model, enc_tokenizer, dec_tokenizer, seq_len, sym_spell, batch_size=128):
    """
    End-to-end evaluation: raw inputs and raw targets in, average BLEU score out.

    Inputs and targets are cleaned with clean_text and preprocessed together
    (spell correction included). Predictions are made in batches with predict,
    and every prediction is scored against its target with sentence-level BLEU.

    Arguments:
        X: Raw input data
        Y: Raw target data
        model: Model used for prediction
        enc_tokenizer: Question tokenizer
        dec_tokenizer: Answer tokenizer
        seq_len: Max length of input and output
        sym_spell: symspell object loaded with vocab
        batch_size: batch size value, to be used while predicting in batches;
            must be at least 1

    Returns:
        Mean sentence-level BLEU score over all (prediction, target) pairs,
        or 0.0 when there is nothing to score.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    # A non-positive batch size would never advance through the data
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    # Cleaning raw input and target data
    input_sentences = clean_text(X)
    target_sentences = clean_text(Y)

    # Spelling correction and preprocessing
    input_sentences, target_sentences = preprocess(input_sentences, sym_spell, seq_len, target_sentences)

    # Predicting in batches
    pred_results = []  # Predicted result by model
    for start in range(0, len(input_sentences), batch_size):
        batch = input_sentences[start:start + batch_size]
        pred_results.extend(predict(batch, model, enc_tokenizer, dec_tokenizer, seq_len))

    # Metric value, here using BLEU score: one reference (the target) per prediction
    model_bleu = [
        bleu.sentence_bleu([true.split()], pred.split())
        for pred, true in zip(pred_results, target_sentences)
    ]

    # Nothing to score (e.g. empty input): avoid dividing by zero
    if not model_bleu:
        return 0.0

    # Average BLEU score of given data
    return sum(model_bleu) / len(model_bleu)

In [None]:
# Taking 10,000 random samples (rebinds `sample`, replacing the 10-row sample used earlier)
sample = raw_data.sample(10000)[['question', 'answer']]

In [None]:
# Metric value for the given data: average sentence-level BLEU over the sampled
# question/answer pairs, predicted with final_fun_2's default batch_size
final_fun_2(sample['question'].values, sample['answer'].values, model, enc_tokenizer, dec_tokenizer, MAXLEN, sym_spell)

0.3983806167597698