# Data Pre-processing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Output directory for every artifact this notebook pickles.
TOSAVE = './preprocessed/'
# Create it up front so the later open(TOSAVE + ..., 'wb') calls cannot fail
# on a fresh runtime where the directory does not exist yet.
os.makedirs(TOSAVE, exist_ok=True)

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import random as rn
import re
import pickle
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Softmax
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Load the cleaned conversation data from disk.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('clean_conv.pkl', 'rb') as fh:
    data = pickle.load(fh)

In [None]:
data.head()

Unnamed: 0,author_id_x,created_at_x,question,author_id_y,created_at_y,answer,clean_question,clean_answer
251468,340004,Sat Oct 21 13:37:43 +0000 2017,@122172 need help,ATVIAssist,Sat Oct 21 14:21:21 +0000 2017,@340004 Hi. What's your platform &amp; which c...,need help,hi what is your platform which contract are yo...
700147,693750,Tue Nov 14 16:46:49 +0000 2017,@115911 @TMobileHelp @115725 @ATT @115911 why ...,TMobileHelp,Tue Nov 14 16:50:00 +0000 2017,@693750 Hey thanks so much for reaching out! Y...,why did you stop allowing conference calling w...,hey thanks so much for reaching out you are a ...
185503,284249,Thu Nov 23 13:33:12 +0000 2017,Oh course @115955 WiFi would go down on Thanks...,Ask_Spectrum,Thu Nov 23 13:48:06 +0000 2017,@284249 I'm sorry to hear about the service is...,oh course wifi would go down on thanksgiving,i am sorry to hear about the service issue i c...
870280,820001,Thu Nov 30 01:17:45 +0000 2017,When the vinaigrette from @ChipotleTweets is 1...,ChipotleTweets,Thu Nov 30 01:21:30 +0000 2017,@820001 Sorry to hear that. Did you let a mana...,when the vinaigrette from is saltier than it i...,sorry to hear that did you let a manager know ...
424733,476849,Sat Nov 04 00:12:43 +0000 2017,PISSED OFF THAT @115888 FUCKING CHARGED ME FUL...,SpotifyCares,Sat Nov 04 01:28:45 +0000 2017,@476849 Hi! We've just sent a DM your way. Let...,pissed off that fucking charged me full price ...,hi we have just sent a dm your way let is carr...


## Spelling correction

In [None]:
!pip install symspellpy

Collecting symspellpy
[?25l  Downloading https://files.pythonhosted.org/packages/99/af/e71fcca6a42b6a63f518b0c1627e1f67822815cb0cf71e6af05acbd75c78/symspellpy-6.7.0-py3-none-any.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 2.9MB/s 
Installing collected packages: symspellpy
Successfully installed symspellpy-6.7.0


In [None]:
# Using symspell to correct spelling
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Spell checker instance shared by correct_spellings(); suggestions are limited
# to a dictionary edit distance of 2.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# Resolve the frequency dictionaries from inside the installed symspellpy package.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# Unigram file: term read from column 0, count from column 1.
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
# Bigram file: count read from column 2 (presumably because the two-word term
# occupies columns 0-1 — confirm against the symspellpy documentation).
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# utility function
def correct_spellings(text):
    """Return `text` with its spelling corrected by SymSpell.

    Runs a compound (whole-sentence) lookup with a maximum edit distance of 2
    on the module-level `sym_spell` instance and returns the term of the first
    suggestion.

    Args:
        text: The sentence to correct.

    Returns:
        The corrected sentence as a string.
    """
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)

    # Use the public `term` property instead of the private `_term` attribute.
    return suggestions[0].term

In [None]:
correct_spellings(data['clean_question'].iloc[127])

'why is it every time the bruins game goes to intermission the announcers sound like they are underwater watching on'

In [None]:
# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()

# Question's spelling correction.
# NOTE: overwrites 'clean_question' in place; the recorded run took roughly two hours.
data['clean_question'] = data['clean_question'].progress_apply(correct_spellings)

100%|██████████| 794299/794299 [2:07:55<00:00, 103.48it/s]


In [None]:
# Answer's spelling correction.
# NOTE: overwrites 'clean_answer' in place; `progress_apply` requires that
# tqdm.pandas() has already been called in this kernel.
data['clean_answer'] = data['clean_answer'].progress_apply(correct_spellings)

100%|██████████| 794299/794299 [2:06:00<00:00, 105.06it/s]


In [None]:
# Save the spell-corrected data to disk.
# The context manager guarantees the file is flushed and closed.
with open(TOSAVE+'data_spell_corrected.pkl', 'wb') as fh:
    pickle.dump(data, fh)

In [None]:
# Reload the spell-corrected data so the notebook can resume from here
# without repeating the slow correction step.
with open(TOSAVE+'data_spell_corrected.pkl', 'rb') as fh:
    data = pickle.load(fh)

In [None]:
data.shape

(794299, 8)

In [None]:
# Counting frequency of words in questions
from collections import Counter
qsn_cnt = Counter(
    word
    for text in data['clean_question'].values
    for word in text.split()
)

# Plain dict view of the counts: word -> number of occurrences
qsn_dict = dict(qsn_cnt)

# Number of distinct words in the questions
len(qsn_dict)

54072

In [None]:
# Counting frequency of words in answers
from collections import Counter
ans_cnt = Counter(
    word
    for text in data['clean_answer'].values
    for word in text.split()
)

# Plain dict view of the counts: word -> number of occurrences
ans_dict = dict(ans_cnt)

# Number of distinct words in the answers
len(ans_dict)

28872

In [None]:
# Per-word occurrence counts as arrays: x for question words, y for answer words.
x = np.array(list(qsn_dict.values()))
y = np.array(list(ans_dict.values()))

In [None]:
# Quantile of questions
np.percentile(x, [25, 50, 75])

array([ 1.,  4., 20.])

In [None]:
# quantile of answers
np.percentile(y, [25, 50, 75])

array([ 1.,  5., 30.])

In [None]:
# Upper tail of the word-frequency distributions: percentiles 99.0, 99.1, ..., 99.9.
# First for x ...
print(np.percentile(x, [99+i/10 for i in range(10)]))

# ... then for y.
print(np.percentile(y, [99+i/10 for i in range(10)]))

[ 3388.9    3882.805  4467.     5074.647  5994.574  7414.16   9492.072
 12127.812 17905.604 39740.398]
[  5287.35         6069.864        7581.288        9018.198
  10864.836       14090.77        18877.956       25694.199
  48864.832      106463.22000005]


In [None]:
# Rare words are not removed from the training data, because spelling correction has already reduced the vocabulary size.

In [None]:
def remove_rare_words(sent, min_count, cnt_dict):
    """Replace rare or unknown words in a sentence with the '<unk>' token.

    Args:
        sent: Whitespace-separated sentence.
        min_count: Frequency threshold; a word is kept only when its count is
            strictly greater than this value.
        cnt_dict: Mapping from word to its frequency.

    Returns:
        The sentence in which every word that is missing from `cnt_dict`, or
        whose count is <= `min_count`, has been replaced by '<unk>'. Words are
        joined by single spaces; an empty sentence yields ''.
    """
    # An explicit membership test replaces the former bare `except`, so only
    # the "word not in the vocabulary" case maps to '<unk>'. Genuine errors
    # (and KeyboardInterrupt during a long apply) are no longer swallowed.
    kept = [
        word if word in cnt_dict and cnt_dict[word] > min_count else '<unk>'
        for word in sent.split()
    ]
    return ' '.join(kept)

In [None]:
remove_rare_words("hello subrata", 2, qsn_dict)

'hello <unk>'

In [None]:
tqdm.pandas()

In [None]:
# Removing rare words from texts

#data['clean_question'] = data['clean_question'].progress_apply(remove_rare_words, args=(1, qsn_dict))
#data['clean_answer'] = data['clean_answer'].apply(remove_rare_words, args=(2, ans_dict))

## Data restructuring for Encoder and Decoder

In [None]:
# Wrap every answer in '<start>' ... '<end>' tokens for the decoder.
# NOTE: not idempotent — re-running this cell wraps the answers a second time.
data['clean_answer'] = '<start> ' + data['clean_answer'].astype(str) + ' <end>'

In [None]:
# Number of whitespace-separated words per question / per answer.
# 'clean_answer' already carries the <start>/<end> tokens, so they are counted too.
data['qsn_len'] = data['clean_question'].str.split().str.len()
data['ans_len'] = data['clean_answer'].str.split().str.len()

In [None]:
data.head()

Unnamed: 0,author_id_x,created_at_x,question,author_id_y,created_at_y,answer,clean_question,clean_answer,qsn_len,ans_len
251468,340004,Sat Oct 21 13:37:43 +0000 2017,@122172 need help,ATVIAssist,Sat Oct 21 14:21:21 +0000 2017,@340004 Hi. What's your platform &amp; which c...,need help,<start> i what is your platform which contract...,2,22
700147,693750,Tue Nov 14 16:46:49 +0000 2017,@115911 @TMobileHelp @115725 @ATT @115911 why ...,TMobileHelp,Tue Nov 14 16:50:00 +0000 2017,@693750 Hey thanks so much for reaching out! Y...,why did you stop allowing conference calling w...,<start> hey thanks so much for reaching out yo...,16,20
185503,284249,Thu Nov 23 13:33:12 +0000 2017,Oh course @115955 WiFi would go down on Thanks...,Ask_Spectrum,Thu Nov 23 13:48:06 +0000 2017,@284249 I'm sorry to hear about the service is...,of course wifi would go down on thanksgiving,<start> i am sorry to hear about the service i...,8,31
870280,820001,Thu Nov 30 01:17:45 +0000 2017,When the vinaigrette from @ChipotleTweets is 1...,ChipotleTweets,Thu Nov 30 01:21:30 +0000 2017,@820001 Sorry to hear that. Did you let a mana...,when the vinaigrette from is saltier than it i...,<start> sorry to hear that did you let a manag...,18,13
424733,476849,Sat Nov 04 00:12:43 +0000 2017,PISSED OFF THAT @115888 FUCKING CHARGED ME FUL...,SpotifyCares,Sat Nov 04 01:28:45 +0000 2017,@476849 Hi! We've just sent a DM your way. Let...,pissed off that fucking charged me full price ...,<start> i we have just sent a do your way let ...,21,18


In [None]:
# The 95th-percentile length is used as the maximum sequence length.

In [None]:
# Checking percentile values of no of words
np.percentile(data['qsn_len'].values, [25, 50, 75, 90, 95, 99])

array([12., 19., 24., 31., 39., 49.])

In [None]:
np.percentile(data['ans_len'].values, [25, 50, 75, 90, 95, 99])

array([17., 22., 26., 31., 37., 48.])

## Parameters

In [None]:
# Parameters

# Fix the NumPy random seed
np.random.seed(42)
# Fix the TensorFlow random seed
tf.random.set_seed(32)
# Fix the Python `random` seed
rn.seed(12)

# Maximum sequence length in words: 39 is the 95th-percentile question length
# in the recorded percentile output (answers: 37).
MAXLEN = 39
# Hard-coded vocabulary sizes; they equal the tokenizer vocabulary sizes printed
# by the "Vocab sizes" cell further down and must be updated if the data changes.
QSN_VOCAB_SIZE = 46789
ANS_VOCAB_SIZE = 25445

# Dimensionality of the word-embedding vectors
EMBEDDING_SIZE = 300

In [None]:
# Keep only pairs whose question AND answer have more than 2 and at most MAXLEN words.
length_ok = (
    (data['qsn_len'] > 2) & (data['qsn_len'] <= MAXLEN)
    & (data['ans_len'] > 2) & (data['ans_len'] <= MAXLEN)
)
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original DataFrame.
data = data[length_ok].copy()

# Decoder output data: the answer without its leading <start> token
data['answer_out'] = data['clean_answer'].apply(lambda x: " ".join(x.split()[1:]))

# Select the necessary columns and give them their model-facing names
data = data[['clean_question', 'clean_answer', 'answer_out']].rename(
    columns={'clean_question': 'question', 'clean_answer': 'answer_inp'}
)

data.head()

Unnamed: 0,question,answer_inp,answer_out
700147,why did you stop allowing conference calling w...,<start> hey thanks so much for reaching out yo...,hey thanks so much for reaching out you are a ...
185503,of course wifi would go down on thanksgiving,<start> i am sorry to hear about the service i...,i am sorry to hear about the service issue i c...
870280,when the vinaigrette from is saltier than it i...,<start> sorry to hear that did you let a manag...,sorry to hear that did you let a manager know ...
424733,pissed off that fucking charged me full price ...,<start> i we have just sent a do your way let ...,i we have just sent a do your way let is carry...
327118,please help i changed my last name due to marr...,<start> hey there do us a your rapid rewards n...,hey there do us a your rapid rewards number an...


In [None]:
# Checking results
data['answer_inp'].iloc[10]

'<start> that is what i like to see thanks for sharing have a great flight sean <end>'

In [None]:
data['answer_out'].iloc[10]

'that is what i like to see thanks for sharing have a great flight sean <end>'

## Train test split

In [None]:
# Train and validation split: 25% of the pairs are held out for validation.
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
train, validation = train_test_split(data, test_size=0.25, random_state=16)

# Row/column counts of both splits
train.shape, validation.shape

((533547, 3), (177849, 3))

In [None]:
# Save the train/validation split to disk.
# The context manager guarantees the file is flushed and closed.
with open(TOSAVE+'spelldata_train_val.pkl', 'wb') as fh:
    pickle.dump((train, validation), fh)

In [None]:
# Reload the train/validation split (resume point).
with open(TOSAVE+'spelldata_train_val.pkl', 'rb') as fh:
    train, validation = pickle.load(fh)

In [None]:
train.head()

Unnamed: 0,question,answer_inp,answer_out
718509,at is awful sizes are incorrect coffee is old ...,<start> sorry to hear this please email us at ...,sorry to hear this please email us at email so...
714919,i just signed up and hope to use your service ...,<start> here to help send us a note here and o...,here to help send us a note here and our team ...
656115,i have been asking about my refund for weeks n...,<start> we will have this resolved as soon as ...,we will have this resolved as soon as possible...
565151,i need my package why out of all packages mine...,<start> i am sorry to hear your package has be...,i am sorry to hear your package has been delay...
693937,i azure team is this service issue completely ...,<start> yes you should not be experiencing the...,yes you should not be experiencing these issue...


## Tokenizing

In [None]:
# Tokenizers
# filters='' turns off Keras' default character filtering (the default filter
# set includes '<' and '>', which would mangle <start>/<end>/<unk>).
# Words not seen during fitting are mapped to the '<unk>' token.
# Both tokenizers are fit on the training split only.
enc_tokenizer = Tokenizer(filters='', oov_token='<unk>')
enc_tokenizer.fit_on_texts(train['question'].values)

# Fit on 'answer_inp', which contains both the <start> and the <end> token.
dec_tokenizer = Tokenizer(filters='', oov_token='<unk>')
dec_tokenizer.fit_on_texts(train['answer_inp'].values)

In [None]:
# Save both fitted tokenizers; the context manager flushes and closes the file.
with open(TOSAVE+'spelltokenizer_obj.pkl', 'wb') as fh:
    pickle.dump((enc_tokenizer, dec_tokenizer), fh)

In [None]:
# Load tokenizers (resume point)
with open(TOSAVE+'spelltokenizer_obj.pkl', 'rb') as fh:
    enc_tokenizer, dec_tokenizer = pickle.load(fh)

In [None]:
# Vocab sizes: number of distinct tokens known to each tokenizer
vocab_size_qsn = len(enc_tokenizer.word_index)
vocab_size_ans = len(dec_tokenizer.word_index)

print(vocab_size_qsn, vocab_size_ans)

46789 25445


In [None]:
# Checking results
dec_tokenizer.word_index['<unk>'], dec_tokenizer.word_index['<start>'], dec_tokenizer.word_index['<end>']

(1, 2, 3)

In [None]:
enc_tokenizer.word_index['<unk>']

1

In [None]:
# Tokenizing the entire train data: map words to integer ids, then pad at the
# end ('post') up to MAXLEN entries per sequence.

train_encoder_inp = pad_sequences(
    enc_tokenizer.texts_to_sequences(train['question']),
    maxlen=MAXLEN, padding='post',
)

train_decoder_inp = pad_sequences(
    dec_tokenizer.texts_to_sequences(train['answer_inp']),
    maxlen=MAXLEN, padding='post',
)

train_decoder_out = pad_sequences(
    dec_tokenizer.texts_to_sequences(train['answer_out']),
    maxlen=MAXLEN, padding='post',
)

In [None]:
train_encoder_inp.shape, train_decoder_inp.shape, train_decoder_out.shape

((533547, 39), (533547, 39), (533547, 39))

In [None]:
# Tokenizing validation data with the tokenizers that were fit on the train
# data, then padding at the end ('post') up to MAXLEN entries per sequence.

val_encoder_inp = pad_sequences(
    enc_tokenizer.texts_to_sequences(validation['question']),
    maxlen=MAXLEN, padding='post',
)

val_decoder_inp = pad_sequences(
    dec_tokenizer.texts_to_sequences(validation['answer_inp']),
    maxlen=MAXLEN, padding='post',
)

val_decoder_out = pad_sequences(
    dec_tokenizer.texts_to_sequences(validation['answer_out']),
    maxlen=MAXLEN, padding='post',
)

In [None]:
val_encoder_inp.shape, val_decoder_inp.shape, val_decoder_out.shape

((177849, 39), (177849, 39), (177849, 39))

In [None]:
# Save the padded token arrays to disk.
# Context managers guarantee each file is flushed and closed.
with open(TOSAVE+'spelltrain_tokens.pkl', 'wb') as fh:
    pickle.dump((train_encoder_inp, train_decoder_inp, train_decoder_out), fh)
with open(TOSAVE+'spellval_token.pkl', 'wb') as fh:
    pickle.dump((val_encoder_inp, val_decoder_inp, val_decoder_out), fh)

In [None]:
# Load data — read back the files written by the save cell (TOSAVE + 'spell...').
# The previous bare 'train_tokens.pkl' / 'val_token.pkl' paths pointed at other
# files, whose arrays had a different row count and sequence length (38) than
# the MAXLEN=39 arrays produced by this notebook.
with open(TOSAVE+'spelltrain_tokens.pkl', 'rb') as fh:
    train_encoder_inp, train_decoder_inp, train_decoder_out = pickle.load(fh)
with open(TOSAVE+'spellval_token.pkl', 'rb') as fh:
    val_encoder_inp, val_decoder_inp, val_decoder_out = pickle.load(fh)

In [None]:
train_encoder_inp.shape, train_decoder_inp.shape, train_decoder_out.shape

((530173, 38), (530173, 38), (530173, 38))

In [None]:
val_encoder_inp.shape, val_decoder_inp.shape, val_decoder_out.shape

((176725, 38), (176725, 38), (176725, 38))

## Pre trained Embedding matrix

In [None]:
!pip install fasttext



In [None]:
import fasttext
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz





In [None]:
#ft = fasttext.load_model('cc.en.300.bin')
ft.get_dimension()

300

In [None]:
ft.get_word_vector('hello').shape

(300,)

In [None]:
# Converting the question vocabulary to fastText vectors.
# Row i holds the vector of the word with tokenizer index i; row 0 (no word is
# assigned index 0, it is the padding value) stays all zeros.
qsn_embedding_matrix = np.zeros((QSN_VOCAB_SIZE+1, EMBEDDING_SIZE))
for word, i in enc_tokenizer.word_index.items():
    # fastText builds vectors from subwords, so every word gets a vector and
    # no "missing word" check is needed.
    qsn_embedding_matrix[i] = ft.get_word_vector(word)

In [None]:
qsn_embedding_matrix.shape

(46790, 300)

In [None]:
# Converting the answer vocabulary to fastText vectors.
# Row i holds the vector of the word with tokenizer index i; row 0 (no word is
# assigned index 0, it is the padding value) stays all zeros.
ans_embedding_matrix = np.zeros((ANS_VOCAB_SIZE+1, EMBEDDING_SIZE))
for word, i in dec_tokenizer.word_index.items():
    # fastText builds vectors from subwords, so every word gets a vector and
    # no "missing word" check is needed.
    ans_embedding_matrix[i] = ft.get_word_vector(word)

In [None]:
ans_embedding_matrix.shape

(25446, 300)

In [None]:
# Save both embedding matrices; the context manager flushes and closes the file.
with open(TOSAVE+'spellembedding_matrix.pkl', 'wb') as fh:
    pickle.dump((qsn_embedding_matrix, ans_embedding_matrix), fh)

In [None]:
# Load embedding matrices back from disk (into separate names, for verification)
with open(TOSAVE+'spellembedding_matrix.pkl', 'rb') as fh:
    qsn_embedding_matrix1, ans_embedding_matrix1 = pickle.load(fh)

In [None]:
qsn_embedding_matrix1.shape, ans_embedding_matrix1.shape

((46790, 300), (25446, 300))