In [1]:
#Import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import email
import json
import pickle
import os
import re
import nltk
import joblib
import time
import multiprocessing
from tqdm import tqdm
from zipfile import ZipFile
import tensorflow as tf

<h1 style="font-size:35px"><center>Word Featurization</center></h1>

<h1>1. Load sentences</h1>
<h2>(Refer Section 5 of 1_Sentence_E.ipynb)</h2> 

In [2]:
#Load the preprocessed train and validation sentence tables from CSV
train = pd.read_csv(filepath_or_buffer='train_preprocessed_data.csv')
validate = pd.read_csv(filepath_or_buffer='val_preprocessed_data.csv')

<h1>2. Data Featurization</h1>

In [3]:
def generate_input_and_target(text):
    """
    Generates the input and target features from a single sentence.

    The sentence is split on whitespace and cut at every position between two
    words: the words before the cut form the input (X), the words after it form
    the target (y). Both sides are wrapped in '<start>' and '<end>' markers.

    Parameters
    ----------
    text : str
        A single sentence.

    Returns
    -------
    pandas.DataFrame
        Columns 'X' and 'y' with one row per cut position, i.e.
        (number of words - 1) rows. A sentence with fewer than two words
        gives an empty frame that still has both columns.
    """
    words = text.split()
    cut_points = range(1, len(words))

    #Build the finished strings in plain Python before creating the frame, so a
    #sentence without any cut point never reaches string concatenation on an empty column
    X = ['<start> ' + ' '.join(words[:cut]) + ' <end>' for cut in cut_points]
    y = ['<start> ' + ' '.join(words[cut:]) + ' <end>' for cut in cut_points]

    return pd.DataFrame(data={'X':X,'y':y})

In [4]:
#Generate data for all samples
def generate_data_for_model(data_df):
    """
    Generates data for input to model.

    Expands every sentence of ``data_df.sent`` with generate_input_and_target
    and stacks the per-sentence results into one frame.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must have a 'sent' column holding one sentence per row.

    Returns
    -------
    pandas.DataFrame
        The 'X'/'y' pairs of all sentences with exact duplicate rows removed.
        The index is not reset after de-duplication. An empty ``data_df``
        gives an empty frame with the 'X' and 'y' columns.
    """
    #Collect the per-sentence frames and concatenate once; concatenating inside
    #the loop re-copies every accumulated row on each iteration
    pieces = [generate_input_and_target(sentence) for sentence in tqdm(data_df.sent)]
    if not pieces:
        return pd.DataFrame(columns=['X','y'])
    return pd.concat(pieces,ignore_index=True).drop_duplicates()

In [5]:
#Runs an external script that is not shown in this notebook. Presumably it applies the two functions
#defined above to the training sentences and writes the final_df/train/*.csv parts - TODO confirm
! python train_data_featurization.py

100%|██████████| 5228/5228 [01:39<00:00, 52.33it/s]
100%|██████████| 5228/5228 [01:40<00:00, 52.11it/s]
100%|██████████| 5229/5229 [01:40<00:00, 52.01it/s]
100%|██████████| 5229/5229 [01:40<00:00, 51.99it/s]
100%|██████████| 5229/5229 [01:40<00:00, 51.92it/s]
100%|██████████| 5228/5228 [01:40<00:00, 51.88it/s]
100%|██████████| 5229/5229 [01:40<00:00, 51.81it/s]
100%|██████████| 5228/5228 [01:40<00:00, 51.82it/s]
Time taken: 101.16 seconds


In [6]:
#Read the eight featurized train part-files; a..h follow the file order first..eighth
a, b, c, d, e, f, g, h = map(pd.read_csv, [
    'final_df/train/first.csv',
    'final_df/train/second.csv',
    'final_df/train/third.csv',
    'final_df/train/fourth.csv',
    'final_df/train/fifth.csv',
    'final_df/train/sixth.csv',
    'final_df/train/seventh.csv',
    'final_df/train/eighth.csv',
])

In [7]:
#Stack the parts under one fresh index, then discard rows that are exact repeats
train_final_data = pd.concat(
    objs=[a, b, c, d, e, f, g, h],
    ignore_index=True,
).drop_duplicates()

In [8]:
train_final_data.head()

Unnamed: 0,X,y
0,<start> it <end>,<start> looks like we should have them soon <end>
1,<start> it looks <end>,<start> like we should have them soon <end>
2,<start> it looks like <end>,<start> we should have them soon <end>
3,<start> it looks like we <end>,<start> should have them soon <end>
4,<start> it looks like we should <end>,<start> have them soon <end>


In [9]:
train_final_data.shape

(328972, 2)

In [10]:
#Persist the featurized training pairs. index=False leaves the row index out of the file,
#so no index_label is passed (it only names an index column that is actually written)
train_final_data.to_csv('train_final_data.csv',index=False)

In [12]:
#Runs an external script that is not shown in this notebook. Presumably it featurizes the
#validation sentences and writes the final_df/val/*.csv parts - TODO confirm
! python val_data_featurization.py

100%|██████████| 1307/1307 [00:14<00:00, 92.24it/s]
100%|██████████| 1307/1307 [00:14<00:00, 91.52it/s]
100%|██████████| 1308/1308 [00:14<00:00, 91.37it/s]
100%|██████████| 1307/1307 [00:14<00:00, 90.57it/s]
100%|██████████| 1307/1307 [00:14<00:00, 90.59it/s]
100%|██████████| 1307/1307 [00:14<00:00, 90.19it/s]
100%|██████████| 1307/1307 [00:14<00:00, 90.14it/s]
100%|██████████| 1307/1307 [00:14<00:00, 89.95it/s]
Time taken: 14.62 seconds


In [13]:
#Read the eight featurized validation part-files; a..h follow the file order first..eighth
a, b, c, d, e, f, g, h = map(pd.read_csv, [
    'final_df/val/first.csv',
    'final_df/val/second.csv',
    'final_df/val/third.csv',
    'final_df/val/fourth.csv',
    'final_df/val/fifth.csv',
    'final_df/val/sixth.csv',
    'final_df/val/seventh.csv',
    'final_df/val/eighth.csv',
])

In [14]:
#Stack the parts under one fresh index, then discard rows that are exact repeats
val_final_data = pd.concat(
    objs=[a, b, c, d, e, f, g, h],
    ignore_index=True,
).drop_duplicates()

In [15]:
val_final_data.head()

Unnamed: 0,X,y
0,<start> but <end>,<start> again that might be just what the two ...
1,<start> but again <end>,<start> that might be just what the two utilit...
2,<start> but again that <end>,<start> might be just what the two utilities r...
3,<start> but again that might <end>,<start> be just what the two utilities receive...
4,<start> but again that might be <end>,<start> just what the two utilities receive <end>


In [16]:
val_final_data.shape

(82086, 2)

The objective of the model is to predict the next few words given a sufficient number of input words. If the number of input words is too small, the predicted output is less likely to make sense. Hence, for validation data, samples whose input has fewer than 5 tokens are dropped. This count includes the `<start>` and `<end>` markers, so it corresponds to fewer than 3 actual words.

In [17]:
#Count the whitespace-separated tokens of each input; the '<start>' and '<end>' markers
#are part of X and are therefore included in the count
val_final_data['X_length'] = val_final_data['X'].str.split().str.len()

#Drop samples with fewer than 5 tokens (keep X_length of 5 or more)
val_final_data = val_final_data[val_final_data['X_length']>4]

In [18]:
val_final_data.head()

Unnamed: 0,X,y,X_length
2,<start> but again that <end>,<start> might be just what the two utilities r...,5
3,<start> but again that might <end>,<start> be just what the two utilities receive...,6
4,<start> but again that might be <end>,<start> just what the two utilities receive <end>,7
5,<start> but again that might be just <end>,<start> what the two utilities receive <end>,8
6,<start> but again that might be just what <end>,<start> the two utilities receive <end>,9


In [19]:
val_final_data.shape

(61183, 3)

In [20]:
#Persist the filtered validation pairs (the helper column X_length is written as well).
#index=False leaves the row index out of the file, so no index_label is passed
val_final_data.to_csv('val_final_data.csv',index=False)

<h2>3. Data Tokenization</h2>

In [21]:
#Reload the featurized pairs from the CSV files saved above
train_final_data = pd.read_csv(filepath_or_buffer='train_final_data.csv')
val_final_data = pd.read_csv(filepath_or_buffer='val_final_data.csv')

In [22]:
#filters='' switches off the Tokenizer's default character stripping, which would otherwise
#remove the '<' and '>' of the markers; '<unk>' stands in for words not seen while fitting
inp_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='',
    oov_token='<unk>',
)
#The input vocabulary is learned from the training inputs only
inp_tokenizer.fit_on_texts(train_final_data['X'])

In [23]:
#Tokenizer indices start at 1; the extra slot accounts for index 0, the value used for padding
inp_vocab_size = 1 + len(inp_tokenizer.word_index)
inp_vocab_size

1470

In [24]:
#Same settings as the input tokenizer: no character filtering, '<unk>' for unseen words
out_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='',
    oov_token='<unk>',
)
#The output vocabulary is learned from the training targets only
out_tokenizer.fit_on_texts(train_final_data['y'])

In [25]:
#Tokenizer indices start at 1; the extra slot accounts for index 0, the value used for padding
out_vocab_size = 1 + len(out_tokenizer.word_index)
out_vocab_size

1469

In [26]:
#Integer-encode every text with the tokenizer fitted on the matching training column;
#words missing from that vocabulary are mapped to the '<unk>' index
X_train_encoded_docs = inp_tokenizer.texts_to_sequences(train_final_data['X'])
X_val_encoded_docs = inp_tokenizer.texts_to_sequences(val_final_data['X'])

y_train_encoded_docs = out_tokenizer.texts_to_sequences(train_final_data['y'])
y_val_encoded_docs = out_tokenizer.texts_to_sequences(val_final_data['y'])

In [27]:
#Right-pad every encoded training input with zeros up to the longest training sequence
X_train_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    X_train_encoded_docs,
    padding='post',
)

In [28]:
#Pad the validation inputs to the training width so both arrays share one shape.
#NOTE(review): maxlen also truncates; with the default truncating='pre' a longer validation
#sequence would lose its first tokens - confirm none exceeds the training maximum
X_val_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    X_val_encoded_docs,
    maxlen=X_train_padded_docs.shape[1],
    padding='post',
)

In [29]:
print(X_train_padded_docs.shape)
print(X_val_padded_docs.shape)

(328972, 23)
(61183, 23)


In [30]:
#Right-pad every encoded training target with zeros up to the longest training sequence
y_train_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    y_train_encoded_docs,
    padding='post',
)

In [31]:
#Pad the validation targets to the training width so both arrays share one shape.
#NOTE(review): maxlen also truncates; with the default truncating='pre' a longer validation
#sequence would lose its first tokens - confirm none exceeds the training maximum
y_val_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    y_val_encoded_docs,
    maxlen=y_train_padded_docs.shape[1],
    padding='post',
)

In [32]:
print(y_train_padded_docs.shape)
print(y_val_padded_docs.shape)

(328972, 23)
(61183, 23)


In [33]:
#Load embedding vectors
#https://nlp.stanford.edu/projects/glove/
#Each line is parsed as a token followed by its vector components, separated by whitespace
embeddings_dict = {}
with open("glove.6B.300d.txt", encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        token, components = values[0], values[1:]
        embeddings_dict[token] = np.asarray(components, "float32")

400000it [00:37, 10562.59it/s]


In [34]:
#Weight matrix for the input embedding layer: row i holds the loaded vector of the word
#with tokenizer index i. Row 0 and words without a loaded vector stay all-zero
inp_embedding_matrix = np.zeros(shape=(inp_vocab_size, 300))
for word, i in tqdm(inp_tokenizer.word_index.items()):
    embedding_vector = embeddings_dict.get(word, None)
    if embedding_vector is None:
        continue
    inp_embedding_matrix[i] = embedding_vector

100%|██████████| 1469/1469 [00:00<00:00, 180867.51it/s]


In [35]:
inp_embedding_matrix.shape

(1470, 300)

In [36]:
#Weight matrix for the output embedding layer: row i holds the loaded vector of the word
#with tokenizer index i. Row 0 and words without a loaded vector stay all-zero
out_embedding_matrix = np.zeros(shape=(out_vocab_size, 300))
for word, i in tqdm(out_tokenizer.word_index.items()):
    embedding_vector = embeddings_dict.get(word, None)
    if embedding_vector is None:
        continue
    out_embedding_matrix[i] = embedding_vector

100%|██████████| 1468/1468 [00:00<00:00, 164298.17it/s]


In [37]:
out_embedding_matrix.shape

(1469, 300)

In [38]:
#Bundle all model inputs into one compressed joblib file; a loader gets the list back in this order
joblib.dump(
    value=[
        X_train_padded_docs, y_train_padded_docs,
        X_val_padded_docs, y_val_padded_docs,
        inp_embedding_matrix, out_embedding_matrix,
        inp_tokenizer, out_tokenizer,
    ],
    filename='encoded_data',
    compress=6,
)

['encoded_data']

<h1 style="font-size:35px"><center>Character Featurization</center></h1>

<h1>1. Load sentences</h1>

In [2]:
#Load only the first 5000 train and 1000 validation sentences for the character-level data
train = pd.read_csv(filepath_or_buffer='train_preprocessed_data.csv', nrows=5000)
validate = pd.read_csv(filepath_or_buffer='val_preprocessed_data.csv', nrows=1000)

<h1>2. Data Featurization</h1>

In [3]:
def generate_input_and_target(text):
    """
    Generates the input and target features from a single sentence.

    The sentence is cut at every position between two characters: the
    characters before the cut form the input (X), the characters after it form
    the target (y). Both sides are wrapped in a '# ' start marker and a ' *'
    end marker.

    Parameters
    ----------
    text : str
        A single sentence.

    Returns
    -------
    pandas.DataFrame
        Columns 'X' and 'y' with one row per cut position, i.e.
        (number of characters - 1) rows. A sentence with fewer than two
        characters gives an empty frame that still has both columns.
    """
    cut_points = range(1, len(text))

    #Build the finished strings in plain Python before creating the frame, so a
    #sentence without any cut point never reaches string concatenation on an empty column
    X = ['# ' + text[:cut] + ' *' for cut in cut_points]
    y = ['# ' + text[cut:] + ' *' for cut in cut_points]

    return pd.DataFrame(data={'X':X,'y':y})

In [4]:
#Generate data for all samples
def generate_data_for_model(data_df):
    """
    Generates data for input to model.

    Expands every sentence of ``data_df.sent`` with generate_input_and_target
    and stacks the per-sentence results into one frame.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must have a 'sent' column holding one sentence per row.

    Returns
    -------
    pandas.DataFrame
        The 'X'/'y' pairs of all sentences with exact duplicate rows removed.
        The index is not reset after de-duplication. An empty ``data_df``
        gives an empty frame with the 'X' and 'y' columns.
    """
    #Collect the per-sentence frames and concatenate once; concatenating inside
    #the loop re-copies every accumulated row on each iteration
    pieces = [generate_input_and_target(sentence) for sentence in tqdm(data_df.sent)]
    if not pieces:
        return pd.DataFrame(columns=['X','y'])
    return pd.concat(pieces,ignore_index=True).drop_duplicates()

In [15]:
#Runs an external script that is not shown in this notebook. Presumably it applies the character-level
#functions defined above and writes the final_df/train/*.csv parts - TODO confirm
! python train_data_featurization.py

100%|██████████| 625/625 [00:11<00:00, 55.19it/s]
100%|██████████| 625/625 [00:11<00:00, 54.94it/s]
100%|██████████| 625/625 [00:11<00:00, 55.00it/s]
100%|██████████| 625/625 [00:11<00:00, 54.98it/s]
100%|██████████| 625/625 [00:11<00:00, 53.99it/s]
100%|██████████| 625/625 [00:11<00:00, 53.40it/s]
100%|██████████| 625/625 [00:11<00:00, 53.35it/s]
100%|██████████| 625/625 [00:11<00:00, 53.22it/s]
Time taken: 11.93 seconds


In [16]:
#Read the eight featurized train part-files; a..h follow the file order first..eighth
a, b, c, d, e, f, g, h = map(pd.read_csv, [
    'final_df/train/first.csv',
    'final_df/train/second.csv',
    'final_df/train/third.csv',
    'final_df/train/fourth.csv',
    'final_df/train/fifth.csv',
    'final_df/train/sixth.csv',
    'final_df/train/seventh.csv',
    'final_df/train/eighth.csv',
])

In [17]:
#Stack the parts under one fresh index, then discard rows that are exact repeats
train_final_data = pd.concat(
    objs=[a, b, c, d, e, f, g, h],
    ignore_index=True,
).drop_duplicates()

In [18]:
train_final_data.head()

Unnamed: 0,X,y
0,# i *,# t looks like we should have them soon *
1,# it *,# looks like we should have them soon *
2,# it *,# looks like we should have them soon *
3,# it l *,# ooks like we should have them soon *
4,# it lo *,# oks like we should have them soon *


In [19]:
train_final_data.shape

(217184, 2)

In [10]:
#Persist the featurized training pairs. index=False leaves the row index out of the file,
#so no index_label is passed (it only names an index column that is actually written)
train_final_data.to_csv('train_final_data.csv',index=False)

In [20]:
#Runs an external script that is not shown in this notebook. Presumably it featurizes the
#validation sentences and writes the final_df/val/*.csv parts - TODO confirm
! python val_data_featurization.py

100%|██████████| 125/125 [00:01<00:00, 97.77it/s]
100%|██████████| 125/125 [00:01<00:00, 93.19it/s]
100%|██████████| 125/125 [00:01<00:00, 88.50it/s]
100%|██████████| 125/125 [00:01<00:00, 88.81it/s]
100%|██████████| 125/125 [00:01<00:00, 86.12it/s]
100%|██████████| 125/125 [00:01<00:00, 85.71it/s]
100%|██████████| 125/125 [00:01<00:00, 84.43it/s]
100%|██████████| 125/125 [00:01<00:00, 84.36it/s]
Time taken: 1.58 seconds


In [21]:
#Read the eight featurized validation part-files; a..h follow the file order first..eighth
a, b, c, d, e, f, g, h = map(pd.read_csv, [
    'final_df/val/first.csv',
    'final_df/val/second.csv',
    'final_df/val/third.csv',
    'final_df/val/fourth.csv',
    'final_df/val/fifth.csv',
    'final_df/val/sixth.csv',
    'final_df/val/seventh.csv',
    'final_df/val/eighth.csv',
])

In [22]:
#Stack the parts under one fresh index, then discard rows that are exact repeats
val_final_data = pd.concat(
    objs=[a, b, c, d, e, f, g, h],
    ignore_index=True,
).drop_duplicates()

In [23]:
val_final_data.head()

Unnamed: 0,X,y
0,# b *,# ut again that might be just what the two uti...
1,# bu *,# t again that might be just what the two util...
2,# but *,# again that might be just what the two utili...
3,# but *,# again that might be just what the two utilit...
4,# but a *,# gain that might be just what the two utiliti...


In [24]:
val_final_data.shape

(41833, 2)

The objective of the model is to predict the next few characters given a sufficient number of input characters. If the number of input characters is too small, the predicted output is less likely to make sense. Hence, for validation data, samples whose input is shorter than 15 characters are dropped. This length includes the 4 characters contributed by the `# ` start marker and the ` *` end marker.

In [25]:
#Length of each input in characters; the '# ' prefix and ' *' suffix are part of X and are counted too
val_final_data['X_length'] = val_final_data['X'].str.len()

#Keep only the samples that are at least 15 characters long
val_final_data = val_final_data[val_final_data['X_length']>=15]

In [26]:
val_final_data.head()

Unnamed: 0,X,y,X_length
10,# but again t *,# hat might be just what the two utilities rec...,15
11,# but again th *,# at might be just what the two utilities rece...,16
12,# but again tha *,# t might be just what the two utilities recei...,17
13,# but again that *,# might be just what the two utilities receive *,18
14,# but again that *,# might be just what the two utilities receive *,19


In [27]:
val_final_data.shape

(31853, 3)

In [28]:
#Persist the filtered validation pairs (the helper column X_length is written as well).
#index=False leaves the row index out of the file, so no index_label is passed
val_final_data.to_csv('val_final_data.csv',index=False)

<h2>3. Data Tokenization</h2>

In [29]:
#char_level=True makes every single character (including spaces and the '#'/'*' markers) a token;
#filters='' keeps all characters. No oov_token is set, so characters unseen while fitting are skipped
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='',
    char_level=True,
)
#The character vocabulary is learned from the training inputs only
tokenizer.fit_on_texts(train_final_data['X'])

In [30]:
tokenizer.word_index

{' ': 1,
 '#': 12,
 '*': 13,
 'a': 5,
 'b': 25,
 'c': 18,
 'd': 15,
 'e': 2,
 'f': 21,
 'g': 22,
 'h': 10,
 'i': 6,
 'j': 27,
 'k': 23,
 'l': 9,
 'm': 19,
 'n': 7,
 'o': 4,
 'p': 20,
 'q': 26,
 'r': 11,
 's': 8,
 't': 3,
 'u': 14,
 'v': 24,
 'w': 16,
 'x': 28,
 'y': 17,
 'z': 29}

In [31]:
#Tokenizer indices start at 1; the extra slot accounts for index 0, the value used for padding
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

30

In [32]:
#Integer-encode inputs and targets character by character with the single shared tokenizer
X_train_encoded_docs = tokenizer.texts_to_sequences(train_final_data['X'])
X_val_encoded_docs = tokenizer.texts_to_sequences(val_final_data['X'])

y_train_encoded_docs = tokenizer.texts_to_sequences(train_final_data['y'])
y_val_encoded_docs = tokenizer.texts_to_sequences(val_final_data['y'])

In [33]:
#Right-pad every encoded training input with zeros up to the longest training sequence
X_train_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    X_train_encoded_docs,
    padding='post',
)

In [34]:
#Pad the validation inputs to the training width so both arrays share one shape.
#NOTE(review): maxlen also truncates; with the default truncating='pre' a longer validation
#sequence would lose its first tokens - confirm none exceeds the training maximum
X_val_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    X_val_encoded_docs,
    maxlen=X_train_padded_docs.shape[1],
    padding='post',
)

In [35]:
print(X_train_padded_docs.shape)
print(X_val_padded_docs.shape)

(217184, 132)
(31853, 132)


In [36]:
#Right-pad every encoded training target with zeros up to the longest training sequence
y_train_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    y_train_encoded_docs,
    padding='post',
)

In [37]:
#Pad the validation targets to the training width so both arrays share one shape.
#NOTE(review): maxlen also truncates; with the default truncating='pre' a longer validation
#sequence would lose its first tokens - confirm none exceeds the training maximum
y_val_padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
    y_val_encoded_docs,
    maxlen=y_train_padded_docs.shape[1],
    padding='post',
)

In [38]:
print(y_train_padded_docs.shape)
print(y_val_padded_docs.shape)

(217184, 132)
(31853, 132)


In [39]:
#Load embedding vectors
#Each line is parsed as a character followed by its vector components, separated by whitespace
embeddings_dict = {}
with open("glove_char_vectors.txt", encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        symbol, components = values[0], values[1:]
        embeddings_dict[symbol] = np.asarray(components, "float32")

94it [00:00, 1291.94it/s]


In [40]:
#Weight matrix for the embedding layer: row i holds the loaded vector of the character with
#tokenizer index i. Row 0 and characters without a loaded vector stay all-zero.
#NOTE(review): the vectors are keyed by tokens of line.split(), which can never be ' ', so the
#row of the space character stays all-zero as well - confirm this is intended
embedding_matrix = np.zeros(shape=(vocab_size, 300))
for char, i in tqdm(tokenizer.word_index.items()):
    embedding_vector = embeddings_dict.get(char, None)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

100%|██████████| 29/29 [00:00<00:00, 26586.85it/s]


In [41]:
embedding_matrix.shape

(30, 300)

In [42]:
#Bundle all model inputs into one compressed joblib file; a loader gets the list back in this order
joblib.dump(
    value=[
        X_train_padded_docs, y_train_padded_docs,
        X_val_padded_docs, y_val_padded_docs,
        embedding_matrix,
        tokenizer,
    ],
    filename='char_encoded_data',
    compress=6,
)

['char_encoded_data']

<h1 style="font-size:35px"><center>GPT2 Word Featurization</center></h1>

<h1>1. Load sentences</h1>

In [2]:
#Load the preprocessed train and validation sentence tables from CSV
train = pd.read_csv(filepath_or_buffer='train_preprocessed_data.csv')
validate = pd.read_csv(filepath_or_buffer='val_preprocessed_data.csv')

<h1>2. Data Featurization</h1>

In [3]:
def generate_input_and_target(text):
    """
    Generates the input and target features from a single sentence.

    The sentence is split on whitespace and cut at every position between two
    words: the words before the cut form the input (X), the words after it form
    the target (y). Only the input gets a marker: '<|startoftext|>' is
    prepended directly, without a separating space. The target is left bare.

    Parameters
    ----------
    text : str
        A single sentence.

    Returns
    -------
    pandas.DataFrame
        Columns 'X' and 'y' with one row per cut position, i.e.
        (number of words - 1) rows. A sentence with fewer than two words
        gives an empty frame that still has both columns.
    """
    words = text.split()
    cut_points = range(1, len(words))

    #Build the finished strings in plain Python before creating the frame, so a
    #sentence without any cut point never reaches string concatenation on an empty column
    X = ['<|startoftext|>' + ' '.join(words[:cut]) for cut in cut_points]
    y = [' '.join(words[cut:]) for cut in cut_points]

    return pd.DataFrame(data={'X':X,'y':y})

In [4]:
#Generate data for all samples
def generate_data_for_model(data_df):
    """
    Generates data for input to model.

    Expands every sentence of ``data_df.sent`` with generate_input_and_target
    and stacks the per-sentence results into one frame.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must have a 'sent' column holding one sentence per row.

    Returns
    -------
    pandas.DataFrame
        The 'X'/'y' pairs of all sentences with exact duplicate rows removed.
        The index is not reset after de-duplication. An empty ``data_df``
        gives an empty frame with the 'X' and 'y' columns.
    """
    #Collect the per-sentence frames and concatenate once; concatenating inside
    #the loop re-copies every accumulated row on each iteration
    pieces = [generate_input_and_target(sentence) for sentence in tqdm(data_df.sent)]
    if not pieces:
        return pd.DataFrame(columns=['X','y'])
    return pd.concat(pieces,ignore_index=True).drop_duplicates()

In [5]:
#Runs an external script that is not shown in this notebook. Presumably it applies the GPT2-style
#functions defined above and writes the final_df/train/*.csv parts - TODO confirm
! python train_data_featurization.py

100%|██████████| 5228/5228 [01:20<00:00, 65.02it/s] 
100%|██████████| 5228/5228 [01:20<00:00, 65.01it/s]
100%|██████████| 5229/5229 [01:20<00:00, 64.60it/s]
100%|██████████| 5229/5229 [01:21<00:00, 64.46it/s]
100%|██████████| 5229/5229 [01:21<00:00, 64.46it/s]
100%|██████████| 5228/5228 [01:21<00:00, 64.42it/s]
100%|██████████| 5229/5229 [01:21<00:00, 64.09it/s]
100%|██████████| 5228/5228 [01:21<00:00, 64.11it/s]
Time taken: 81.75 seconds


In [6]:
#Read the eight featurized train part-files; a..h follow the file order first..eighth
a, b, c, d, e, f, g, h = map(pd.read_csv, [
    'final_df/train/first.csv',
    'final_df/train/second.csv',
    'final_df/train/third.csv',
    'final_df/train/fourth.csv',
    'final_df/train/fifth.csv',
    'final_df/train/sixth.csv',
    'final_df/train/seventh.csv',
    'final_df/train/eighth.csv',
])

In [7]:
#Stack the parts under one fresh index, then discard rows that are exact repeats
train_final_data = pd.concat(
    objs=[a, b, c, d, e, f, g, h],
    ignore_index=True,
).drop_duplicates()

In [8]:
train_final_data.head()

Unnamed: 0,X,y
0,<|startoftext|>it,looks like we should have them soon
1,<|startoftext|>it looks,like we should have them soon
2,<|startoftext|>it looks like,we should have them soon
3,<|startoftext|>it looks like we,should have them soon
4,<|startoftext|>it looks like we should,have them soon


In [9]:
train_final_data.shape

(328972, 2)

In [10]:
#Persist the featurized training pairs. index=False leaves the row index out of the file,
#so no index_label is passed (it only names an index column that is actually written)
train_final_data.to_csv('train_final_data.csv',index=False)

In [11]:
#Runs an external script that is not shown in this notebook. Presumably it featurizes the
#validation sentences and writes the final_df/val/*.csv parts - TODO confirm
! python val_data_featurization.py

100%|██████████| 1307/1307 [00:09<00:00, 138.57it/s]
100%|██████████| 1308/1308 [00:09<00:00, 138.22it/s]
100%|██████████| 1307/1307 [00:09<00:00, 138.00it/s]
100%|██████████| 1307/1307 [00:09<00:00, 136.81it/s]
100%|██████████| 1307/1307 [00:09<00:00, 136.85it/s]
100%|██████████| 1307/1307 [00:09<00:00, 135.44it/s]
100%|██████████| 1307/1307 [00:09<00:00, 135.60it/s]
100%|██████████| 1307/1307 [00:09<00:00, 135.03it/s]
Time taken: 9.77 seconds


In [12]:
#Read the eight featurized validation part-files; a..h follow the file order first..eighth
a, b, c, d, e, f, g, h = map(pd.read_csv, [
    'final_df/val/first.csv',
    'final_df/val/second.csv',
    'final_df/val/third.csv',
    'final_df/val/fourth.csv',
    'final_df/val/fifth.csv',
    'final_df/val/sixth.csv',
    'final_df/val/seventh.csv',
    'final_df/val/eighth.csv',
])

In [13]:
#Stack the parts under one fresh index, then discard rows that are exact repeats
val_final_data = pd.concat(
    objs=[a, b, c, d, e, f, g, h],
    ignore_index=True,
).drop_duplicates()

In [14]:
val_final_data.head()

Unnamed: 0,X,y
0,<|startoftext|>but,again that might be just what the two utilitie...
1,<|startoftext|>but again,that might be just what the two utilities receive
2,<|startoftext|>but again that,might be just what the two utilities receive
3,<|startoftext|>but again that might,be just what the two utilities receive
4,<|startoftext|>but again that might be,just what the two utilities receive


In [15]:
val_final_data.shape

(82086, 2)

The objective of the model is to predict the next few words given a sufficient number of input words. If the number of input words is too small, the predicted output is less likely to make sense. Hence, for validation data, samples whose input has fewer than 3 words are dropped.

In [16]:
#Count the whitespace-separated words of each input; '<|startoftext|>' is attached to the
#first word without a space, so it does not add to the count
val_final_data['X_length'] = val_final_data['X'].str.split().str.len()

#Drop samples with fewer than 3 words (keep X_length of 3 or more)
val_final_data = val_final_data[val_final_data['X_length']>2]

In [17]:
val_final_data.head()

Unnamed: 0,X,y,X_length
2,<|startoftext|>but again that,might be just what the two utilities receive,3
3,<|startoftext|>but again that might,be just what the two utilities receive,4
4,<|startoftext|>but again that might be,just what the two utilities receive,5
5,<|startoftext|>but again that might be just,what the two utilities receive,6
6,<|startoftext|>but again that might be just what,the two utilities receive,7


In [18]:
val_final_data.shape

(61183, 3)

In [19]:
#Persist the filtered validation pairs (the helper column X_length is written as well).
#index=False leaves the row index out of the file, so no index_label is passed
val_final_data.to_csv('val_final_data.csv',index=False)