<a href="https://colab.research.google.com/github/SandeshRangreji/Pointer-Generator-Networks/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from google.colab import drive
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# class to handle data loading, splitting, preprocessing, tokenization
class Data:

  def __init__(self):
    # dictionary for contractions
    self.contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

  # function to perform contractions
  def split_contractions(self, sentence):
    # parameters:
    # sentence: article/summary (string)
    # making a list of words from the article/summary
    # return: article/summary after contractions (string)
    li_sentence = sentence.split(' ')
    # iterating through each word and replacing the contracted word if it is present in contraction dictionary
    for i in range(len(li_sentence)):
      li_sentence[i] = self.contractions.get(li_sentence[i], li_sentence[i])
    # combining the list to form a string again
    sentence = ' '.join(li_sentence)
    return sentence

  # function to handle preprocessing of articles and summaries
  def preprocess(self, sentence):
    # parameters:
    # sentence: article or summary to be processed
    # returns:
    # sentence: cleaned article/summary
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    sentence = self.split_contractions(sentence)
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    # removing trailing spaces
    sentence = sentence.strip()
    return sentence  
  
  # splitting data into articles and summaries for training and testing
  def split_data(self, dataset):
    # parameters:
    # dataset : tfds of cnn_dailymail dataset (bytes); must provide 'train' and 'validation' splits
    #           whose records carry 'article' and 'highlights' fields
    # returns:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of cleaned training and eval articles and summaries (string)

    # helper: walks one split and returns its cleaned articles and summaries as two parallel lists
    def clean_split(split_name):
      articles = []
      summaries = []
      for record in tfds.as_numpy(dataset[split_name]):
        # records hold raw bytes, so decode to str before cleaning
        articles.append(self.preprocess(record['article'].decode("utf-8")))
        summaries.append(self.preprocess(record['highlights'].decode("utf-8")))
      return articles, summaries

    # the training split is processed first, then the validation split
    train_articles, train_summaries = clean_split('train')
    eval_articles, eval_summaries = clean_split('validation')
    return train_articles, train_summaries, eval_articles, eval_summaries

  # function to tokenize data
  def tokenize(self, train_articles, train_summaries, eval_articles, eval_summaries, vocab_size , embedding_dim, max_length_articles, max_length_summaries, truncating_type, padding_type, oov_token):
    # parameters:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of training and eval articles and summaries (string)
    # vocab_size: passed to the Tokenizer as num_words
    # embedding_dim: dimensions of word embeddings (accepted for signature symmetry; not used in this method)
    # max_length_articles: length every article sequence is padded/truncated to
    # max_length_summaries: length every summary sequence is padded/truncated to
    # truncating_type: pre/post truncation
    # padding_type: pre/post padding
    # oov_token: specifies what oov_token should be used
    # return:
    # train_articles, train_summaries, eval_articles, eval_summaries : padded id sequences
    # word_index : the tokenizer's { word : id } mapping

    # the vocabulary is learned from the training articles only
    tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(train_articles)

    # helper: texts -> id sequences -> fixed-length padded sequences
    def to_padded_ids(texts, maxlen):
      sequences = tokenizer.texts_to_sequences(texts)
      return pad_sequences(sequences, maxlen=maxlen, padding=padding_type, truncating=truncating_type)

    # articles (training, then eval)
    train_articles = to_padded_ids(train_articles, max_length_articles)
    eval_articles = to_padded_ids(eval_articles, max_length_articles)
    # summaries (training, then eval)
    train_summaries = to_padded_ids(train_summaries, max_length_summaries)
    eval_summaries = to_padded_ids(eval_summaries, max_length_summaries)
    return train_articles, train_summaries, eval_articles, eval_summaries, tokenizer.word_index

  # function to get a dictionery which is the reverse of the word_index
  def get_reverse_word_index(self, word_index):
    # parameters:
    # word_index : { word : id }
    # returns
    # reverse_word_index : { id : word }
    # if two words share an id, the one that comes later in word_index wins
    return {token_id: word for word, token_id in word_index.items()}

  # get important metrics of the dataset that is needed to build a model
  def get_data_metrics(self):
    # return:
    # average length of articles, average length of summaries, length of longest article, length of longest summary
    # (all lengths are whitespace-separated word counts over the training split)
    # loading dataset, then decoding and cleaning it; only the training lists are used below
    dataset = tfds.load("cnn_dailymail")
    train_articles, train_summaries, _, _ = self.split_data(dataset)
    # word counts per training example
    article_lengths = [len(article.split()) for article in train_articles]
    summary_lengths = [len(summary.split()) for summary in train_summaries]
    # longest article / summary; default=0 mirrors a running maximum that starts at zero
    max_article_len = max(article_lengths, default=0)
    max_summary_len = max(summary_lengths, default=0)
    # averages are total words divided by the number of examples
    avg_article_len = sum(article_lengths)/len(train_articles)
    avg_summary_len = sum(summary_lengths)/len(train_summaries)
    return avg_article_len, avg_summary_len, max_article_len, max_summary_len

  # function to load pretrained word embeddings and prepare them for embedding layer
  def get_word_embeddings(self, word_index, vocab_size, embedding_dim, embeddings_path='/content/PGN/data/glove.6B.200d.txt'):
    # parameters:
    # word_index: { word:id }
    # vocab_size: size of vocabulary; the returned matrix has vocab_size+1 rows
    # embedding_dim: dimension of word embeddings (must match the vectors stored in the file)
    # embeddings_path: text file with one "<word> <v1> <v2> ..." entry per line
    # returns:
    # embeddings_index: { word : float32 vector } for every entry of the file
    # embeddings_matrix: array of shape (vocab_size+1, embedding_dim); row i holds the vector
    #                    of the word with id i, rows without a pretrained vector stay all-zero
    embeddings_index = {}
    # opening and reading word embeddings from file
    # the encoding is explicit so that parsing does not depend on the platform default
    with open(embeddings_path, encoding='utf-8') as f:
      for line in f:
        values = line.split()
        # skip blank lines instead of failing on values[0]
        if not values:
          continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))

    # converting word embeddings to matrix (weights for embedding layer) using word_index
    for word, i in word_index.items():
      # word_index may contain more words than vocab_size (a Keras Tokenizer keeps every
      # word it has seen), so ids without a row in the matrix are skipped
      if i > vocab_size:
        continue
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector
    return embeddings_index, embeddings_matrix

  # function to convert list of articles and summaries to iterable, batched datasets
  def batch_datasets(self, train_articles, train_summaries, eval_articles, eval_summaries, BATCH_SIZE):
    # parameters:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of training and eval articles and summaries (sequences)
    # BATCH_SIZE: size of one batch in the dataset
    # return:
    # train_dataset, val_dataset: dataset objects for training and evaluation, batched according to BATCH_SIZE

    # helper: pairs each article with its summary and groups the pairs into batches
    # drop_remainder=True discards a final batch that has fewer than BATCH_SIZE pairs
    def to_batches(articles, summaries):
      pairs = tf.data.Dataset.from_tensor_slices((articles, summaries))
      return pairs.batch(BATCH_SIZE, drop_remainder=True)

    train_dataset = to_batches(train_articles, train_summaries)
    val_dataset = to_batches(eval_articles, eval_summaries)
    return train_dataset, val_dataset

  def __call__(self, vocab_size = 25000, embedding_dim = 200, max_length_articles = 2880, max_length_summaries = 1344, truncating_type='post', padding_type='post', oov_token='<OOV>', BATCH_SIZE=64, max_samples=500):
    # runs the whole pipeline: load -> clean/split -> tokenize -> batch
    # parameters:
    # vocab_size: size of vocabulary
    # embedding_dim: dimensions of word embeddings
    # max_length_articles: length every article sequence is padded/truncated to
    # max_length_summaries: length every summary sequence is padded/truncated to
    # truncating_type: pre/post truncation
    # padding_type: pre/post padding
    # oov_token: specifies what oov_token should be used
    # BATCH_SIZE: size of one batch in the returned datasets
    # max_samples: number of leading examples kept from each of the four lists before
    #              tokenizing (default 500, the value that used to be hard-coded);
    #              pass None to use every example
    # returns:
    # train_dataset, val_dataset: dataset objects for training and evaluation, batched according to BATCH_SIZE
    # word_index : { word : id }
    # reverse_word_index : { id : word }

    # loading data in bytes from tfds
    ds = tfds.load("cnn_dailymail")
    # splitting data into train and eval sets of articles and summaries
    # NOTE: the full dataset is cleaned here; the max_samples cut is applied afterwards
    train_articles, train_summaries, eval_articles, eval_summaries = self.split_data(ds)
    # tokenizing data (a slice bound of None keeps the whole list)
    train_articles, train_summaries, eval_articles, eval_summaries, word_index = self.tokenize(
        train_articles[:max_samples],
        train_summaries[:max_samples],
        eval_articles[:max_samples],
        eval_summaries[:max_samples],
        vocab_size,
        embedding_dim,
        max_length_articles,
        max_length_summaries,
        truncating_type,
        padding_type,
        oov_token)
    # converting list of articles to train and evaluation datasets that are in batches
    train_dataset, validation_dataset = self.batch_datasets(train_articles, train_summaries, eval_articles, eval_summaries, BATCH_SIZE)
    return train_dataset, validation_dataset, word_index, self.get_reverse_word_index(word_index)

In [13]:
# hyperparameters of the preprocessing pipeline (kept as module-level names)
vocab_size = 25000
embedding_dim = 200
max_length_articles = 2880
max_length_summaries = 1344
truncating_type ='post'
padding_type ='post'
oov_token = "<OOV>"
BATCH_SIZE = 64

# build the batched datasets and the vocabulary mappings
data = Data()
train_dataset, val_dataset, word_index, reverse_word_index = data(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        max_length_articles=max_length_articles,
        max_length_summaries=max_length_summaries,
        truncating_type=truncating_type,
        padding_type=padding_type,
        oov_token=oov_token,
        BATCH_SIZE=BATCH_SIZE)
# pretrained vectors, plus the weight matrix aligned with word_index
embeddings_index, embeddings_matrix = data.get_word_embeddings(word_index, vocab_size, embedding_dim)

In [42]:
# corpus statistics over the training split; note that get_data_metrics() loads
# "cnn_dailymail" again and re-runs split_data() on it rather than reusing the data above
avg_train_article_words, avg_train_sum_words, max_article_len, max_summary_len = data.get_data_metrics()

In [43]:
avg_train_article_words

760.9151936693914

In [44]:
avg_train_sum_words

53.612100462187364

In [45]:
max_article_len

2848

In [46]:
max_summary_len

1319