<a href="https://colab.research.google.com/github/SandeshRangreji/Pointer-Generator-Networks/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import tensorflow_datasets as tfds
import tensorflow as tf
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# mount Google Drive at /content/drive so files stored on Drive can be read (uses google.colab, so this cell is Colab-specific)
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# extracts the GloVe 200d word embeddings (a RAR archive stored on the mounted Drive) into /content/PGN/data/
!unrar x "/content/drive/MyDrive/Pointer Generator Networks/glove.6B.200d.rar" -d "/content/PGN/data/"


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal

Cannot open /content/drive/MyDrive/Pointer Generator Networks/glove.6B.200d.rar
No such file or directory
No files to extract


In [7]:
# class to handle data loading, splitting, preprocessing, tokenization
class Data:
  """Load, clean and tokenize the cnn_dailymail dataset.

  Calling an instance runs the whole pipeline: load the dataset with tfds,
  decode and clean the train/validation articles and highlights, fit a Keras
  Tokenizer on the training articles, and return padded id sequences together
  with the word <-> id lookup tables.
  """

  # function to handle preprocessing of articles and summaries
  def preprocess(self, sentence):
    """Clean a single article or summary.

    parameters:
      sentence: article or summary to be processed (str)
    returns:
      sentence: cleaned text containing only a-z, A-Z and the punctuation
        ? . ! , ¿ ; every other run of characters becomes a single space
    """
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    # collapsing runs of spaces / double quotes into one space
    sentence = re.sub(r'[" "]+', " ", sentence)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿")
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    # removing leading and trailing spaces
    return sentence.strip()

  # helper: decode + preprocess the examples of one dataset split
  def _collect_split(self, split, limit=None):
    """Return (articles, summaries) lists of cleaned strings for one split.

    parameters:
      split: one split of the tfds dataset; each example is read through its
        'article' and 'highlights' entries, which are decoded from UTF-8 bytes
      limit: optional maximum number of examples to read (None = all)
    """
    articles = []
    summaries = []
    for example in tfds.as_numpy(split):
      # stop early instead of preprocessing examples that would be discarded
      if limit is not None and len(articles) >= limit:
        break
      # decoding from bytes to string before cleaning
      articles.append(self.preprocess(example['article'].decode("utf-8")))
      summaries.append(self.preprocess(example['highlights'].decode("utf-8")))
    return articles, summaries

  # splitting data into articles and summaries for training and testing
  def split_data(self, dataset, limit=None):
    """Split the dataset into cleaned train/eval articles and summaries.

    parameters:
      dataset: tfds of cnn_dailymail dataset (bytes); its 'train' and
        'validation' splits are read
      limit: optional maximum number of examples taken from the start of each
        split (None, the default, reads every example)
    returns:
      train_articles, train_summaries, eval_articles, eval_summaries:
        lists of training and eval articles and summaries (string)
    """
    train_articles, train_summaries = self._collect_split(dataset['train'], limit)
    eval_articles, eval_summaries = self._collect_split(dataset['validation'], limit)
    return train_articles, train_summaries, eval_articles, eval_summaries

  # function to tokenize data
  def tokenize(self, train_articles, train_summaries, eval_articles, eval_summaries, vocab_size , embedding_dim, max_length_articles, max_length_summaries, truncating_type, padding_type, oov_token):
    """Convert cleaned texts to padded id sequences.

    parameters:
      train_articles, train_summaries, eval_articles, eval_summaries:
        lists of training and eval articles and summaries (string)
      vocab_size: passed to the Tokenizer as num_words
      embedding_dim: dimensions of word embeddings (accepted for interface
        compatibility; not used by this method)
      max_length_articles: length every article sequence is padded/truncated to
      max_length_summaries: length every summary sequence is padded/truncated to
      truncating_type: pre/post truncation
      padding_type: pre/post padding
      oov_token: specifies what oov_token should be used
    returns:
      train_articles, train_summaries, eval_articles, eval_summaries:
        padded sequences produced by pad_sequences
      word_index: { word : id } as built by the tokenizer
    """
    # initialize tokenizer
    # NOTE(review): the Tokenizer is created with its default `filters`, which
    # presumably strip the punctuation that preprocess() keeps - confirm
    # whether that is intended.
    tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
    # fit tokenizer on training input only (vocab)
    tokenizer.fit_on_texts(train_articles)
    # get word index from tokenizer
    word_index = tokenizer.word_index

    def to_padded(texts, max_length):
      # texts -> id sequences -> fixed-length sequences
      sequences = tokenizer.texts_to_sequences(texts)
      return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)

    # articles use the article length ...
    train_articles = to_padded(train_articles, max_length_articles)
    eval_articles = to_padded(eval_articles, max_length_articles)
    # ... and summaries use their own length (previously they were padded to
    # max_length_articles and max_length_summaries was ignored)
    train_summaries = to_padded(train_summaries, max_length_summaries)
    eval_summaries = to_padded(eval_summaries, max_length_summaries)
    return train_articles, train_summaries, eval_articles, eval_summaries, word_index

  # function to get a dictionary which is the reverse of the word_index
  def get_reverse_word_index(self, word_index):
    """Invert the word index.

    parameters:
      word_index : { word : id }
    returns:
      reverse_word_index : { id : word }
    """
    return {index: word for word, index in word_index.items()}

  def __call__(self, vocab_size = 50000, embedding_dim = 200, max_length_articles = 16000, max_length_summaries = 8000, truncating_type='post', padding_type='post', oov_token='<OOV>', num_examples=500):
    """Run the full load -> clean -> tokenize pipeline.

    parameters:
      vocab_size: passed to the Tokenizer as num_words
      embedding_dim: dimensions of word embeddings (forwarded to tokenize)
      max_length_articles: length every article sequence is padded/truncated to
      max_length_summaries: length every summary sequence is padded/truncated to
      truncating_type: pre/post truncation
      padding_type: pre/post padding
      oov_token: specifies what oov_token should be used
      num_examples: number of examples taken from the start of the train and
        validation splits (default 500, the previously hard-coded value;
        None uses every example)
    returns:
      train_articles, train_summaries, eval_articles, eval_summaries:
        padded sequences for training and eval articles and summaries
      word_index : { word : id }
      reverse_word_index : { id : word }
    """
    # loading data in bytes from tfds
    ds = tfds.load("cnn_dailymail")
    # splitting data into train and eval sets of articles and summaries,
    # reading only the first num_examples of each split
    train_articles, train_summaries, eval_articles, eval_summaries = self.split_data(ds, limit=num_examples)
    # tokenizing data
    train_articles, train_summaries, eval_articles, eval_summaries, word_index = self.tokenize(
        train_articles,
        train_summaries,
        eval_articles,
        eval_summaries,
        vocab_size,
        embedding_dim,
        max_length_articles,
        max_length_summaries,
        truncating_type,
        padding_type,
        oov_token)
    return train_articles, train_summaries, eval_articles, eval_summaries, word_index, self.get_reverse_word_index(word_index)

In [8]:
# preprocessing settings for this run
vocab_size = 10000
embedding_dim = 200
max_length_articles = 16000
max_length_summaries = 8000
truncating_type = 'post'
padding_type = 'post'
oov_token = "<OOV>"

# build the pipeline object and run it; the call returns the padded
# train/eval sequences plus the word <-> id lookup tables
data = Data()
(train_articles,
 train_summaries,
 eval_articles,
 eval_summaries,
 word_index,
 reverse_word_index) = data(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length_articles=max_length_articles,
    max_length_summaries=max_length_summaries,
    truncating_type=truncating_type,
    padding_type=padding_type,
    oov_token=oov_token,
)

In [9]:
# number of distinct entries in the tokenizer's word index; this can exceed
# vocab_size - presumably because Keras applies num_words only when texts are
# converted to sequences, not to word_index itself (confirm against Keras docs)
len(word_index)

22004

In [10]:
# display the { word : id } mapping returned by the pipeline
word_index

{'<OOV>': 1,
 'the': 2,
 'to': 3,
 'a': 4,
 'and': 5,
 'of': 6,
 'in': 7,
 's': 8,
 'was': 9,
 'that': 10,
 'for': 11,
 'on': 12,
 'is': 13,
 'he': 14,
 'it': 15,
 'with': 16,
 'said': 17,
 'his': 18,
 'i': 19,
 'as': 20,
 'at': 21,
 'have': 22,
 'by': 23,
 'from': 24,
 'be': 25,
 'has': 26,
 'her': 27,
 'but': 28,
 'they': 29,
 'she': 30,
 'are': 31,
 'an': 32,
 'who': 33,
 'this': 34,
 'had': 35,
 'not': 36,
 'after': 37,
 'we': 38,
 'their': 39,
 'been': 40,
 'were': 41,
 'will': 42,
 't': 43,
 'when': 44,
 'you': 45,
 'one': 46,
 'which': 47,
 'up': 48,
 'about': 49,
 'out': 50,
 'year': 51,
 'more': 52,
 'would': 53,
 'there': 54,
 'also': 55,
 'or': 56,
 'people': 57,
 'can': 58,
 'so': 59,
 'all': 60,
 'two': 61,
 'him': 62,
 'what': 63,
 'if': 64,
 'into': 65,
 'first': 66,
 'new': 67,
 'them': 68,
 'time': 69,
 'just': 70,
 'over': 71,
 'no': 72,
 'police': 73,
 'last': 74,
 'than': 75,
 'told': 76,
 'mr': 77,
 'my': 78,
 'years': 79,
 'before': 80,
 'could': 81,
 'being': 82,

In [11]:
# display the inverse { id : word } mapping built from word_index
reverse_word_index

{1: '<OOV>',
 2: 'the',
 3: 'to',
 4: 'a',
 5: 'and',
 6: 'of',
 7: 'in',
 8: 's',
 9: 'was',
 10: 'that',
 11: 'for',
 12: 'on',
 13: 'is',
 14: 'he',
 15: 'it',
 16: 'with',
 17: 'said',
 18: 'his',
 19: 'i',
 20: 'as',
 21: 'at',
 22: 'have',
 23: 'by',
 24: 'from',
 25: 'be',
 26: 'has',
 27: 'her',
 28: 'but',
 29: 'they',
 30: 'she',
 31: 'are',
 32: 'an',
 33: 'who',
 34: 'this',
 35: 'had',
 36: 'not',
 37: 'after',
 38: 'we',
 39: 'their',
 40: 'been',
 41: 'were',
 42: 'will',
 43: 't',
 44: 'when',
 45: 'you',
 46: 'one',
 47: 'which',
 48: 'up',
 49: 'about',
 50: 'out',
 51: 'year',
 52: 'more',
 53: 'would',
 54: 'there',
 55: 'also',
 56: 'or',
 57: 'people',
 58: 'can',
 59: 'so',
 60: 'all',
 61: 'two',
 62: 'him',
 63: 'what',
 64: 'if',
 65: 'into',
 66: 'first',
 67: 'new',
 68: 'them',
 69: 'time',
 70: 'just',
 71: 'over',
 72: 'no',
 73: 'police',
 74: 'last',
 75: 'than',
 76: 'told',
 77: 'mr',
 78: 'my',
 79: 'years',
 80: 'before',
 81: 'could',
 82: 'being',