# Sebastian Petrik - Abstractive summarization - Data preparation and preprocessing

This notebook preprocesses the datasets, splits them into train, validation and test sets, and outputs the prepared frames with special tokens, ready for tokenization and training / inference / evaluation.

## Setup

In [None]:
# Seed for random generators. It is passed as `random_state` to the
# train/validation/test splits so that they are reproducible between runs.
SEED = 42

In [None]:
!pip install openpyxl contractions --quiet

In [None]:
import os
# Prints the value of the KAGGLE_CONTAINER_NAME environment variable, or None
# when it is not set (presumably meaning the notebook runs outside Kaggle).
print(os.environ.get('KAGGLE_CONTAINER_NAME')) # check if kaggle

In [None]:
def installed_versions(packages):
    """Return sorted ``(name, version)`` pairs for the installed *packages*.

    Packages that are not installed are silently left out. The standard
    library ``importlib.metadata`` is used when available (Python 3.8+);
    older interpreters fall back to the deprecated ``pkg_resources``.
    """
    try:
        from importlib import metadata
    except ImportError:
        # Python < 3.8: keep the original pkg_resources based lookup.
        import pkg_resources
        wanted = set(packages)
        return sorted(
            (dist.key, dist.version)
            for dist in pkg_resources.working_set
            if dist.key in wanted
        )

    found = []
    for name in packages:
        try:
            found.append((name, metadata.version(name)))
        except metadata.PackageNotFoundError:
            continue
    return sorted(found)


# Versions of the libraries this notebook depends on (displayed as cell output)
installed_versions([
    'numpy', 'pandas', 'tensorflow', 'tensorflow-text', 'keras',
    'tensorflow-estimator', 'tensorflow-datasets', 'contractions',
])

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict
import string
import tensorflow as tf
import re
import os
import time
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import operator as op
import spacy
import contractions

## Data loading

In [None]:
# show available data
!ls ../input
!ls ../input/inshorts-news-data
!ls ../input/news-summarization

In [None]:
# Inshorts dataset (Excel workbook): load it and drop the metadata columns
# that are not used below. Note that the 'Source ' and 'Time ' column names
# carry a trailing space.
inshorts_raw = pd.read_excel(
    "../input/inshorts-news-data/Inshorts Cleaned Data.xlsx",
    engine='openpyxl',
).drop(columns=['Source ', 'Time ', 'Publish Date'])
inshorts_raw

In [None]:
# Combined news-summarization CSV (3 datasets, 4GB)
news_sum_combined = pd.read_csv('../input/news-summarization/data.csv')
# The first column is the index column written with the CSV, so drop it.
news_sum_combined = news_sum_combined.drop(columns=news_sum_combined.columns[0])
news_sum_combined

## Preprocessing

In [None]:
class Preprocessor:
    """Clean raw article/summary text and prepare dataset frames.

    Text is lowercased, stripped of markup noise and most punctuation, has its
    contractions expanded and its URLs reduced to the host name. Frames are
    cleaned column-wise, wrapped in ``<sos>``/``<eos>`` tokens and filtered by
    approximate length.

    Args:
        dot_token_enabled: when True, a dot followed by whitespace is kept as a
            standalone ``" . "`` token; when False it is replaced by a space.
    """

    # HTML character references: numeric (&#160;, &#x27;) and named (&amp;)
    _HTML_ENTITY = re.compile(r"&#?[a-zA-Z0-9]+;")
    # tab, carriage return and newline characters
    _LINE_WHITESPACE = re.compile(r"[\t\r\n]")
    # two or more consecutive occurrences of the same character out of _ - ~ + .
    _REPEATED_FILLER = re.compile(r"([_~+.\-])\1+")
    # special characters <>()|&©ø[]'",;?~*!
    _SPECIAL_CHARS = re.compile(r"[<>()|&©ø\[\]\'\",;?~*!]")
    # literal backslash-x9<digit> sequences left in the text
    _ESCAPED_X9 = re.compile(r"\\x9\d")
    # INC<number> identifiers
    _INC_NUMBER = re.compile(r"[iI][nN][cC]\d+")
    # CM<number> and CHG<number> identifiers
    _CHANGE_NUMBER = re.compile(r"[cC][mM]\d+|[cC][hH][gG]\d+")
    # http(s) URL; group 1 is the host part (everything up to the first slash)
    _URL = re.compile(r"https?:/*([^/\s]+)\S*")
    _DOT_BEFORE_SPACE = re.compile(r"\.\s+")
    _DASH_BEFORE_SPACE = re.compile(r"-\s+")
    _COLON_BEFORE_SPACE = re.compile(r":\s+")
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __init__(self, dot_token_enabled=True):
        self.dot_token_enabled = dot_token_enabled

    @staticmethod
    def _strip_html_entities(text: str) -> str:
        """Replace every HTML character reference in *text* with a space."""
        return Preprocessor._HTML_ENTITY.sub(' ', text)

    @staticmethod
    def _shorten_urls(text: str) -> str:
        """Replace each http(s) URL in *text* with its own host name.

        Every URL is handled independently and only the URL itself is
        consumed, so the words around it are left untouched.
        """
        return Preprocessor._URL.sub(r'\1', text)

    # Text cleanup
    def clean_text(self, text: str) -> str:
        """Return the cleaned, lowercased form of *text*.

        The steps run in this order: lowercase; remove HTML entities and
        tab/CR/LF characters; collapse repeated ``_ - ~ + .`` runs; expand
        contractions; remove special characters; clean up mail-data artefacts;
        reduce URLs to their host; handle dots, dashes and colons that are
        followed by whitespace; squeeze whitespace; drop one trailing dot.

        Args:
            text: raw text; any other value is converted with ``str()`` first.

        Returns:
            The cleaned text, without leading or trailing whitespace.
        """
        # lowercase
        text = str(text).lower()

        # remove &-escaped characters and escaped whitespace characters
        text = self._strip_html_entities(text)
        text = self._LINE_WHITESPACE.sub(' ', text)

        # remove _ - ~ + . when they occur more than once consecutively
        text = self._REPEATED_FILLER.sub(' ', text)

        # fix contractions to base form
        text = contractions.fix(text)

        # remove special tokens <>()|&©ø"',;?~*!
        text = self._SPECIAL_CHARS.sub(' ', text).lower()

        # CNN mail data cleanup
        text = text.replace("mailto:", ' ')
        text = self._ESCAPED_X9.sub(' ', text)
        # the upper-case placeholders are lowercased again further below
        text = self._INC_NUMBER.sub('INC_NUM', text)
        text = self._CHANGE_NUMBER.sub('CM_NUM', text)

        # url replacement into base form
        text = self._shorten_urls(text)

        # handle dot at the end of words: keep it as a token or remove it
        text = self._DOT_BEFORE_SPACE.sub(' . ' if self.dot_token_enabled else ' ', text)

        # remove - and : at the end of words (not between); the order of
        # these substitutions matters, so they stay separate passes
        text = self._DASH_BEFORE_SPACE.sub(' ', text)
        text = self._COLON_BEFORE_SPACE.sub(' ', text)

        # remove multiple spaces
        text = self._WHITESPACE_RUN.sub(' ', text)

        # apply lowercase again
        text = text.lower().strip()

        # remove trailing dot, we will apply end of sequence anyway
        if text.endswith('.'):
            text = text[:-1].strip()

        return text

    def apply_special_tokens(self, text) -> str:
        """Wrap the stripped *text* as ``"<sos> ... <eos>"``.

        Dots are left as they are; no dedicated ``<dot>`` token is applied.
        """
        return "<sos> " + str(text).strip() + " <eos>"

    def remove_special_tokens(self, text: str) -> str:
        """Lowercase *text*, drop ``<sos>``/``<eos>`` and show ``<unk>`` as ``##``."""
        text = text.lower()
        text = text.replace("<sos>", "").replace("<eos>", "")
        text = text.replace("<unk>", "##")
        return text.strip()

    def plot_approx_lengths(self, df: pd.DataFrame):
        """Plot the distributions of the approximate article and summary lengths."""
        sns.displot(df['article_len_approx'])
        sns.displot(df['summary_len_approx'])

    # preprocess sentence dataframe from raw format - clean, apply sos/eos tokens
    # - removes articles with length outside bounds
    def preprocess_frame(self, df: pd.DataFrame, article_len_range, summary_len_range):
        """Clean a raw frame and keep only the rows within the length bounds.

        Args:
            df: frame with ``article`` and ``summary`` text columns. It is
                copied first, so the caller's frame is not modified.
            article_len_range: inclusive ``(min, max)`` approximate article length.
            summary_len_range: inclusive ``(min, max)`` approximate summary length.

        Returns:
            A new frame with cleaned, ``<sos>``/``<eos>``-wrapped ``article`` and
            ``summary`` columns plus ``article_len_approx`` and
            ``summary_len_approx``, restricted to rows inside both ranges.
        """
        print("Preprocessing frame...")

        # work on a copy so the column assignments do not touch the input frame
        df = df.copy()

        # apply text cleaning
        df['article'] = df['article'].apply(self.clean_text)
        df['summary'] = df['summary'].apply(self.clean_text)

        # apply special tokens
        df['article'] = df['article'].apply(self.apply_special_tokens)
        df['summary'] = df['summary'].apply(self.apply_special_tokens)

        # simple text length approximation for analysis: the number of spaces,
        # counted after the special tokens have been added
        df['article_len_approx'] = df['article'].apply(lambda text: text.count(' '))
        df['summary_len_approx'] = df['summary'].apply(lambda text: text.count(' '))

        print("Original length distribution:")
        self.plot_approx_lengths(df)

        # remove rows whose lengths fall outside the given ranges
        article_min, article_max = article_len_range
        summary_min, summary_max = summary_len_range

        df = df[
            (df['article_len_approx'] <= article_max) &
            (df['article_len_approx'] >= article_min) &
            (df['summary_len_approx'] <= summary_max) &
            (df['summary_len_approx'] >= summary_min)
        ]

        # print plots
        print("After processing length distribution:")
        self.plot_approx_lengths(df)

        return df
    
# Preprocessor instance used by the dataset cells; with dot_token_enabled=False
# a dot followed by whitespace is replaced by a space instead of being kept
# as a separate " . " token.
preprocessor = Preprocessor(dot_token_enabled=False)

# Test: clean a deliberately messy sentence (the cell displays the result)
preprocessor.clean_text("  This text (my text ) isn't ]] very - clean.  it WOULd'Ve been betteR? if, it was ok  ")

In [None]:
# Preview one prepared Inshorts text (row 100, 'Short' column): cleaned and
# then wrapped in <sos> / <eos>.
preprocessor.apply_special_tokens(preprocessor.clean_text(inshorts_raw.iloc[100]['Short']))

## Inshorts dataset

In [None]:
# Inshorts: 'Short' is used as the article and 'Headline' as the summary.
# The two tuples are the (min, max) approximate lengths (space counts) to
# keep: 10-70 for articles and 3-16 for summaries.
df_inshorts = preprocessor.preprocess_frame(
        inshorts_raw.rename({"Short": "article", "Headline": "summary"}, axis=1).reset_index(drop=True),
        (10, 70),
        (3, 16)
)
print(df_inshorts.describe())
df_inshorts

In [None]:
# Newssum combined -> get xsum
# Print the number of rows per value of the 'Dataset' column.
print('News sum available datasets:', news_sum_combined.groupby('Dataset')['Dataset'].count())

# Keep only the rows labelled 'XSum'.
xsum_raw = news_sum_combined[ news_sum_combined['Dataset'] == 'XSum' ]

print('Preparing xsum ...')
# 'Content' is used as the article and 'Summary' as the summary; keep
# approximate lengths (space counts) of 10-300 for articles and 3-40 for
# summaries.
df_xsum = preprocessor.preprocess_frame(
    xsum_raw.rename({"Content": "article", "Summary": "summary"}, axis=1).reset_index(drop=True),
    (10, 300),
    (3, 40)
)
print(df_xsum.describe())
df_xsum

In [None]:
print('Preparing xsum shorter ...')
# Same XSum rows as xsum_raw, but with the article length capped at 150
# instead of 300 (approximate lengths, i.e. space counts); summaries stay 3-40.
df_xsum_shorter = preprocessor.preprocess_frame(
    xsum_raw.rename({"Content": "article", "Summary": "summary"}, axis=1).reset_index(drop=True),
    (10, 150),
    (3, 40)
)
print(df_xsum_shorter.describe())
df_xsum_shorter

## Assemble final train, validation and test sets

In [None]:
# Drop rows that are the only member of their article-length group, so that a
# split stratified by article length has at least two rows per group.
def ensure_min_summary_length_group_size(df: pd.DataFrame):
    """Return the rows of *df* whose ``article_len_approx`` value is shared.

    Rows are grouped by ``article_len_approx`` and each group is sized by its
    count of ``summary_len_approx`` values; rows in groups of size one are
    left out. The input frame itself is not modified.
    """
    rows_per_article_length = (
        df.groupby('article_len_approx')['summary_len_approx'].transform('count')
    )
    has_peer = rows_per_article_length > 1
    return df[has_peer]

# Stratified split into train, validation, test by article length
def stratified_triple_split_by_article(df: pd.DataFrame, test_size=0.1, val_size=0.1):
    """Split *df* into train, validation and test frames.

    Both splits are stratified by ``article_len_approx`` and seeded with
    ``SEED``. Rows whose article length occurs only once are dropped before
    each split, because a stratified split needs at least two rows per group.

    Args:
        df: frame with ``article_len_approx`` and ``summary_len_approx`` columns.
        test_size: fraction of the rows to use for the test set.
        val_size: fraction of the rows to use for the validation set, expressed
            relative to the same full set as ``test_size``.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple.
    """
    df = ensure_min_summary_length_group_size(df)
    df_trainval, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df['article_len_approx'],
        random_state=SEED
    )

    # The second split only sees the remaining (1 - test_size) share of the
    # rows, so the validation fraction has to be rescaled by that share to end
    # up at val_size of the full set.
    val_fraction = val_size / (1 - test_size)

    # article lengths that became unique after the test split are dropped again
    df_trainval = ensure_min_summary_length_group_size(df_trainval)
    df_train, df_val = train_test_split(
        df_trainval,
        test_size=val_fraction,
        stratify=df_trainval['article_len_approx'],
        random_state=SEED
    )

    return df_train, df_val, df_test

def save_dataset(name: str, val_size: float, test_size: float, frame: pd.DataFrame):
    """Split *frame* into train/validation/test sets and write them as CSV files.

    The files are written to the current working directory as
    ``<name>_v<val_size>_t<test_size>_{train,val,test}.csv``.

    Args:
        name: dataset name, used as the file-name prefix.
        val_size: fraction of the rows to use for the validation set.
        test_size: fraction of the rows to use for the test set.
        frame: prepared frame with ``article``, ``summary``,
            ``article_len_approx`` and ``summary_len_approx`` columns.
    """
    print('Saving dataset ' + name)

    # keep only the columns that are saved and start from a clean 0..n-1 index
    frame = frame[['article', 'summary', 'article_len_approx', 'summary_len_approx']].reset_index(drop=True)

    train, val, test = stratified_triple_split_by_article(
        frame,
        test_size=test_size,
        val_size=val_size
    )

    # ':g' keeps float artefacts such as 0.8999999999999999 out of the output
    print(f"Train:      {train.shape} - {1 - test_size - val_size:g}")
    print(f"Validation: {val.shape} - {val_size}")
    print(f"Test:       {test.shape} - {test_size}")

    prefix = name + f"_v{val_size}_t{test_size}"
    train.to_csv(prefix + "_train.csv")
    val.to_csv(prefix + "_val.csv")
    test.to_csv(prefix + "_test.csv")

    print('Split and saved dataset ' + name)
    

In [None]:
# save inshorts in two variants: 10% validation / 10% test and 5% / 5%
# (arguments are: name, val_size, test_size, frame)
save_dataset('inshorts_10-70_3-16', 0.1, 0.1, df_inshorts)
save_dataset('inshorts_10-70_3-16', 0.05, 0.05, df_inshorts)

In [None]:
# save xsum in two variants: 10% validation / 10% test and 5% / 5%
# (arguments are: name, val_size, test_size, frame)
save_dataset('xsum_10-300_3-40', 0.1, 0.1, df_xsum)
save_dataset('xsum_10-300_3-40', 0.05, 0.05, df_xsum)

In [None]:
# save xsum shorter in two variants: 10% validation / 10% test and 5% / 5%
# (arguments are: name, val_size, test_size, frame)
save_dataset('xsum_10-150_3-40', 0.1, 0.1, df_xsum_shorter)
save_dataset('xsum_10-150_3-40', 0.05, 0.05, df_xsum_shorter)

In [None]:
!ls

In [None]:
print("Done")