In [None]:
#!pip install --upgrade numpy --user
#!pip install torch>=1.2.0 transformers>=2.5.0 --user
#!pip install nlpaug numpy matplotlib python-dotenv --user

In [1]:
import os
import re
import sys
import nltk
import glob
import math
import random
import gensim
import numpy as np
import pandas as pd
nltk.download('punkt')
nltk.download('wordnet')
from gensim import corpora
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
import gensim.downloader as api
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import nlpaug.augmenter.word as naw
from nltk.tokenize import word_tokenize
from nltk.cluster.util import cosine_distance
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to C:\Users\Shrikanth
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Shrikanth
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Augmentation Operations  - SECTION 4.4.3
* Random Noise Injection - Random Swap, Random Insertion, Random Deletion.
* Synonym Replacement
* Similar Word Replacement
* Context Word Insertion

In [None]:
def get_only_chars(line):
    '''
    Keep only the characters a-z and spaces of a line of text.

    Every other character (upper-case letters, digits, punctuation, ...) is
    turned into a space, runs of spaces are collapsed into a single space and
    one leading space is removed. A trailing space, if present, is kept.

    param line: Text to clean.
    return: The cleaned text; '' when the input is empty or holds no a-z characters.
    '''
    allowed_chars = 'qwertyuiopasdfghjklzxcvbnm '
    clean_line = ''.join(char if char in allowed_chars else ' ' for char in line)

    clean_line = re.sub(' +', ' ', clean_line)  # delete extra spaces
    # startswith() rather than clean_line[0]: indexing an empty string
    # raised IndexError when the input line was empty.
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line


# Augmentation parameter α: defines how much augmentation is performed per sentence in the document.
# The operations below derive their word budget as n = max(1, int(alpha * number_of_words)).

alpha_sr=0.2  # synonym replacement
alpha_ri=0.2  # random insertion
alpha_rs=0.2  # random swap
alpha_swr=0.2  # similar word replacement

########################################################################
# Synonym replacement
# Replace up to n words in the sentence with synonyms from WordNet.
# All occurrences of a selected word are replaced by one randomly chosen synonym.
# Details about synsets - https://www.nltk.org/howto/wordnet.html
########################################################################
def synonym_replacement(words):
    '''
    Replace up to n distinct words of a sentence with WordNet synonyms.

    n is max(1, int(alpha_sr * len(words))). The distinct tokens of the
    sentence are visited in random order; for each one that has at least one
    synonym, every occurrence is replaced by the same randomly chosen synonym.
    The resulting sentence is appended to
    text_augmentation_pipeline.Augmented_sentences.

    param words: List of tokens of one sentence. It is not modified.
    return: None.
    '''
    num_words = len(words)
    n = max(1, int(alpha_sr*num_words))
    new_words = words.copy()

    # Visit every distinct word once, in random order. Walking the sentence
    # left to right always rewrote its first words, and a repeated word was
    # counted as a second replacement although it no longer changed anything.
    candidate_words = list(dict.fromkeys(words))
    random.shuffle(candidate_words)

    num_replaced = 0
    for random_word in candidate_words:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            break

    sentence = ' '.join(new_words)
    text_augmentation_pipeline.Augmented_sentences.append(sentence)

def get_synonyms(word):
    '''
    Collect the WordNet synonyms of a word.

    Each lemma name of each synset of the word is lower-cased, '_' and '-'
    are turned into spaces and every character other than a-z and space is
    dropped. Synonyms that end up empty, or equal to the word itself, are
    left out.

    param word: Word to look up in WordNet.
    return: List of distinct synonyms in the order WordNet yields them
            (empty list if there are none).
    '''
    allowed_chars = ' qwertyuiopasdfghjklzxcvbnm'
    # A dict is used as an insertion-ordered set so the result order does not
    # depend on string hashing, which changes between interpreter runs.
    synonyms = {}
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed_chars).strip()
            # Lemmas such as "1" are reduced to an empty string by the filter
            # above; using them would blank out words of the sentence.
            if synonym and synonym != word:
                synonyms[synonym] = None
    return list(synonyms)


########################################################################
# Random swap
# Randomly swap two words in the sentence n times
# Two words are chosen from a sentence randomly and their positions are swapped
########################################################################

def random_swap(words):
    '''
    Swap randomly chosen word pairs of a sentence.

    swap_word() is applied max(1, int(alpha_rs * len(words))) times to a copy
    of the tokens; the joined result is appended to
    text_augmentation_pipeline.Augmented_sentences.

    param words: List of tokens of one sentence. It is not modified.
    return: None.
    '''
    swaps_left = max(1, int(alpha_rs * len(words)))
    shuffled = words.copy()
    while swaps_left > 0:
        shuffled = swap_word(shuffled)
        swaps_left -= 1

    text_augmentation_pipeline.Augmented_sentences.append(' '.join(shuffled))


def swap_word(new_words):
    '''
    Swap the words at two randomly chosen positions, in place.

    The second position is redrawn while it equals the first one; after more
    than three draws the list is returned unchanged.

    param new_words: List of tokens; it is modified in place.
    return: The same list object.
    '''
    # With fewer than two words there is nothing to swap, and
    # random.randint(0, -1) raises ValueError for an empty list.
    if len(new_words) < 2:
        return new_words

    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words):
    '''
    Insert words into a sentence at random positions.

    add_word() is called max(1, int(alpha_ri * len(words))) times on a copy
    of the tokens; the joined result is appended to
    text_augmentation_pipeline.Augmented_sentences.

    param words: List of tokens of one sentence. It is not modified.
    return: None.
    '''
    insertions = max(1, int(alpha_ri * len(words)))
    grown = words.copy()
    step = 0
    while step < insertions:
        add_word(grown)
        step += 1
    text_augmentation_pipeline.Augmented_sentences.append(' '.join(grown))

def add_word(new_words):
    '''
    Insert a synonym of a randomly chosen word at a random position, in place.

    Up to 10 randomly drawn words are tried until one with at least one
    synonym is found; if none is found the list is left untouched.

    param new_words: List of tokens; it is modified in place.
    return: None.
    '''
    # Nothing to draw from: random.randint(0, -1) raises ValueError.
    if not new_words:
        return

    synonyms = []
    # for/else instead of a counter: the counter version returned on the
    # 10th attempt even when that attempt had found synonyms.
    for _ in range(10):
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            break
    else:
        return

    # Draw the synonym at random instead of always taking the first one.
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

    
#########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
#########################################################################

def random_deletion(words, p=0.1):
    '''
    Delete each word of a sentence independently with probability p.

    A word is kept when a draw from random.uniform(0, 1) is greater than p.
    If every word of a non-empty sentence gets deleted, one randomly chosen
    word is kept so that the sentence does not vanish from the document.
    The result is appended to text_augmentation_pipeline.Augmented_sentences.

    param words: List of tokens of one sentence. It is not modified.
    param p: Deletion probability per word.
    return: None.
    '''
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # Short sentences can lose all their words; fall back to a single word.
    if not new_words and words:
        new_words = [random.choice(words)]

    sentence = ' '.join(new_words)
    text_augmentation_pipeline.Augmented_sentences.append(sentence)

    
#################################################################################
# Replacement by most similar words
# Choose a random word in the sentence and replace it with the most similar word
#################################################################################

def similar_word_replacement(words):
    '''
    Replace up to n distinct words of a sentence with their most similar word.

    n is max(1, int(alpha_swr * len(words))). The distinct tokens are visited
    in random order. For each one, the neighbours returned by
    Word2Vec_model.wv.most_similar() are filtered to those longer than three
    characters and the first of them replaces every occurrence of the word.
    Words unknown to the model (KeyError) are left unchanged. The resulting
    sentence is appended to text_augmentation_pipeline.Augmented_sentences.

    param words: List of tokens of one sentence. It is not modified.
    return: None.
    '''
    num_words = len(words)
    # Uses the module-level alpha_swr like the other operations use their
    # alpha_* constant, rather than a local copy that shadows it.
    n = max(1, int(alpha_swr*num_words))
    new_words = words.copy()

    # Visit every distinct word once, in random order. Walking the sentence
    # left to right always rewrote its first words, and a repeated word was
    # counted as a second replacement although it no longer changed anything.
    candidate_words = list(dict.fromkeys(words))
    random.shuffle(candidate_words)

    num_replaced = 0
    for random_word in candidate_words:
        try:
            neighbours = Word2Vec_model.wv.most_similar(positive=[random_word])
        except KeyError:
            # Word is not in the Word2Vec vocabulary: keep it as it is.
            continue
        filtered_similar_words = [similar_word[0] for similar_word in neighbours
                                  if len(similar_word[0]) > 3]
        if len(filtered_similar_words) > 0:
            most_similar_word = filtered_similar_words[0]
            new_words = [most_similar_word if word == random_word else word for word in new_words]
            num_replaced += 1
            if num_replaced >= n:  # only replace up to n words
                break

    sentence = ' '.join(new_words)
    text_augmentation_pipeline.Augmented_sentences.append(sentence)

#####################################################################
# Contextual word insertion
# BERT is a transformer based model pre-trained on a large amount of text
# data with the masked language modelling strategy, in which certain
# portions of the text are masked and then predicted.
######################################################################
# Module-level augmenter shared by context_word_substitute(). It is configured
# with action="insert", i.e. it adds words rather than replacing them.
# NOTE(review): constructing it presumably loads the 'bert-base-uncased'
# weights when this cell runs - confirm before re-running on a fresh kernel.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
def context_word_substitute(words):
    '''
    Augment a sentence with the module-level contextual augmenter `aug`.

    The tokens are joined into one sentence before they are handed to
    aug.augment(), and the augmented sentence is appended to
    text_augmentation_pipeline.Augmented_sentences like the output of the
    other augmentation operations.

    param words: List of tokens of one sentence (a plain string is also accepted).
    return: None.
    '''
    # The augmenter works on text; a token list would be treated as a batch
    # of separate one-word texts.
    sentence = words if isinstance(words, str) else ' '.join(words)

    if sentence.strip():
        augmented_text = aug.augment(sentence)
        # Depending on the nlpaug version the result is a str or a list of str.
        if isinstance(augmented_text, (list, tuple)):
            augmented_text = ' '.join(augmented_text)
    else:
        augmented_text = sentence  # nothing to augment

    # The bare name `Augmented_sentences` is not defined anywhere; the shared
    # accumulator lives on the pipeline function.
    text_augmentation_pipeline.Augmented_sentences.append(augmented_text)

#### Implementation of equation - 4.8 - Number of Augmentation Set Calculation


In [1]:
def aug_set_calculator(Sumy_csv):
    '''
    Count the records per label and derive the number of augmentation sets.

    For each label, n_aug = floor(Ni_max / Records - 1), where Ni_max is the
    record count of the largest class.

    param Sumy_csv: Dataframe with a 'Labels' column.
    return: Dataframe with the columns 'Labels', 'Records' and 'n_aug',
            one row per label.
    '''
    label_counts = (
        Sumy_csv[['Labels']]
        .groupby(['Labels'])
        .size()
        .reset_index(name='Records')
    )
    largest_class = label_counts['Records'].max()
    label_counts['n_aug'] = [
        math.floor((largest_class / class_size) - 1)
        for class_size in label_counts['Records']
    ]
    return label_counts

#### Implementation of figure 4.5: Text Data Augmentation Pipeline

In [None]:
def text_augmentation_pipeline(df, type_df, label_df=None):
    '''
    Create augmented copies of the documents of a dataframe.

    Each document is split into sentences; every sentence is cleaned with
    get_only_chars(), tokenized and passed to one randomly chosen augmentation
    operation (synonym replacement, random swap, random insertion, random
    deletion or similar word replacement). The operations append their result
    to text_augmentation_pipeline.Augmented_sentences, which is reset for
    every augmented copy and joined into the new 'Summarized_content'.

    param df: Input dataframe for which augmentation has to be performed. Needs the columns
              'Summarized_content', 'Labels', 'DocID', 'Legal_Details' and 'Filename'.
    param type_df: 'WAS' (with augmented set): each document is augmented n_aug times, where n_aug is
                   looked up for the document's label. 'WoAS' (without augmented set): each document
                   is augmented exactly once.
    param label_df: Optional dataframe with the columns 'Labels' and 'n_aug' used for 'WAS'.
                    When omitted, the module-level Sumy_Label_df is used.
    return: Dataframe with one row per augmented document.
    raises ValueError: If type_df is neither 'WAS' nor 'WoAS'.
    '''
    # An unknown mode previously left num_augmented_set undefined (or stale
    # from the previous document).
    if type_df not in ('WAS', 'WoAS'):
        raise ValueError("type_df must be 'WAS' or 'WoAS', got {!r}".format(type_df))

    # Build the label -> n_aug lookup once instead of filtering the label
    # table for every document. The first entry wins for a repeated label.
    n_aug_by_label = {}
    if type_df == 'WAS':
        if label_df is None:
            label_df = Sumy_Label_df
        for label, n_aug in zip(label_df['Labels'], label_df['n_aug']):
            n_aug_by_label.setdefault(label, int(n_aug))

    operations = [synonym_replacement, random_swap, random_insertion,
                  random_deletion, similar_word_replacement]

    # Rows are collected in a list and turned into a dataframe once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    augmented_records = []
    count = 1
    for idx, row in df.iterrows():
        print('Completed for', count)
        count = count + 1

        if type_df == 'WAS':  # With Augmented set
            num_augmented_set = n_aug_by_label[row['Labels']]
        else:  # 'WoAS' - Without Augmented set
            num_augmented_set = 1

        tokenized_text = []
        for sentence in nltk.sent_tokenize(row['Summarized_content']):
            tokens = nltk.word_tokenize(get_only_chars(sentence))
            # A sentence without any a-z character has no tokens; there is
            # nothing to augment and the random operations cannot draw from it.
            if tokens:
                tokenized_text.append(tokens)

        print(num_augmented_set)
        for _ in range(num_augmented_set):
            text_augmentation_pipeline.Augmented_sentences = []
            for tokens in tokenized_text:
                random.choice(operations)(tokens)

            augmented_records.append({
                'Summarized_content': ' '.join(text_augmentation_pipeline.Augmented_sentences),
                'Labels': np.int64(row['Labels']),
                'DocID': row['DocID'],
                'Legal_Details': row['Legal_Details'],
                'Filename': row['Filename'],
            })

    return pd.DataFrame(augmented_records)

#### Implementation of equation - 4.9 

In [None]:
def partial_augmentation(Sumy_csv, Sumy_Label_df, random_state=None):
    '''
    Augment a random part of every class to close the remaining gap to the largest class.

    For a class with Ni records, Ni_max / Ni - 1 augmented copies per record
    are needed to reach the size Ni_max of the largest class. The whole part
    of that number is n_aug (see aug_set_calculator); this function covers
    the fractional part by sampling that fraction of the class and augmenting
    every sampled document once ('WoAS').

    param Sumy_csv: Dataframe with the documents (needs a 'Labels' column plus the columns
                    text_augmentation_pipeline reads).
    param Sumy_Label_df: Dataframe with the columns 'Labels' and 'Records', one row per label.
    param random_state: Seed or random generator handed to DataFrame.sample. When omitted, a
                        module-level `rng` is used if one is defined, otherwise sampling is unseeded.
    return: Dataframe with the augmented documents of all classes (empty if nothing had to be added).
    '''
    if random_state is None:
        # `rng` is not defined in this notebook; only use it when the
        # surrounding session provides one instead of raising NameError.
        random_state = globals().get('rng')

    Ni_max = Sumy_Label_df.Records.max()
    augmented_parts = []

    # Iterate over the labels that are actually present rather than a
    # hard-coded range(1, 16).
    for _label in Sumy_Label_df['Labels']:
        Truncated_data = Sumy_csv[Sumy_csv['Labels'] == _label]
        if len(Truncated_data) == 0:
            continue

        ratio = (Ni_max / len(Truncated_data)) - 1
        if ratio <= 0:
            continue
        # Only the fractional part is sampled: DataFrame.sample raises for
        # frac > 1 without replacement, and the whole part is already
        # produced by the 'WAS' run.
        fraction = ratio - math.floor(ratio)
        if fraction <= 0:
            continue

        Part_truncated_data = Truncated_data.sample(frac=fraction, random_state=random_state)
        if Part_truncated_data.empty:
            continue
        augmented_parts.append(text_augmentation_pipeline(Part_truncated_data, 'WoAS'))

    if not augmented_parts:
        return pd.DataFrame()
    # One concat at the end; DataFrame.append was removed in pandas 2.0.
    return pd.concat(augmented_parts, ignore_index=True, sort=False)

In [None]:
def dataset_dist_plot(df):
    '''
    Show a bar chart with the number of samples per document class.

    The bars follow the order of df['Labels'].value_counts() and each bar is
    annotated with its height. The matplotlib rcParams for figure size and
    font size are changed as a side effect.

    param df: Dataframe with a 'Labels' column.
    return: None.
    '''
    class_counts = dict(df['Labels'].value_counts())
    class_names = list(class_counts.keys())
    sample_counts = list(class_counts.values())
    positions = range(len(class_counts))

    plt.rcParams["figure.figsize"] = (25,10)
    plt.rcParams.update({'font.size': 22})

    drawn_bars = plt.bar(positions, sample_counts, align='center', color = 'b', width=0.7)
    plt.xticks(positions, class_names)
    plt.xlabel("Document Classes")
    plt.ylabel("No. of samples")

    # Write the sample count on top of every bar.
    for drawn_bar in drawn_bars:
        bar_height = drawn_bar.get_height()
        plt.text(drawn_bar.get_x(), bar_height, bar_height)
    plt.show()

In [None]:
if __name__ == '__main__':
    # Driver: load the embedding model and the summarized documents, build the
    # augmented sets, write the combined training data and plot its class
    # distribution.

    # Word2Vec_model is read as a module-level name by similar_word_replacement().
    Word2Vec_model = Word2Vec.load('Thesis - Dataset and Transformations/word2vec/Word2Vec_100d.model')
    # Input: summarized documents. Swap the path to process another summarization variant.
    # For tfidf file: Thesis - Dataset and Transformations/transform - post summarization/LSA_tfidf.csv
    # For term freq: Thesis - Dataset and Transformations/transform - post summarization/LSA_frequency.csv
    Sumy_csv = pd.read_csv('Thesis - Dataset and Transformations/transform - post summarization/LSA_binary.csv')
    
    # Sumy_Label_df (records and n_aug per label) is read as a module-level name
    # by text_augmentation_pipeline() in 'WAS' mode, so it has to be assigned
    # before the next line runs.
    Sumy_Label_df = aug_set_calculator(Sumy_csv)
    # Whole augmented copies: n_aug per document of each class.
    augmented_files = text_augmentation_pipeline(Sumy_csv, 'WAS')
    # One augmented copy for a sampled part of each class.
    augmented_files_II = partial_augmentation(Sumy_csv, Sumy_Label_df)
    
    # Training data = both augmented sets plus the original documents.
    total_training_files = pd.concat([augmented_files, augmented_files_II, Sumy_csv], ignore_index=True)
    # Change the saving path for each form of summarized document input.
    # For tfidf: Thesis - Dataset and Transformations/transform - post text augmentation/lsa_tfidf_augmentation.csv
    # For term freq: Thesis - Dataset and Transformations/transform - post text augmentation/lsa_tf_augmentation.csv
    # NOTE(review): to_csv is called without index=False, so the row index is
    # written as an extra first column - confirm downstream readers expect it.
    total_training_files.to_csv('Thesis - Dataset and Transformations/transform - post text augmentation/lsa_binary_augmentation.csv')
    dataset_dist_plot(total_training_files)