In [None]:
# !pip install numpy requests nlpaug

In [None]:
# Import libraries
import os

import nlpaug.augmenter.word as nlpaw
import numpy as np
import pandas as pd
import tqdm as tqdm
from sklearn.model_selection import train_test_split


# Read the raw Jigsaw CSV files from the input directory
train_valid = pd.read_csv('data_input/train.csv')
test = pd.read_csv('data_input/test.csv')
test_labels = pd.read_csv('data_input/test_labels.csv')

# Sanity check: report how many rows each file contributed
row_counts = (
    ('Our (training + valid) data has ', train_valid),
    ('Our test data has ', test),
    ('Our test label data has ', test_labels),
)
for prefix, frame in row_counts:
    print(prefix, len(frame), ' rows.')

# Show complete text cells when displaying DataFrames (no truncation)
pd.set_option('display.max_colwidth', None)

## Helper Functions

In [None]:
def analyze_dist(df):
    """
    Prints a summary of the class distribution of a DataFrame.

    Input:
        - df:  A pandas DataFrame with a binary 'isToxic' column
               (1 = toxic, 0 = clean).
    Output:
        - None. Row count, clean/toxic counts and the toxic percentage
          (rounded to 2 decimals) are written to stdout.
    """
    n_rows = df.shape[0]
    print('Total rows:           ', n_rows)

    # Summing the 0/1 labels counts the toxic rows
    n_toxic = df['isToxic'].sum()
    print('Clean texts:          ', n_rows - n_toxic)
    print('Toxic texts:          ', n_toxic)

    toxic_pct = ((n_toxic / n_rows) * 100).round(2)
    print('Toxic texts make up   ', toxic_pct, 'percent of our total data')

In [None]:
def combine_toxic_classes(df):
    """
    Reconfigures the Jigsaw Toxic Comment dataset from a
    multi-label classification problem to a
    binary classification problem predicting if a text is
    toxic (class=1) or non-toxic (class=0).

    Only the 'toxic' column decides the binary label; the remaining
    label columns are dropped without being consulted.

    The input DataFrame is NOT modified: a new DataFrame is returned.
    This keeps the function safe to re-run on the same input and avoids
    pandas' SettingWithCopyWarning when `df` is a filtered slice.

    Input:
        - df:  A pandas DataFrame with columns:
               - 'id'
               - 'comment_text'
               - 'toxic'
               - 'severe_toxic'
               - 'obscene'
               - 'threat'
               - 'insult'
               - 'identity_hate'
    Output:
        - A new pandas DataFrame with columns:
               - 'comment_text' containing strings of text.
               - 'isToxic' binary target variable (int8) containing 0's and 1's.
    Raises:
        - KeyError if any of the label columns or 'id' is missing.
    """
    # Derive the binary label first, while 'toxic' is still available.
    # int8 keeps the memory footprint of the label column small.
    is_toxic = (df['toxic'] == 1).astype('int8')

    # drop() without inplace returns a new frame; assign() adds the label
    # column to that new frame, so the caller's DataFrame stays untouched.
    drop_cols = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    return df.drop(columns=drop_cols).assign(isToxic=is_toxic)

In [None]:
def undersample_majority(df, percent_conserve):
    """
    Undersamples the majority class ('isToxic'==0) by conserving
    a given fraction of it as specified by the user.

    Inputs:
        - df:  A pandas DataFrame with columns:
               - 'comment_text' containing strings of text.
               - 'isToxic' binary target variable containing 0's and 1's.
        - percent_conserve:  Float representing the fraction of the
                             majority class (clean texts) to conserve.
                             It is passed to DataFrame.sample(frac=...).
    Outputs:
        - downsampled_df:    A new pandas DataFrame that has been shuffled
                             and has had its majority class downsampled.
                             All 'isToxic'==1 rows are kept. The index is
                             reset to 0..n-1.
    """
    # Get rows of clean and toxic texts
    clean_texts = df[df['isToxic'] == 0]
    toxic_texts = df[df['isToxic'] == 1]

    # Randomly sample from the majority class (fixed seed for reproducibility)
    to_conserve = clean_texts.sample(frac=percent_conserve, random_state=42)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on both old and new pandas versions.
    downsampled_df = pd.concat([to_conserve, toxic_texts], ignore_index=True)

    # Shuffle so the two classes are interleaved, then renumber the rows
    return downsampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
def augment_sentence(sentence, aug, num_threads):
    """
    Constructs a new sentence via text augmentation.

    Input:
        - sentence:     A string of text
        - aug:          An augmentation object defined by the nlpaug library
                        (anything exposing `augment(data, num_thread=...)`)
        - num_threads:  Integer controlling the number of threads to use if
                        augmenting text via CPU
    Output:
        - A string of text that has been augmented. If the augmenter returns
          an empty list, the original sentence is returned unchanged.
    """
    augmented = aug.augment(sentence, num_thread=num_threads)

    # NOTE(review): recent nlpaug releases appear to return a list from
    # augment() even for a single input string (older ones returned the
    # string itself) -- confirm against the installed version. Unwrapping
    # here keeps the documented "returns a string" contract either way, so
    # callers never end up with lists in a text column.
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else sentence
    return augmented
    


def augment_text(df, aug, num_threads, num_times):
    """
    Takes a pandas DataFrame and augments its toxic ('isToxic'==1) text data.

    Input:
        - df:            A pandas DataFrame containing the columns:
                                - 'comment_text' containing strings of text to augment.
                                - 'isToxic' binary target variable containing 0's and 1's.
        - aug:           Augmentation object defined by the nlpaug library.
        - num_threads:   Integer controlling number of threads to use if augmenting
                         text via CPU
        - num_times:     Integer representing the number of augmentation passes
                         to make over the toxic rows.
    Output:
        - A new pandas DataFrame holding the rows of `df` plus the augmented
          rows (all labelled 'isToxic'==1), randomly shuffled with a fixed
          seed. The shuffled index is not reset.
    """
    # Get rows of data to augment (minority class only)
    to_augment = df[df['isToxic'] == 1]
    to_augmentX = to_augment['comment_text']

    # Every augmented text is toxic: one label per text per pass
    to_augmentY = np.ones(len(to_augmentX.index) * num_times, dtype=np.int8)

    # Build up dictionary containing augmented data, one full pass at a time
    aug_dict = {'comment_text': [], 'isToxic': to_augmentY}
    for _ in tqdm.tqdm(range(num_times)):
        augX = [augment_sentence(x, aug, num_threads) for x in to_augmentX]
        aug_dict['comment_text'].extend(augX)

    # Build DataFrame containing augmented data
    aug_df = pd.DataFrame.from_dict(aug_dict)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on both old and new pandas versions.
    combined = pd.concat([df, aug_df], ignore_index=True)
    return combined.sample(frac=1, random_state=42)

## Generate Training and Validation Datasets

In [None]:
# Convert from multi-label --> binary classification
train_valid = combine_toxic_classes(train_valid)

# Undersample majority class (class=0), conserving 42% of the clean texts
unbalanced_df = undersample_majority(train_valid, .42)

# Upsample minority class (class=1) to create a roughly 50-50 class distribution.
# The augmented data is loaded from a previously saved CSV (presumably the
# output of the commented-out augmentation below -- confirm). To regenerate it,
# uncomment the two lines and save the result to 'data_input/downsample_aug.csv'.
# aug5p = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_min=1, aug_p=0.05, action="substitute")
# balanced_df = augment_text(unbalanced_df, aug5p, 8, 3)
balanced_df = pd.read_csv('data_input/downsample_aug.csv', index_col=0)

# Generate 80-20 train-validation splits, stratified on the label so both
# splits keep the class ratio of their source DataFrame.
X_train, X_valid, y_train, y_valid = train_test_split(unbalanced_df['comment_text'],
                                                    unbalanced_df['isToxic'],
                                                    train_size=0.8,
                                                    stratify=unbalanced_df['isToxic'],
                                                    shuffle=True,
                                                    random_state=42)

# NOTE(review): the balanced data is split after augmentation, so augmented
# variants of a training text may land in the validation split -- confirm
# this is intended.
X_train_aug, X_valid_aug, y_train_aug, y_valid_aug = train_test_split(balanced_df['comment_text'],
                                                                    balanced_df['isToxic'],
                                                                    train_size=0.8,
                                                                    stratify=balanced_df['isToxic'],
                                                                    shuffle=True,
                                                                    random_state=42)

# to_csv does not create missing parent directories, so make sure the
# output folders exist before writing (no-op when they already do).
os.makedirs('data_output/unbalanced_splits', exist_ok=True)
os.makedirs('data_output/balanced_splits', exist_ok=True)

# Output unbalanced data
X_train.to_csv('data_output/unbalanced_splits/X_train.csv', index=False)
X_valid.to_csv('data_output/unbalanced_splits/X_valid.csv', index=False)
y_train.to_csv('data_output/unbalanced_splits/y_train.csv', index=False)
y_valid.to_csv('data_output/unbalanced_splits/y_valid.csv', index=False)

# Output balanced data
X_train_aug.to_csv('data_output/balanced_splits/X_train_aug.csv', index=False)
X_valid_aug.to_csv('data_output/balanced_splits/X_valid_aug.csv', index=False)
y_train_aug.to_csv('data_output/balanced_splits/y_train_aug.csv', index=False)
y_valid_aug.to_csv('data_output/balanced_splits/y_valid_aug.csv', index=False)



## Generate Test Dataset

In [None]:
# Read in the data
test_labels = pd.read_csv('data_input/test_labels.csv')
test = pd.read_csv('data_input/test.csv')

# Merge test text data with its labels.
# With how='left', a test id that is absent from test_labels would get NaN
# labels, pass the `!= -1` filter below and end up labelled non-toxic (0).
test_merged = test.merge(test_labels, on='id', how='left')

# Remove masked rows (toxic == -1) that will not be tested.
# .copy() gives an independent frame, so the column assignment done inside
# combine_toxic_classes cannot raise pandas' SettingWithCopyWarning.
test_merged = test_merged[test_merged['toxic'] != -1].copy()

# Convert from multi-label --> binary classification
test_merged = combine_toxic_classes(test_merged)

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it already does).
os.makedirs('data_output', exist_ok=True)

# Output data
test_merged.to_csv('data_output/test_merged.csv', index=False)