In [None]:
# !pip install numpy requests nlpaug

## Initial Setup

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from sklearn.model_selection import train_test_split
import tqdm as tqdm

# Import utility functions
from src.utils.data_utils import analyze_dist
from src.utils.data_utils import augment_sentence
from src.utils.data_utils import augment_text
from src.utils.data_utils import combine_toxic_classes
from src.utils.data_utils import get_relevant_words
from src.utils.data_utils import undersample_majority



# Show the complete text of each cell when frames are displayed (no truncation)
pd.set_option('display.max_colwidth', None)

# Read the raw CSVs: the combined train/validation pool, the test set,
# and the separate label file for the test set
train_valid = pd.read_csv('data/raw/train.csv')
test = pd.read_csv('data/raw/test.csv')
test_labels = pd.read_csv('data/raw/test_labels.csv')

# Sanity check: report how many rows each frame contains
print('Our (training + valid) data has ', len(train_valid), ' rows.')
print('Our test data has ', len(test), ' rows.')
print('Our test label data has ', len(test_labels), ' rows.')

## Generate 'Unbalanced' Dataset

In [None]:
# Collapse the multi-label annotations into one binary classification target
train_valid = combine_toxic_classes(train_valid)

# Shrink the majority class (class=0).
# NOTE(review): the exact meaning of 0.42 is defined inside
# undersample_majority -- confirm against src/utils/data_utils.
unbalanced_df = undersample_majority(train_valid, 0.42)

# Seeded, stratified 80/20 train/validation split of comment text vs. label
X_train, X_valid, y_train, y_valid = train_test_split(
    unbalanced_df['comment_text'],
    unbalanced_df['isToxic'],
    train_size=0.8,
    shuffle=True,
    stratify=unbalanced_df['isToxic'],
    random_state=42,
)

# Write each split of the unbalanced dataset to disk (row index omitted)
X_train.to_csv('data/processed/unbalanced_dataset/X_train.csv', index=False)
y_train.to_csv('data/processed/unbalanced_dataset/y_train.csv', index=False)
X_valid.to_csv('data/processed/unbalanced_dataset/X_valid.csv', index=False)
y_valid.to_csv('data/processed/unbalanced_dataset/y_valid.csv', index=False)

## Generate 'Balanced' Dataset

In [None]:
# Assemble the training frame (comment text + label) whose data we will augment
to_aug = pd.concat([X_train, y_train], axis=1)

# Select the first 128 words of text (the maximum token length we will be using)
# so that augmentation is only applied to these words.
# Series.apply returns a NEW Series and does not modify the column in place,
# so the result has to be assigned back; without the assignment the
# truncation is silently discarded and full-length comments get augmented.
to_aug['comment_text'] = to_aug['comment_text'].apply(
    lambda text: get_relevant_words(text, 128)
)

# Define nlpaug augmentation object: contextual (BERT) word substitution
# configured with aug_p=0.1 and aug_min=1
aug10p = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_min=1, aug_p=0.1, action="substitute")

# Upsample minority class ('isToxic'==1) to create a roughly 50-50 class distribution.
# NOTE(review): the meaning of the positional arguments 8 and 3 is defined by
# augment_text in src/utils/data_utils -- confirm there before changing them.
balanced_df = augment_text(to_aug, aug10p, 8, 3)

# Get splits for Balanced Dataset.
# Only the training data is augmented; the validation split is reused unchanged.
X_train_aug = balanced_df['comment_text']
X_valid_aug = X_valid
y_train_aug = balanced_df['isToxic']
y_valid_aug = y_valid

# Output balanced data (row index omitted)
X_train_aug.to_csv('data/processed/balanced_dataset/X_train_aug.csv', index=False)
X_valid_aug.to_csv('data/processed/balanced_dataset/X_valid_aug.csv', index=False)
y_train_aug.to_csv('data/processed/balanced_dataset/y_train_aug.csv', index=False)
y_valid_aug.to_csv('data/processed/balanced_dataset/y_valid_aug.csv', index=False)

## Generate Test Dataset

In [None]:
# Attach the labels to the test rows by 'id' (left join: every test row is kept)
test_merged = pd.merge(test, test_labels, how='left', on='id')

# Discard rows whose 'toxic' value is -1 -- these are masked and will not be tested
test_merged = test_merged.loc[test_merged['toxic'] != -1]

# Collapse the multi-label annotations into one binary classification target
test_merged = combine_toxic_classes(test_merged)

# Write the merged, filtered test set to disk (row index omitted)
test_merged.to_csv('data/processed/test_merged.csv', index=False)