# Toxic Comment Classification

Data Augmentation

#### Dissertation Project
### Name: Renee Mendonca
### Student Number: 221040908


In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# files below that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet

# Mount Google Drive
drive.mount('/content/drive')

# Ensure necessary NLTK data files are downloaded
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

# Update the file paths to point to your Google Drive
training_data_file = "/content/drive/My Drive/toxic-comment-classification/OLIDv1/olid-training-v1.0.tsv"
labels_b_file = "/content/drive/My Drive/toxic-comment-classification/OLIDv1/labels-levelb.csv"

# Load training data
try:
    training_data = pd.read_csv(training_data_file, delimiter='\t')
    labels_b = pd.read_csv(labels_b_file, header=None, names=['id', 'label'])
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    # Re-raise instead of calling the interactive-only `exit` helper, so the
    # cell stops here with the real traceback and nothing below runs on
    # missing data.
    raise

# Print the first few rows to verify
print("Training data head:")
print(training_data.head())
print("Labels B head:")
print(labels_b.head())

# Print column names to debug the issue
print("Training data columns:")
print(training_data.columns)
print("Labels B columns:")
print(labels_b.columns)

# Rename the text column; downstream code reads it as 'comment'
training_data = training_data.rename(columns={"tweet": "comment"})
# Check for common IDs
print("Unique IDs in training_data:", training_data['id'].unique())
print("Unique IDs in labels_b:", labels_b['id'].unique())

# Merge data to get labels for Task B
merged_data = training_data.merge(labels_b, on='id', how='inner')
if merged_data.empty and 'subtask_b' in training_data.columns:
    # NOTE(review): an inner join that matches no IDs yields zero rows, which
    # later produces an empty augmented file. The training file already has a
    # Task B column (`subtask_b`), so use it as the label in that case.
    # Presumably labels-levelb.csv belongs to a different split - confirm.
    print("No IDs shared with labels_b; using the training file's own subtask_b column as the label.")
    merged_data = training_data.assign(label=training_data['subtask_b'])

# Keep only rows that carry a Task B label (NaN rows are dropped by isin)
tin_unt_data = merged_data[merged_data['label'].isin(['TIN', 'UNT'])]

# Verify the merged data
print("Merged Data Sample:")
print(tin_unt_data.head())
print(tin_unt_data.columns)

def get_synonym(word, pos):
    """Return a random WordNet replacement for ``word`` with a matching part of speech.

    Args:
        word: The token to replace.
        pos: The ``(token, tag)`` pair for this token as produced by
            ``nltk.pos_tag``; only the tag at index 1 is read.

    Returns:
        The head lemma of a randomly chosen matching synset, or ``word``
        unchanged when the tag is ``NNP``/``DT`` or no synset matches.
    """
    tag = pos[1]
    # Proper nouns and determiners are never replaced; return before doing
    # any WordNet lookup.
    if tag == 'NNP' or tag == 'DT':
        return word

    # Synset names look like "dog.n.01". The first letter of the tagger's tag
    # (NN -> n, VB -> v, RB -> r) selects the WordNet part of speech, except
    # adjectives: the tagger uses J*, WordNet uses "a" / "s".
    word_type = tag[:1].lower()
    if word_type == "j":
        pos_markers = (".a.", ".s.")
    else:
        pos_markers = ("." + word_type + ".",)

    replacements = []
    for syn in wordnet.synsets(word):
        name = syn.name()
        # `in` is a real membership test; str.find() returns -1 (truthy) on a
        # miss, which previously let every synset through.
        if any(marker in name for marker in pos_markers):
            replacements.append(name[:name.find(".")])

    if replacements:
        return replacements[np.random.randint(0, len(replacements))]
    return word

def replace_words(words):
    """Replace every replaceable word in ``words`` with a random WordNet synonym.

    Args:
        words: List of tokens making up one comment.

    Returns:
        A one-element list holding the rebuilt comment. Each word is preceded
        by a single space, so a non-empty result starts with a space.
    """
    if not words:
        # Nothing to tag or replace.
        return [""]

    tagged = nltk.pos_tag(words)
    pieces = []
    for word, tagged_word in zip(words, tagged):
        tag = tagged_word[1]
        replacements = []
        # Proper nouns and determiners are left untouched.
        if tag != 'NNP' and tag != 'DT':
            # Synset names look like "dog.n.01"; match on the part-of-speech
            # segment. Tagger adjectives (J*) map to WordNet "a" / "s".
            word_type = tag[:1].lower()
            if word_type == "j":
                pos_markers = (".a.", ".s.")
            else:
                pos_markers = ("." + word_type + ".",)
            for syn in wordnet.synsets(word):
                name = syn.name()
                # str.find() is truthy on a miss (-1); use a membership test.
                if any(marker in name for marker in pos_markers):
                    replacements.append(name[:name.find(".")])

        if replacements:
            chosen = replacements[np.random.randint(0, len(replacements))]
        else:
            chosen = word
        pieces.append(" " + chosen)
    return ["".join(pieces)]

def replace_random_words(list):
    """Build comment variants with progressively more words swapped for synonyms.

    Half of the positions (rounded down) are drawn in random order. Variant
    ``k`` passes the first ``k`` drawn positions through ``get_synonym`` and
    keeps every other word as is, so one string is returned per ``k``.
    """
    pos_tags = nltk.pos_tag(list)
    replace_count = int(np.floor(0.5 * len(list)))
    drawn_positions = generate_random_word_indices(replace_count, len(list))

    variants = []
    for upto in range(1, replace_count + 1):
        active = drawn_positions[0:upto]
        rewritten = [
            get_synonym(token, pos_tags[position]) if position in active else token
            for position, token in enumerate(list)
        ]
        variants.append(' '.join(rewritten))
    return variants

def unique_comment(list):
    """Return the words of ``list`` with repeats removed, keeping first occurrences in order.

    Args:
        list: Sequence of hashable tokens (words).

    Returns:
        A new list containing each distinct token once, in original order.
    """
    # The parameter shadows the ``list`` builtin, so ``list(...)`` cannot be
    # called here. dict.fromkeys de-duplicates while preserving insertion
    # order, which a set would not.
    return [*dict.fromkeys(list)]

def generate_random_word_indices(num_words_to_remove, comment_length):
    """Draw distinct random positions in ``range(comment_length)``.

    Args:
        num_words_to_remove: How many distinct indices to draw.
        comment_length: Exclusive upper bound for the indices.

    Returns:
        A list of ``num_words_to_remove`` distinct indices in draw order.

    Raises:
        ValueError: If more indices are requested than positions exist;
            rejection sampling could never finish in that case.
    """
    if num_words_to_remove > comment_length:
        raise ValueError(
            f"cannot draw {num_words_to_remove} distinct indices from a comment of length {comment_length}"
        )

    random_indices = []
    taken = set()  # O(1) duplicate check; the list keeps draw order
    while len(random_indices) < num_words_to_remove:
        candidate = np.random.randint(low=0, high=comment_length)
        if candidate in taken:
            continue  # already drawn, try again
        taken.add(candidate)
        random_indices.append(candidate)
    return random_indices

def remove_random_words(list):
    """Build comment variants with progressively more words dropped.

    A fifth of the positions (rounded down) are drawn in random order.
    Variant ``k`` omits the first ``k`` drawn positions and joins the
    remaining words with single spaces; one string is returned per ``k``.
    """
    drop_count = int(np.floor(0.2 * len(list)))
    drop_order = generate_random_word_indices(drop_count, len(list))

    shortened = []
    for upto in range(1, drop_count + 1):
        dropped = drop_order[0:upto]
        kept = [token for position, token in enumerate(list) if position not in dropped]
        shortened.append(' '.join(kept))
    return shortened

# Augment data: for every TIN/UNT comment keep the original, add a
# de-duplicated version, and add the word-removal and synonym-replacement
# variants, all with the original label.
new_training_comments = []
new_training_labels = []

for comment, label in zip(tin_unt_data['comment'], tin_unt_data['label']):
    # str.split() with no argument never yields empty strings
    comment_list = comment.split()
    unique_comment_list = unique_comment(comment_list)
    new_comments_after_removal = remove_random_words(comment_list)
    new_comments_after_replacement = replace_random_words(comment_list)

    new_comments_after_removal.append(comment)
    new_comments_after_removal.append(' '.join(unique_comment_list))

    new_training_comments.extend(new_comments_after_removal)
    new_training_labels.extend([label] * len(new_comments_after_removal))

    new_training_comments.extend(new_comments_after_replacement)
    new_training_labels.extend([label] * len(new_comments_after_replacement))

# Column vector of comments and a flat vector of labels
new_training_comments = np.array([new_training_comments]).transpose()
new_training_labels = np.array(new_training_labels)

# Two columns: comment text first, label second
training_data_augmented = np.append(new_training_comments.reshape((-1, 1)), new_training_labels.reshape((-1, 1)), axis=1)

if training_data_augmented.shape[0] == 0:
    # Writing a zero-byte file here only surfaces later as an unrelated
    # "no columns to parse" error when the file is read back.
    raise ValueError("No TIN/UNT comments were found to augment; nothing to save.")

# Print some of the augmented data before saving
print("Sample augmented data:")
print(training_data_augmented[:5])

# Save augmented data
output_file = "/content/drive/My Drive/toxic-comment-classification/OLIDv1/training-data-augmented-all-task-b.csv"
# Write through pandas so comments containing commas, quotes or newlines are
# quoted and stay in a single field (np.savetxt writes raw text).
pd.DataFrame(training_data_augmented).to_csv(output_file, header=False, index=False)
print(f"Augmented data saved to {output_file}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training data head:
      id                                              tweet subtask_a  \
0  86426  @USER She should ask a few native Americans wh...       OFF   
1  90194  @USER @USER Go home you’re drunk!!! @USER #MAG...       OFF   
2  16820  Amazon is investigating Chinese employees who ...       NOT   
3  62688  @USER Someone should'veTaken" this piece of sh...       OFF   
4  43605  @USER @USER Obama wanted liberals &amp; illega...       NOT   

  subtask_b subtask_c  
0       UNT       NaN  
1       TIN       IND  
2       NaN       NaN  
3       UNT       NaN  
4       NaN       NaN  
Labels B head:
      id label
0  15923   TIN
1  60133   TIN
2  83681   TIN
3  65507   TIN
4  12588   UNT
Training data columns:
Index(['id', 'tweet', 'subtask_a', 'subtask_b', 'subtask_c'], dtype='object')
Labels B columns:
Index(['id', 'label'], dtype='object')
Uniqu

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Load the Augmented Data


In [None]:
# Check the content of the CSV file
!head /content/drive/My\ Drive/toxic-comment-classification/OLIDv1/training-data-augmented-all-task-b.csv


In [None]:
from nltk.corpus import wordnet
import pandas as pd
import numpy as np
import string
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

# Load data using pandas (no header row in the augmented file)
data = pd.read_csv('/content/drive/My Drive/toxic-comment-classification/OLIDv1/training-data-augmented-all-task-b.csv', header=None)

# Convert to numpy array; DataFrame.values is always 2-D
training_data = data.values

# Check the shape of the array
print(training_data.shape)

# The augmented file is written with the comment text in the first column
# and the label in the second, so the text is column 0.
list_sentences_training = training_data[:, 0]
# Get word synonym
def get_synonym(word, pos):
    """Return a random WordNet replacement for ``word`` with a matching part of speech.

    Only nouns are replaced with nouns, verbs with verbs, and so on.

    Args:
        word: The token to replace.
        pos: The ``(token, tag)`` pair for this token as produced by
            ``nltk.pos_tag``; only the tag at index 1 is read.

    Returns:
        The head lemma of a randomly chosen matching synset, or ``word``
        unchanged when the tag is ``NNP``/``DT`` or no synset matches.
    """
    tag = pos[1]
    # Do not attempt to replace proper nouns or determiners; return before
    # doing any WordNet lookup.
    if tag == 'NNP' or tag == 'DT':
        return word

    # The tagger returns strings like NNP, VBP etc. while synset names carry
    # a segment like ".n." (e.g. "dog.n.01"), so the lower-cased first
    # character of the tag selects the WordNet part of speech. Adjectives are
    # the exception: the tagger uses J*, WordNet uses "a" / "s".
    word_type = tag[:1].lower()
    if word_type == "j":
        pos_markers = (".a.", ".s.")
    else:
        pos_markers = ("." + word_type + ".",)

    replacements = []
    for syn in wordnet.synsets(word):
        name = syn.name()
        # `in` is a real membership test; str.find() returns -1 (truthy) on a
        # miss, which previously let every synset through.
        if any(marker in name for marker in pos_markers):
            # extract the word only
            replacements.append(name[:name.find(".")])

    if replacements:
        # Choose a random replacement
        return replacements[np.random.randint(0, len(replacements))]
    # If no replacement could be found, just use the original word
    return word

# Replace words with their synonyms
def replace_words(words):
    """Replace every replaceable word in ``words`` with a random WordNet synonym.

    Args:
        words: List of tokens making up one comment.

    Returns:
        A one-element list holding the rebuilt comment. Each word is preceded
        by a single space, so a non-empty result starts with a space.
    """
    if not words:
        # Nothing to tag or replace.
        return [""]

    # Identify the parts of speech
    tagged = nltk.pos_tag(words)

    pieces = []
    for word, tagged_word in zip(words, tagged):
        tag = tagged_word[1]
        replacements = []

        # Do not attempt to replace proper nouns or determiners
        if tag != 'NNP' and tag != 'DT':
            # The tagger returns strings like NNP, VBP etc. while synset
            # names carry a segment like ".n.", so the lower-cased first
            # character of the tag selects the part of speech. Tagger
            # adjectives (J*) map to WordNet "a" / "s".
            word_type = tag[:1].lower()
            if word_type == "j":
                pos_markers = (".a.", ".s.")
            else:
                pos_markers = ("." + word_type + ".",)
            for syn in wordnet.synsets(word):
                name = syn.name()
                # str.find() is truthy on a miss (-1); use a membership test.
                if any(marker in name for marker in pos_markers):
                    # extract the word only
                    replacements.append(name[:name.find(".")])

        if replacements:
            # Choose a random replacement
            chosen = replacements[np.random.randint(0, len(replacements))]
        else:
            # If no replacement could be found, just use the original word
            chosen = word
        pieces.append(" " + chosen)

    # Return only after every word has been processed (the return used to sit
    # inside the loop and fired after the first word).
    return ["".join(pieces)]

def replace_random_words(list):
    """Build comment variants with progressively more words swapped for synonyms.

    Half of the positions (rounded down) are drawn in random order. Variant
    ``k`` passes the first ``k`` drawn positions through ``get_synonym`` and
    keeps every other word as is, so one string is returned per ``k``.
    """
    # Identify the parts of speech
    pos_tags = nltk.pos_tag(list)

    replace_count = int(np.floor(0.5 * len(list)))
    drawn_positions = generate_random_word_indices(replace_count, len(list))

    variants = []
    for upto in range(1, replace_count + 1):
        active = drawn_positions[0:upto]
        rewritten = [
            get_synonym(token, pos_tags[position]) if position in active else token
            for position, token in enumerate(list)
        ]
        variants.append(' '.join(rewritten))

    return variants


# Remove repeated words
def unique_comment(list):
    """Return the items of ``list`` once each, in order of first appearance."""
    first_seen = []
    for token in list:
        if token in first_seen:
            continue
        first_seen.append(token)
    return first_seen


# Generate word indices to remove or replace
def generate_random_word_indices(num_words_to_remove, comment_length):
    """Draw distinct random positions in ``range(comment_length)``.

    Args:
        num_words_to_remove: How many distinct indices to draw.
        comment_length: Exclusive upper bound for the indices.

    Returns:
        A list of ``num_words_to_remove`` distinct indices in draw order.

    Raises:
        ValueError: If more indices are requested than positions exist;
            rejection sampling could never finish in that case.
    """
    if num_words_to_remove > comment_length:
        raise ValueError(
            f"cannot draw {num_words_to_remove} distinct indices from a comment of length {comment_length}"
        )

    random_indices = []
    taken = set()  # O(1) duplicate check; the list keeps draw order
    while len(random_indices) < num_words_to_remove:
        candidate = np.random.randint(low=0, high=comment_length)
        if candidate in taken:
            continue  # already drawn, try again
        taken.add(candidate)
        random_indices.append(candidate)

    return random_indices


# Remove Random Words
def remove_random_words(list):
    """Build comment variants with progressively more words dropped.

    A fifth of the positions (rounded down) are drawn in random order.
    Variant ``k`` omits the first ``k`` drawn positions and joins the
    remaining words with single spaces; one string is returned per ``k``.
    """
    drop_count = int(np.floor(0.2 * len(list)))
    drop_order = generate_random_word_indices(drop_count, len(list))

    shortened = []
    for upto in range(1, drop_count + 1):
        dropped = drop_order[0:upto]
        kept = [token for position, token in enumerate(list) if position not in dropped]
        shortened.append(' '.join(kept))

    return shortened


# Second augmentation pass over the Task B file: only 'UNT' comments are
# expanded, 'TIN' comments are copied through unchanged.
# NOTE(review): the result is written back to the same path it is read from,
# so re-running this cell augments already-augmented data - confirm intended.
augmented_path = '/content/drive/My Drive/toxic-comment-classification/OLIDv1/training-data-augmented-all-task-b.csv'

# Read through pandas so quoted fields (comments containing commas) parse as
# one column; keep everything as plain strings.
training_data = pd.read_csv(augmented_path, header=None, dtype=str, keep_default_na=False).values

# The file has two columns: comment text first, label second.
# Get the comment part of the data
list_sentences_training = training_data[:, 0]

# Get the labels part of the data
training_labels = training_data[:, 1]

new_training_comments = []
new_training_labels = []
print(list_sentences_training.shape[0])

new_comments = 0
for sentence, label in zip(list_sentences_training, training_labels):
    if label == 'UNT':
        # str.split() with no argument never yields empty strings
        comment_list = sentence.split()
        unique_comment_list = unique_comment(comment_list)
        new_comments_after_removal = remove_random_words(comment_list)
        new_comments_after_replacement = replace_random_words(comment_list)

        new_comments_after_removal.append(sentence)
        new_comments_after_removal.append(' '.join(unique_comment_list))

        generated = new_comments_after_removal + new_comments_after_replacement
        new_training_comments.extend(generated)
        new_training_labels.extend([label] * len(generated))
        # -1 because the original sentence is in the list but is not new
        new_comments += len(generated) - 1

    elif label == 'TIN':
        new_training_comments.append(sentence)
        new_training_labels.append(label)

new_training_comments = np.array([new_training_comments]).transpose()
print(new_training_comments.shape)

new_training_labels = np.array(new_training_labels)
print(new_training_labels.shape)
# Spot-check one label without failing on very small inputs
if new_training_labels.shape[0] > 5:
    print(new_training_labels[5])

# Two columns again: comment text first, label second
training_data_augemented = np.append(new_training_comments.reshape((-1, 1)), new_training_labels.reshape((-1, 1)), axis=1)
print('shape of augemented training data', training_data_augemented.shape)
# Show the first two rows (fewer if there are not that many)
for row in training_data_augemented[:2]:
    print(row)

print('number of newly added comments', new_comments)
# Write through pandas so commas inside comments are quoted, not split
pd.DataFrame(training_data_augemented).to_csv(augmented_path, header=False, index=False)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


EmptyDataError: No columns to parse from file