In [8]:
import pandas as pd 
from data_loader import GoEmotionsProcessor 
import numpy as np 

In [None]:
# Load the training split through the project's GoEmotions processor.
# NOTE(review): `args` is handed over as a plain dict — confirm that
# GoEmotionsProcessor reads it with dict access rather than attribute access.
args = dict(train_file="train.tsv")
processer = GoEmotionsProcessor(args=args)
examples = processer.get_examples("train")

# Bare expression so the notebook renders whatever get_examples returned.
examples

# create a balanced dataset

In [25]:
# Paths of the original training set and of its three augmented variants.
original_csv_path = "train.tsv"
eda_csv_path, bert_csv_path, prot_csv_path = (
    'train_EDA_augmented.tsv',
    'train_bertEmbed_augmented.tsv',
    'train_Prot_augmented.tsv',
)

# Every file is read as tab-separated without a header row, so the columns are
# addressed by integer position (0, 1, 2, ...) in the cells below.
eda_train, bert_train, prot_train, original_train = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (eda_csv_path, bert_csv_path, prot_csv_path, original_csv_path)
)

In [59]:
# Label ids "0".."27" as strings. Rows whose label cell (column 1) is not
# exactly one of these ids are left out before counting.
valid_labels = list(map(str, range(28)))
original_label_mask = original_train[1].isin(valid_labels)
temp = original_train.loc[original_label_mask]

# Number of rows per label id.
counts = temp.loc[:, 1].value_counts()


In [41]:
# Label ids singled out for augmentation, kept as strings because they are
# matched against the label column with isin() in the cells below.
underperform_labels = [str(label_id) for label_id in (16, 21, 19, 23)]

## eda augmented for underperform_labels

In [None]:
# Keep only the leading rows of the EDA file: as many rows as the original
# training set has are cut off the end.
# NOTE(review): assumes the EDA file is laid out as augmented rows followed by
# the original rows — confirm against the script that produced it.
original_length, eda_length = original_train.shape[0], eda_train.shape[0]
eda_extra_rows = eda_length - original_length
eda_train_augmented = eda_train.iloc[:eda_extra_rows]

# Of those rows, keep the ones whose label (column 1) is an underperforming one.
eda_underperform_mask = eda_train_augmented[1].isin(underperform_labels)
eda_augmented_underperform = eda_train_augmented.loc[eda_underperform_mask]
eda_augmented_underperform

In [71]:
# Stack the EDA-augmented underperforming rows under the original training set,
# drop column 2, and save the result as a tab-separated file.
eda_underperformAug_train = pd.concat(
    [original_train, eda_augmented_underperform],
    axis=0,
    ignore_index=True,
).drop(columns=[2])

# header=False (not None) is the documented value for omitting the header row;
# index=False keeps the row index out of the file as well.
eda_underperformAug_train.to_csv(
    'train_underperform_EDAaugmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)


## bert embeddings for underperform_labels

In [None]:
# Keep only the leading rows of the BERT-embedding file: as many rows as the
# original training set has are cut off the end.
# NOTE(review): assumes the file is laid out as augmented rows followed by the
# original rows — confirm against the script that produced it.
original_length, bert_length = original_train.shape[0], bert_train.shape[0]
bert_extra_rows = bert_length - original_length
bert_train_augmented = bert_train.iloc[:bert_extra_rows]

# Of those rows, keep the ones whose label (column 1) is an underperforming one.
bert_underperform_mask = bert_train_augmented[1].isin(underperform_labels)
bert_augmented_underperform = bert_train_augmented.loc[bert_underperform_mask]
bert_augmented_underperform


In [73]:
# Stack the BERT-embedding-augmented underperforming rows under the original
# training set, drop column 2, and save the result as a tab-separated file.
bert_underperformAug_train = pd.concat(
    [original_train, bert_augmented_underperform],
    axis=0,
    ignore_index=True,
).drop(columns=[2])

# header=False (not None) is the documented value for omitting the header row;
# index=False keeps the row index out of the file as well.
bert_underperformAug_train.to_csv(
    'train_underperform_BERTaugmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)


## ProtAug for underperform labels

In [None]:
# Keep only the leading rows of the Prot file: as many rows as the original
# training set has are cut off the end.
# NOTE(review): assumes the file is laid out as augmented rows followed by the
# original rows — confirm against the script that produced it.
original_length, prot_length = original_train.shape[0], prot_train.shape[0]
prot_extra_rows = prot_length - original_length
prot_train_augmented = prot_train.iloc[:prot_extra_rows]

# Of those rows, keep the ones whose label (column 1) is an underperforming one.
prot_underperform_mask = prot_train_augmented[1].isin(underperform_labels)
prot_augmented_underperform = prot_train_augmented.loc[prot_underperform_mask]
prot_augmented_underperform

In [75]:
# Stack the Prot-augmented underperforming rows under the original training
# set, drop column 2, and save the result as a tab-separated file.
prot_underperformAug_train = pd.concat(
    [original_train, prot_augmented_underperform],
    axis=0,
    ignore_index=True,
).drop(columns=[2])

# header=False (not None) is the documented value for omitting the header row;
# index=False keeps the row index out of the file as well.
prot_underperformAug_train.to_csv(
    'train_underperform_PROTaugmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)


## concat original train and 3 underperform augmented

In [57]:
# Original training set followed by the underperforming-label rows from all
# three augmentation sources (EDA, BERT embeddings, Prot), without column 2.
final_train = pd.concat(
    [
        original_train,
        eda_augmented_underperform,
        bert_augmented_underperform,
        prot_augmented_underperform,
    ],
    axis=0,
    ignore_index=True,
).drop(columns=[2])

# header=False (not None) is the documented value for omitting the header row;
# index=False keeps the row index out of the file as well.
final_train.to_csv(
    'train_underperform_augmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)


## concat original train and bertAug and ProtAug

In [78]:
# Original training set followed by the underperforming-label rows from the
# BERT-embedding and Prot augmentations only (no EDA rows), without column 2.
two_train = pd.concat(
    [original_train, bert_augmented_underperform, prot_augmented_underperform],
    axis=0,
    ignore_index=True,
).drop(columns=[2])

# header=False (not None) is the documented value for omitting the header row;
# index=False keeps the row index out of the file as well.
two_train.to_csv(
    'bert_prot_augmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)

In [None]:
# Show (rows, columns) of the original + BERT + Prot frame as a quick size check.
two_train.shape

# plot the label distributions

## label distribution of original dataset

In [None]:
# Map each label id (as a string: "0", "1", ...) to the label name found on the
# line of labels.txt with that zero-based position.
# The encoding is pinned so the result does not depend on the platform default.
with open("labels.txt", "r", encoding="utf-8") as f:
    d = {str(key): line.strip() for key, line in enumerate(f)}

print(d)

In [None]:
# Count rows per label in the original training set, keeping only rows whose
# label cell (column 1) is exactly one of the ids "0".."27".
valid_labels = list(map(str, range(28)))
temp = original_train.loc[original_train[1].isin(valid_labels)]

# Replace the id index with the label names from `d`.
original_counts = temp[1].value_counts().rename(index=d)

# Standard deviation of the per-label counts.
original_counts.std()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-label counts in the original training set.
plt.style.use('ggplot')
plt.figure(figsize=(10, 6))

# The first entry of the counts is left out of the chart. value_counts() sorts
# descending by default, so that entry is the most frequent label.
shown_labels = original_counts.index[1:]
shown_counts = original_counts.values[1:]
bars = plt.bar(shown_labels, shown_counts, color='skyblue', width=0.7, alpha=0.7)

# Paint the last four bars (the smallest counts) red.
for bar in bars[-4:]:
    bar.set_color('red')

axis_font = dict(fontsize=10, fontweight='bold')
plt.title('Data Distribution in Original Training Set')
plt.xlabel('Emotion Category', **axis_font)
plt.ylabel('Number of Sample', **axis_font)
plt.xticks(rotation=90)

ax = plt.gca()
ax.set_facecolor('whitesmoke')
plt.grid(True, linestyle='--', alpha=0.8)

# Write the figure to disk first, then render it in the notebook.
plt.savefig('report/fig/distribution_original.png', bbox_inches='tight', dpi=300)
plt.show()


## label distribution for full augmented dataset

In [None]:
# Read the Prot-augmented training file (tab-separated, no header row); it is
# used below as the fully augmented dataset.
fullaug_train = pd.read_csv(
    "train_Prot_augmented.tsv",
    header=None,
    sep='\t',
)
fullaug_train

In [None]:
# Count rows per label in the fully augmented set, keeping only rows whose
# label cell (column 1) is one of `valid_labels`, then relabel ids via `d`.
fullaug_label_mask = fullaug_train[1].isin(valid_labels)
temp = fullaug_train.loc[fullaug_label_mask]
fullaug_counts = temp[1].value_counts().rename(index=d)

# Standard deviation of the per-label counts.
fullaug_counts.std()

In [None]:
# Bar chart of the per-label counts in the fully augmented training set.
plt.style.use('ggplot')
plt.figure(figsize=(10, 6))

# The first entry of the counts is left out of the chart. value_counts() sorts
# descending by default, so that entry is the most frequent label.
shown_labels = fullaug_counts.index[1:]
shown_counts = fullaug_counts.values[1:]
bars = plt.bar(shown_labels, shown_counts, color='skyblue', width=0.7, alpha=0.7)

# Paint the last four bars (the smallest counts) red.
for bar in bars[-4:]:
    bar.set_color('red')

axis_font = dict(fontsize=10, fontweight='bold')
plt.title('Data Distribution in Fully Augmented Training Set')
plt.xlabel('Emotion Category', **axis_font)
plt.ylabel('Number of Sample', **axis_font)
plt.xticks(rotation=90)

ax = plt.gca()
ax.set_facecolor('whitesmoke')
plt.grid(True, linestyle='--', alpha=0.8)

# Write the figure to disk first, then render it in the notebook.
plt.savefig('report/fig/distribution_fullaug.png', bbox_inches='tight', dpi=300)
plt.show()

## label distribution for minority augmented dataset

In [None]:
# Read back the original + Prot-augmented underperforming-label training file
# (tab-separated, no header row).
underperformAug_train = pd.read_csv(
    "train_underperform_PROTaugmented.tsv",
    header=None,
    sep='\t',
)
underperformAug_train

In [None]:
# Count rows per label in the minority-augmented set, keeping only rows whose
# label cell (column 1) is one of `valid_labels`, then relabel ids via `d`.
underperformAug_label_mask = underperformAug_train[1].isin(valid_labels)
temp = underperformAug_train.loc[underperformAug_label_mask]
underperformAug_counts = temp[1].value_counts().rename(index=d)
underperformAug_counts

In [None]:
# Bar chart of the per-label counts in the minority-augmented training set.
plt.style.use('ggplot')
plt.figure(figsize=(10, 6))

# The first entry of the counts is left out of the chart. value_counts() sorts
# descending by default, so that entry is the most frequent label.
bars = plt.bar(
    underperformAug_counts.index[1:],
    underperformAug_counts.values[1:],
    color='skyblue',
    width=0.7,
    alpha=0.7,
)

# Paint four specific bars red. The loop variable is deliberately not called
# `id`: at notebook top level that would rebind the builtin id() for every
# later cell.
# NOTE(review): the positions are hard-coded, presumably those of the augmented
# underperforming labels in the sorted counts — re-check whenever the data changes.
for bar_idx in (-2, -8, -3, -10):
    bars[bar_idx].set_color('red')

plt.xlabel('Emotion Category', fontsize=10, fontweight='bold')
plt.xticks(rotation=90)
plt.ylabel('Number of Sample', fontsize=10, fontweight='bold')
plt.title('Data Distribution in Minority Augmented Training Set')

plt.grid(True, linestyle='--', alpha=0.8)
plt.gca().set_facecolor('whitesmoke')

# Write the figure to disk first, then render it in the notebook.
plt.savefig('report/fig/distribution_minority.png', bbox_inches='tight', dpi=300)

plt.show()