In [None]:
# Download the GoEmotions label list, the label-grouping tables and the
# train/dev/test TSV splits into the current working directory.
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/sentiment_dict.json
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/sentiment_mapping.json
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/ekman_mapping.json
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv
!wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv

In [None]:
import csv
import json
import math

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import optimization

In [None]:
# Locations of the downloaded GoEmotions files, relative to the working directory.
# Data splits:
train_file, dev_file, test_file = "./train.tsv", "./dev.tsv", "./test.tsv"
# Label vocabulary and the two emotion-grouping tables:
emotions_file = "./emotions.txt"
ekman_mapping_file = "./ekman_mapping.json"
sentiment_mapping_file = "./sentiment_mapping.json"

In [None]:
def _load_grouping(path):
    """Read a JSON ``group -> [emotion, ...]`` table and add 'neutral' as its own group."""
    with open(path, "r", encoding="utf-8") as f:
        grouping = json.load(f)
    grouping['neutral'] = ['neutral']
    return grouping


ekman_mapping = _load_grouping(ekman_mapping_file)
sentiment_mapping = _load_grouping(sentiment_mapping_file)

# One emotion name per line; the line position is the emotion's index.
with open(emotions_file, "r", encoding="utf-8") as f:
    emotions = [line.strip() for line in f]
# Drop trailing blank lines only, so they do not become extra (empty) emotions;
# interior lines are kept untouched so every index still matches its line.
while emotions and not emotions[-1]:
    emotions.pop()

In [None]:
def _invert_grouping(grouping):
    """Map each member emotion back to the name of the group that lists it."""
    member_to_group = {}
    for group, members in grouping.items():
        member_to_group.update(dict.fromkeys(members, group))
    return member_to_group


def _number_groups(grouping):
    """Assign consecutive ids (0, 1, ...) to the group names in insertion order."""
    return {group: class_id for class_id, group in enumerate(grouping)}


# Emotion name <-> position in the emotions list.
emotion_to_index = {name: position for position, name in enumerate(emotions)}
index_to_emotion = dict(enumerate(emotions))

# Emotion name -> group name, and group name -> class id, for both groupings.
ekman_mapping_reverse = _invert_grouping(ekman_mapping)
ekman_classes = _number_groups(ekman_mapping)
sentiment_mapping_reverse = _invert_grouping(sentiment_mapping)
sentiment_classes = _number_groups(sentiment_mapping)

In [None]:
# One binarizer per grouping; the class ids are fixed up front as 0..(number of groups - 1).
ekman_encoder = MultiLabelBinarizer(classes=range(len(ekman_mapping)))
sentiment_encoder = MultiLabelBinarizer(classes=range(len(sentiment_mapping)))

In [None]:
def get_ekman_from_emotion_index(idx):
    """Translate an emotion index into the id of its Ekman group.

    Args:
        idx: Index of an emotion in ``index_to_emotion``; any value ``int()``
            accepts, e.g. the string fields read from the TSV files.

    Returns:
        The id from ``ekman_classes`` of the group containing the emotion, or
        ``None`` when ``idx`` is not a known emotion index.

    Raises:
        ValueError: If ``idx`` cannot be converted to ``int``.
    """
    idx = int(idx)
    # Membership test rather than a hard-coded 0..27 range, so the valid range
    # always follows the emotion list that was actually loaded.
    if idx not in index_to_emotion:
        return None
    return ekman_classes[ekman_mapping_reverse[index_to_emotion[idx]]]

def get_sentiment_from_emotion_index(idx):
    """Translate an emotion index into the id of its sentiment group.

    Args:
        idx: Index of an emotion in ``index_to_emotion``; any value ``int()``
            accepts, e.g. the string fields read from the TSV files.

    Returns:
        The id from ``sentiment_classes`` of the group containing the emotion,
        or ``None`` when ``idx`` is not a known emotion index.

    Raises:
        ValueError: If ``idx`` cannot be converted to ``int``.
    """
    idx = int(idx)
    if idx not in index_to_emotion:
        return None
    # sentiment_mapping_reverse is keyed by emotion *name*, so the index has to
    # be resolved to its name first (looking up the raw int raised KeyError).
    return sentiment_classes[sentiment_mapping_reverse[index_to_emotion[idx]]]

In [None]:
class GoEmotionsDataset(Dataset):
    """In-memory GoEmotions split with multi-hot labels over a coarse grouping.

    Every item is a dict ``{'text': <str>, 'labels': <multi-hot row>}``.
    """

    def __init__(self, file, encoder, ekman=True):
        """Read a tab-separated file and encode the grouped labels of each row.

        Args:
            file: Path to a TSV file whose first column is the text and whose
                second column is a comma-separated list of emotion indices.
            encoder: Object whose ``fit_transform(list_of_label_lists)`` returns
                one multi-hot row per input list (for example a
                ``MultiLabelBinarizer`` built with fixed ``classes``).
            ekman: When true, emotions are grouped with
                ``get_ekman_from_emotion_index``; otherwise with
                ``get_sentiment_from_emotion_index``.
        """
        to_class = get_ekman_from_emotion_index if ekman else get_sentiment_from_emotion_index

        texts = []
        label_sets = []
        # newline='' is what the csv module expects; utf-8 keeps non-ASCII text
        # readable regardless of the platform's default encoding.
        with open(file, 'r', newline='', encoding='utf-8') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                if len(row) < 2:
                    continue  # blank or truncated line: no text/label pair to read
                class_labels = []
                for label in row[1].split(','):
                    class_label = to_class(label)
                    # Skip unknown indices (None) and keep each group only once.
                    if class_label is not None and class_label not in class_labels:
                        class_labels.append(class_label)
                texts.append(row[0])
                label_sets.append(class_labels)

        # Encode the whole split in one call instead of refitting the encoder per row.
        encoded = encoder.fit_transform(label_sets) if label_sets else []
        self.samples = [{'text': text, 'labels': labels}
                        for text, labels in zip(texts, encoded)]

    def __len__(self):
        """Number of rows that were read from the file."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return the ``{'text', 'labels'}`` dict stored at ``index``."""
        return self.samples[index]

In [None]:
# Load the three splits with the Ekman encoder, constructed in dev -> train -> test order.
devset, trainset, testset = (
    GoEmotionsDataset(split_path, ekman_encoder)
    for split_path in (dev_file, train_file, test_file)
)

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device "{device}"')
# get_device_name() raises when no CUDA device is available, so only ask for
# it when the GPU was actually selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

In [None]:
pretrained_name = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
config = AutoConfig.from_pretrained(
    pretrained_name,
    # BERT's dropout setting is `hidden_dropout_prob`; a bare `dropout` keyword
    # is stored on the config but not read by the BERT model.
    hidden_dropout_prob=0.1,
    # One output per label group, matching the width of the encoded label vectors.
    num_labels=len(ekman_classes),
)
# from_pretrained loads the checkpoint weights into the encoder; from_config
# would only build the architecture with randomly initialised weights.
model = AutoModelForSequenceClassification.from_pretrained(pretrained_name, config=config)
model.to(device);  # trailing ';' keeps the full module repr out of the cell output

In [None]:
batch_size = 16
num_epochs = 4
warmup_proportion = 0.1  # fraction of all optimizer steps spent on warm-up

# The scheduler is stepped once per batch, and a final partial batch is still a
# step, so round the per-epoch batch count up instead of truncating.
steps_per_epoch = (len(trainset) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * num_epochs
num_warmup_steps = int(warmup_proportion * num_training_steps)

In [None]:
# Split the parameters into two groups: everything except biases and LayerNorm
# parameters gets weight decay, the rest gets none.
param_optimizer = list(model.named_parameters())
no_decay = ["LayerNorm", "layer_norm", "bias"]
optimizer_grouped_parameters = [
    # The per-group option AdamW reads is 'weight_decay'; a 'weight_decay_rate'
    # key is ignored, which would leave both groups on the optimizer default.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# PyTorch's AdamW replaces the deprecated transformers implementation.
# weight_decay=0.0 is the fallback for any parameter group that does not set its
# own 'weight_decay', matching the old implementation's default.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=5e-5,
    betas=(0.9, 0.999),
    eps=1e-6,
    weight_decay=0.0,
)
# Warm-up over num_warmup_steps, then polynomial decay down to lr_end
# at num_training_steps.
scheduler = optimization.get_polynomial_decay_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
    lr_end=0.0,
)
# Independent sigmoid + binary cross-entropy per label (multi-label targets).
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Data loaders share the batch size that the LR schedule was sized with.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
devloader = DataLoader(devset, batch_size=batch_size, shuffle=False)
# Gold dev labels in dataset order; devloader is not shuffled, so the collected
# predictions line up with this list.
dev_labels = [x['labels'].tolist() for x in devset]

max_len = 50          # maximum number of tokens kept per text
epochs = num_epochs   # same epoch count that num_training_steps was computed from
start_epoch = 1
checkpoint_file = None

min_dev_loss = math.inf   # lowest mean dev loss seen so far
threshold = 0.3           # a label is predicted when its sigmoid probability exceeds this
train_losses = []
dev_losses = []
dev_f1 = []
dev_precision = []
dev_recall = []


def _prepare_batch(batch):
    """Tokenise one collated batch and move ids, attention mask and float labels to `device`."""
    encoded_dict = tokenizer.batch_encode_plus(batch['text'], padding=True, max_length=max_len, truncation=True)
    input_ids = torch.tensor(encoded_dict['input_ids'], dtype=torch.int64).to(device)
    attention_mask = torch.tensor(encoded_dict['attention_mask'], dtype=torch.int64).to(device)
    # No squeeze: the labels must keep their batch dimension so that a batch
    # holding a single example still has the same shape as the logits.
    batch_labels = batch['labels'].float().to(device)
    return input_ids, attention_mask, batch_labels


for e in range(start_epoch, epochs+1):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for batch in tqdm(trainloader):
        input_ids, attention_mask, batch_labels = _prepare_batch(batch)

        optimizer.zero_grad()
        output = model(input_ids, attention_mask=attention_mask)
        logits = output[0]
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    # Each loss value is already a mean over its batch, so average over the
    # number of batches rather than the number of samples.
    running_loss = running_loss / max(len(trainloader), 1)
    train_losses.append(running_loss)
    print("Train Loss: ", running_loss)

    # ---- evaluation pass on the dev split ----
    model.eval()
    running_dev_loss = 0.0
    predictions = []

    with torch.no_grad():
        for batch in tqdm(devloader):
            input_ids, attention_mask, batch_labels = _prepare_batch(batch)

            output = model(input_ids, attention_mask=attention_mask)
            logits = output[0]
            loss = criterion(logits, batch_labels)
            probabilities = torch.sigmoid(logits)
            preds = (probabilities > threshold).int().cpu().tolist()

            running_dev_loss += loss.item()
            predictions.extend(preds)

    running_dev_loss = running_dev_loss / max(len(devloader), 1)
    dev_losses.append(running_dev_loss)
    # Track the best dev loss so the value stored with the checkpoint is meaningful.
    min_dev_loss = min(min_dev_loss, running_dev_loss)

    f1 = f1_score(dev_labels, predictions, average='micro')
    precision = precision_score(dev_labels, predictions, average='micro')
    recall = recall_score(dev_labels, predictions, average='micro')

    dev_f1.append(f1)
    dev_precision.append(precision)
    dev_recall.append(recall)

    # The metric lines print the full per-epoch history, not just the latest value.
    print("Loss: ", running_dev_loss)
    print("F1: ", dev_f1)
    print("Precision: ", dev_precision)
    print("Recall: ", dev_recall)

In [None]:
# Write the weights and metric history as they stand after the last completed
# epoch (`e` and `f1` are the values left over from the training loop).
torch.save(
    dict(
        epoch=e,
        model=model.state_dict(),
        f1=f1,
        min_loss=min_dev_loss,
        train_losses=train_losses,
        dev_losses=dev_losses,
        dev_f1=dev_f1,
        dev_precision=dev_precision,
        dev_recall=dev_recall,
    ),
    "final.pth.tar",
)

In [None]:
def _min_max_scale(scores):
    """Rescale one subreddit's scores linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = scores.min()
    return (scores - lowest) / (scores.max() - lowest)


# Scores are rescaled within each subreddit separately.
df_final['normalized_score'] = df_final.groupby('subreddit')['score'].transform(_min_max_scale)

In [None]:
# Keep the rows whose normalized score is at or above the 95th percentile of
# all normalized scores.
df_top = df_final[df_final['normalized_score'] >= df_final['normalized_score'].quantile(0.95)]

In [None]:
# Number of distinct values in the `text` column.
len(df_final['text'].unique())

In [None]:
# Distribution of the `emotion` column among the top-scoring rows, saved to histogram.png.
histo = sns.displot(df_top, x='emotion', shrink=0.8)
histo.savefig('histogram.png')

In [None]:
# Kernel density estimate of the normalized score among the top-scoring rows,
# one curve per emotion. This rebinds `histo`; the figure is not saved.
sns.set_theme(style="whitegrid", color_codes=True)
histo = sns.displot(df_top, x='normalized_score', hue='emotion', kind='kde')
#histo.savefig('histogram.png')

In [None]:
!pip install wordcloud
import wordcloud

In [None]:
# Box plot of the normalized score per emotion for the top-scoring rows.
sns.set_theme(style="whitegrid", color_codes=True)
sns.catplot(x='emotion', y='normalized_score', data=df_top, kind='box')

In [None]:
# Categorical plot (catplot's default kind) of the normalized score per emotion
# for the top-scoring rows, saved to scatterplot.png.
sns.set_theme(style="ticks", color_codes=True)
fig = sns.catplot(x='emotion', y='normalized_score', data=df_top)
#sns.displot(df_final, x='normalized_score', hue='emotion')
fig.savefig('scatterplot.png')

In [None]:
# Group by emotion: groups1 covers all rows, groups2 only the top-scoring rows.
groups1 = df_final.groupby('emotion')
groups2 = df_top.groupby('emotion')

In [None]:
# Side-by-side pie charts of the number of texts per emotion:
# all rows on the left axis, top-scoring rows on the right axis.
# This rebinds `fig` to a matplotlib figure.
fig, axes = plt.subplots(nrows=1,ncols=2, figsize=(25, 30))
groups1['text'].count().plot.pie(ax = axes[0], subplots=True)
groups2['text'].count().plot.pie(ax = axes[1], subplots=True)
#df_top.plot(ax = axes[1], subplots=True).pie(y='text')

In [None]:
# Median normalized score per emotion over all rows (returned as a one-column DataFrame).
groups1[['normalized_score']].median()

In [None]:
# Mean normalized score per emotion over all rows.
groups1['normalized_score'].mean()

In [None]:
# Mean normalized score per emotion among the top-scoring rows, highest first.
groups2['normalized_score'].mean().sort_values(ascending=False)

In [None]:
# Collapse the emotion labels into coarse sentiment labels. Values that are not
# listed here are left unchanged.
emotion_to_sentiment = {
    'joy': 'positive',
    'anger': 'negative',
    'sadness': 'negative',
    'disgust': 'negative',
    'fear': 'negative',
    'surprise': 'ambiguous',
}
# The nested dict limits the replacement to the 'emotion' column. The column
# name is written out literally so this cell does not depend on a variable that
# is only assigned in a later cell.
df_sentiment = df_final.replace({'emotion': emotion_to_sentiment})

In [None]:
# Name of the DataFrame column that holds the emotion label.
emotion='emotion'

In [None]:
# One histogram of the `emotion` column per subreddit, wrapped at six panels per row.
grid = sns.FacetGrid(df_sentiment, col='subreddit', col_wrap=6)
grid.map(sns.histplot, 'emotion', shrink=0.8)

In [None]:
# Keep only rows with 0.04 < normalized_score < 0.05 and label each with the
# lower edge of its 0.0005-wide score bin.
# .assign() creates the `binned` column, or overwrites it if it already exists,
# on a fresh frame. No prior drop is needed: the old drop() discarded its result
# and raised KeyError when `binned` was absent.
# NOTE: this rebinds df_sentiment to the filtered frame; the cells below rely on that.
df_sentiment = (
    df_sentiment
    .loc[lambda frame: (frame['normalized_score'] > 0.04) & (frame['normalized_score'] < 0.05)]
    .assign(binned=lambda frame: 0.0005 * np.floor(frame['normalized_score'] / 0.0005))
)

In [None]:
def _counts_per_bin_and_emotion(subreddit):
    """Row counts per (binned, emotion) pair for one subreddit.

    The result is a DataFrame with columns 'binned', 'emotion' and 0, where
    column 0 holds the group size.
    """
    rows = df_sentiment[df_sentiment['subreddit'] == subreddit]
    return rows.groupby(['binned', 'emotion']).size().reset_index()


count_a = _counts_per_bin_and_emotion('aww')
count_b = _counts_per_bin_and_emotion('news')

In [None]:
def _rows_of(subreddit):
    """Rows of df_sentiment whose `subreddit` column equals the given name."""
    return df_sentiment[df_sentiment['subreddit'] == subreddit]


# One frame per subreddit of interest.
df_aww = _rows_of('aww')
df_news = _rows_of('news')
df_politics = _rows_of('politics')
df_worldnews = _rows_of('worldnews')
df_aita = _rows_of('AmItheAsshole')
df_trashy = _rows_of('trashy')
df_market = _rows_of('Market76')

In [None]:
# Mean normalized score per (subreddit, emotion) pair for aww, news, politics and worldnews.
pd.concat([df_aww, df_news, df_politics, df_worldnews]).groupby(['subreddit', 'emotion'])['normalized_score'].mean()

In [None]:
# Mean normalized score per emotion within the news subreddit.
df_news.groupby('emotion')['normalized_score'].mean()

In [None]:
# Normalized score per emotion for the news subreddit.
sns.catplot(x='emotion', y='normalized_score', data=df_news)

In [None]:
# Row count (column 0) per score bin for the aww subreddit, one line per emotion.
sns.lineplot(x=count_a['binned'], y=count_a[0], hue=count_a['emotion'])

In [None]:
# Row count (column 0) per score bin for the news subreddit, one line per emotion.
sns.lineplot(x=count_b['binned'], y=count_b[0], hue=count_b['emotion'])