In [None]:
import csv
import json

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import optimization

In [None]:
# Load the emotion-group mappings (group name -> list of fine-grained emotions)
# and the ordered list of fine-grained emotion names.
# NOTE(review): ekman_mapping_file, sentiment_mapping_file and emotions_file are
# not defined in the visible part of this notebook; they must be set (ideally in
# a configuration cell) before this cell runs.
with open(ekman_mapping_file, "r", encoding="utf-8") as f:
    ekman_mapping = json.load(f)
# 'neutral' is added as a group containing only itself.
ekman_mapping['neutral'] = ['neutral']

with open(sentiment_mapping_file, "r", encoding="utf-8") as f:
    sentiment_mapping = json.load(f)
sentiment_mapping['neutral'] = ['neutral']

# One emotion name per line; blank lines (e.g. a trailing newline at the end of
# the file) are skipped so they cannot become an empty-string label.
with open(emotions_file, "r", encoding="utf-8") as f:
    emotions = [line.strip() for line in f if line.strip()]

In [None]:
def _invert_grouping(grouping):
    """Map every member listed under a group back to that group's name."""
    inverted = {}
    for group_name, members in grouping.items():
        for member in members:
            inverted[member] = group_name
    return inverted


def _number_keys(grouping):
    """Assign consecutive integer ids to the keys of ``grouping`` in iteration order."""
    return {group_name: class_id for class_id, group_name in enumerate(grouping)}


# Two-way lookup between fine-grained emotion names and their list positions.
emotion_to_index = {name: position for position, name in enumerate(emotions)}
index_to_emotion = dict(enumerate(emotions))

# Fine-grained emotion -> Ekman group, and Ekman group -> integer class id.
ekman_mapping_reverse = _invert_grouping(ekman_mapping)
ekman_classes = _number_keys(ekman_mapping)

# Fine-grained emotion -> sentiment group, and sentiment group -> integer class id.
sentiment_mapping_reverse = _invert_grouping(sentiment_mapping)
sentiment_classes = _number_keys(sentiment_mapping)

In [None]:
def get_ekman_from_emotion_index(idx):
    """Translate a fine-grained emotion index into its Ekman class id.

    Args:
        idx: Position of the emotion in the loaded emotion list; anything
            accepted by ``int()``.

    Returns:
        The integer id from ``ekman_classes`` of the Ekman group containing
        the emotion, or ``None`` when ``idx`` is not a known emotion index.
    """
    idx = int(idx)
    # Validate against the loaded table rather than a hard-coded upper bound,
    # so the check stays correct whatever the length of the emotion list.
    if idx not in index_to_emotion:
        return None
    return ekman_classes[ekman_mapping_reverse[index_to_emotion[idx]]]

def get_sentiment_from_emotion_index(idx):
    """Translate a fine-grained emotion index into its sentiment class id.

    Args:
        idx: Position of the emotion in the loaded emotion list; anything
            accepted by ``int()``.

    Returns:
        The integer id from ``sentiment_classes`` of the sentiment group
        containing the emotion, or ``None`` when ``idx`` is not a known
        emotion index.
    """
    idx = int(idx)
    # Validate against the loaded table rather than a hard-coded upper bound.
    if idx not in index_to_emotion:
        return None
    # sentiment_mapping_reverse is keyed by emotion *name*, so the index has
    # to be resolved to a name first (same chain as the Ekman lookup).
    return sentiment_classes[sentiment_mapping_reverse[index_to_emotion[idx]]]

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device "{device}"')
# The device-name query is only valid with CUDA; skip it on the CPU path.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

In [None]:
# Hub identifier shared by the tokenizer and the model configuration.
PRETRAINED_NAME = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
# NOTE(review): BERT configs usually expose `hidden_dropout_prob` /
# `attention_probs_dropout_prob`; confirm that `dropout` has any effect here.
config = AutoConfig.from_pretrained(PRETRAINED_NAME, dropout=0.1, num_labels=7)
# The model is instantiated from the configuration only; trained weights are
# expected to be loaded into it separately.
model = AutoModelForSequenceClassification.from_config(config)
model.to(device)

In [None]:
!pip install kaggle

In [None]:
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the Reddit comments dataset archive with the Kaggle CLI, then
# unpack it into the current working directory.
!kaggle datasets download smagnan/1-million-reddit-comments-from-40-subreddits
!unzip 1-million-reddit-comments-from-40-subreddits.zip

In [None]:
dataset_file = "kaggle_RC_2019-05.csv"
!head kaggle_RC_2019-05.csv

In [None]:
class RedditCommentsDataset(Dataset):
    """In-memory dataset of Reddit comments read from a CSV file.

    The first row of the file is skipped as a header. Every remaining row
    with at least four columns yields one sample: a dict with the string
    values of column 0 under ``'subreddit'``, column 1 under ``'text'`` and
    column 3 under ``'score'``. Shorter rows are dropped.
    """

    def __init__(self, file):
        """Read all usable rows of ``file`` (a path to a CSV file) into memory."""
        self.samples = []
        # newline='' lets the csv module handle line breaks inside quoted
        # fields itself; the explicit encoding avoids platform-dependent decoding.
        with open(file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader, None)  # skip the header row (no error on an empty file)
            for row in reader:
                if len(row) < 4:
                    continue
                self.samples.append({'subreddit': row[0], 'text': row[1], 'score': row[3]})

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return the sample dict stored at position ``index``."""
        return self.samples[index]

In [None]:
# Read the whole comments CSV into memory as a list of sample dicts.
dataset = RedditCommentsDataset(dataset_file)

In [None]:
# Restore the trained weights into the model built above.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU machine can also be loaded on a CPU-only host.
# NOTE: torch.load relies on pickle; only load checkpoint files you trust.
saved_model = torch.load('final.pth.tar', map_location=device)
model.load_state_dict(saved_model['model'])
# Switch to evaluation mode for inference.
model.eval()

In [None]:
# Run the classifier over every comment and collect multi-label predictions.
dataloader = DataLoader(dataset, batch_size=16)
max_len = 50      # maximum number of tokens kept per comment (longer ones are truncated)
threshold = 0.3   # a label is switched on when its sigmoid score exceeds this value
output_list = []

# Inference only: disabling autograd avoids building a graph for every batch.
with torch.no_grad():
    for batch in tqdm(dataloader):
        encoded_dict = tokenizer.batch_encode_plus(
            batch['text'],
            padding=True,
            max_length=max_len,
            truncation=True,
            return_tensors='pt',
        )
        input_ids = encoded_dict['input_ids'].to(device)
        attention_mask = encoded_dict['attention_mask'].to(device)

        output = model(input_ids, attention_mask=attention_mask)
        # Independent per-label scores (multi-label setup), then thresholded
        # in one vectorised step into lists of 0/1 Python ints.
        scores = torch.sigmoid(output[0])
        preds = (scores > threshold).int().cpu().tolist()

        for text, score, subreddit, labels in zip(
            batch['text'], batch['score'], batch['subreddit'], preds
        ):
            output_list.append(
                {
                    'text': text,
                    'score': score,
                    'subreddit': subreddit,
                    # NOTE(review): the key says "sentiment", yet the model is
                    # configured with 7 labels and the result is stored as
                    # "ekman" data; kept unchanged because it becomes a CSV
                    # column name that other code may read.
                    'sentiment_labels': labels,
                }
            )

In [None]:
# One row per comment, built in a single call from the collected records.
ekman_labeled_data = pd.DataFrame(output_list)

In [None]:
# Persist the labelled comments to the working directory, without the row index.
ekman_labeled_data.to_csv('./ekman_labeled_data.csv', index=False)