In [52]:
import pandas as pd
from transformers import RobertaTokenizer, AdamW
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import numpy as np
import tensorflow as tf
from torch.nn import BCEWithLogitsLoss, BCELoss
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from tqdm import tqdm, trange
from ast import literal_eval

In [41]:
from transformers import RobertaForSequenceClassification

In [3]:
# load training data
df = pd.read_csv('./data/training_set.csv', delimiter=',')
df.head()

Unnamed: 0,sentence,labels
0,Every actor along the agriculture supply chain...,2
1,The recent increase in food insecurity was pri...,2
2,Number of companies publishing sustainability ...,12
3,Much work has been done recently to improve ci...,16
4,"Of the 172 countries that reported in 2018, 60...",6


In [4]:
print('Unique comments: ', df.sentence.nunique() == df.shape[0])
print('Null values: ', df.isnull().values.any())
print('average sentence length: ', df.sentence.str.split().str.len().mean())
print('stdev sentence length: ', df.sentence.str.split().str.len().std())

Unique comments:  False
Null values:  False
average sentence length:  22.248170094109447
stdev sentence length:  11.464921040506532


In [5]:
pd.set_option('display.max_rows', 1000)
df[df.sentence.duplicated(keep=False)].sort_values('sentence')

Unnamed: 0,sentence,labels
46,Access to financial services.,10
1466,Access to financial services.,9
1442,Access to quality essential health care servic...,1
938,Access to quality essential health care servic...,3
5151,Adopt the Children’s Rights and Business Princ...,4
2249,Adopt the Children’s Rights and Business Princ...,5
3860,Apply these principles to help maximize the po...,4
3593,Apply these principles to help maximize the po...,5
3290,Availability of a skilled workforce.,4
2177,Availability of a skilled workforce.,8


In [6]:
df.labels.value_counts()

0     600
17    470
3     410
16    365
6     334
15    331
12    324
2     295
5     291
11    287
8     277
1     274
10    268
4     258
13    257
14    235
7     235
9     227
Name: labels, dtype: int64

In [7]:
# RoBERTa model requires single column one-hot-encoded labels 
# this function returns this format from the usual numeric class label
def one_hot_encode(class_number):
    """
    Gets an integer as argument representing the target class label. 
    As RoBERTa needs the targets in tuple one_hot_encoded format, 
    a tuple is returned which has a 1 at the position of the target class. 
    E.g. 2 -> (0,0,1,0,0,...)
    """
    num_classes = 18
    t = [0 for i in range(num_classes)]
    t[class_number] = 1
    return tuple(t)

# apply one hot encoding.
df['one_hot_labels'] = df['labels'].apply(lambda x: one_hot_encode(x))
df.head()

Unnamed: 0,sentence,labels,one_hot_labels
0,Every actor along the agriculture supply chain...,2,"(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,The recent increase in food insecurity was pri...,2,"(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Number of companies publishing sustainability ...,12,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,Much work has been done recently to improve ci...,16,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"Of the 172 countries that reported in 2018, 60...",6,"(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
sentences = list(df.sentence)
labels = list(df.one_hot_labels)

In [27]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False) # tokenizer
#encoded_input = tokenizer(sentences, return_tensors='pt')
encodings = tokenizer.batch_encode_plus(sentences, return_token_type_ids=True, padding=True)
#encodings = tokenizer.batch_encode_plus(comments,max_length=max_length,pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [28]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
token_type_ids = encodings['token_type_ids'] # token type ids
attention_masks = encodings['attention_mask'] # attention masks

In [31]:
# Use train_test_split to split our data into train and validation sets

train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids, attention_masks, random_state=42, test_size=0.10, stratify = labels)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

In [66]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, )

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

nb_train_inputs = len(train_inputs)

In [39]:
torch.save(validation_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

In [43]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=18)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [54]:
optimizer = AdamW(model.parameters(), lr=2e-5)  # Default optimization

In [60]:
import time

In [70]:
def calculate_remaining_time(start, time, nb_training_data):
    t_diff = time - start
    frac_todo = (nb_train_inputs - nb_training_data) / nb_train_inputs
    frac_done = 1 - frac_todo
    t_est = (t_diff / frac_done) - t_diff
    print('Est. Remaining Time Till Epoch: {}min'.format(round(t_est/60, 2)))

In [72]:
# Store our loss and accuracy for plotting
train_loss_set = []
num_labels=18
# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Training
    start = time.time()

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0 #running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        #batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        
        # # Forward pass for multiclass classification
        # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # loss = outputs[0]
        # logits = outputs[1]

        # Forward pass for multilabel classification
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        loss_func = BCEWithLogitsLoss() 
        loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        # loss_func = BCELoss() 
        # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        train_loss_set.append(loss.item())  
        
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # scheduler.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        calculate_remaining_time(start, time.time(), nb_tr_examples)

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    ###############################################################################

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Variables to gather full output
    logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

    # Predict
    for i, batch in enumerate(validation_dataloader):
        #batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            # Forward pass
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()

        tokenized_texts.append(b_input_ids)
        logit_preds.append(b_logit_pred)
        true_labels.append(b_labels)
        pred_labels.append(pred_label)

    # Flatten outputs
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl>threshold for pl in pred_labels]
    true_bools = [tl==1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|                                                                  | 0/3 [00:00<?, ?it/s]

Est. Remaining Time Till Epoch: 87.53min
Est. Remaining Time Till Epoch: 87.16min
Est. Remaining Time Till Epoch: 86.28min
Est. Remaining Time Till Epoch: 85.67min
Est. Remaining Time Till Epoch: 85.14min
Est. Remaining Time Till Epoch: 84.49min
Est. Remaining Time Till Epoch: 83.88min
Est. Remaining Time Till Epoch: 83.73min
Est. Remaining Time Till Epoch: 83.29min
Est. Remaining Time Till Epoch: 83.23min
Est. Remaining Time Till Epoch: 82.7min
Est. Remaining Time Till Epoch: 82.11min
Est. Remaining Time Till Epoch: 81.52min
Est. Remaining Time Till Epoch: 80.97min
Est. Remaining Time Till Epoch: 80.41min
Est. Remaining Time Till Epoch: 79.82min
Est. Remaining Time Till Epoch: 79.23min
Est. Remaining Time Till Epoch: 78.68min
Est. Remaining Time Till Epoch: 78.09min
Est. Remaining Time Till Epoch: 77.56min
Est. Remaining Time Till Epoch: 76.97min
Est. Remaining Time Till Epoch: 76.4min
Est. Remaining Time Till Epoch: 75.84min
Est. Remaining Time Till Epoch: 75.28min
Est. Remaining Tim

Epoch:  33%|█████████████████▎                                  | 1/3 [1:31:39<3:03:18, 5499.44s/it]

F1 Validation Accuracy:  68.98803046789989
Flat Validation Accuracy:  55.22648083623694
Est. Remaining Time Till Epoch: 89.33min
Est. Remaining Time Till Epoch: 88.49min
Est. Remaining Time Till Epoch: 87.98min
Est. Remaining Time Till Epoch: 87.66min
Est. Remaining Time Till Epoch: 86.87min
Est. Remaining Time Till Epoch: 86.28min
Est. Remaining Time Till Epoch: 85.66min
Est. Remaining Time Till Epoch: 85.04min
Est. Remaining Time Till Epoch: 84.4min
Est. Remaining Time Till Epoch: 83.85min
Est. Remaining Time Till Epoch: 83.29min
Est. Remaining Time Till Epoch: 82.66min
Est. Remaining Time Till Epoch: 82.04min
Est. Remaining Time Till Epoch: 81.58min
Est. Remaining Time Till Epoch: 81.0min
Est. Remaining Time Till Epoch: 80.51min
Est. Remaining Time Till Epoch: 80.06min
Est. Remaining Time Till Epoch: 79.52min
Est. Remaining Time Till Epoch: 78.99min
Est. Remaining Time Till Epoch: 78.58min
Est. Remaining Time Till Epoch: 78.15min
Est. Remaining Time Till Epoch: 77.65min
Est. Remaini

Epoch:  67%|██████████████████████████████████▋                 | 2/3 [3:04:07<1:32:07, 5527.82s/it]

F1 Validation Accuracy:  78.44254510921176
Flat Validation Accuracy:  71.95121951219512
Est. Remaining Time Till Epoch: 88.63min
Est. Remaining Time Till Epoch: 87.68min
Est. Remaining Time Till Epoch: 86.86min
Est. Remaining Time Till Epoch: 86.37min
Est. Remaining Time Till Epoch: 85.95min
Est. Remaining Time Till Epoch: 85.31min
Est. Remaining Time Till Epoch: 84.92min
Est. Remaining Time Till Epoch: 84.38min
Est. Remaining Time Till Epoch: 83.81min
Est. Remaining Time Till Epoch: 83.27min
Est. Remaining Time Till Epoch: 82.77min
Est. Remaining Time Till Epoch: 82.21min
Est. Remaining Time Till Epoch: 81.66min
Est. Remaining Time Till Epoch: 81.09min
Est. Remaining Time Till Epoch: 80.52min
Est. Remaining Time Till Epoch: 80.02min
Est. Remaining Time Till Epoch: 79.45min
Est. Remaining Time Till Epoch: 78.87min
Est. Remaining Time Till Epoch: 78.33min
Est. Remaining Time Till Epoch: 77.77min
Est. Remaining Time Till Epoch: 77.22min
Est. Remaining Time Till Epoch: 76.67min
Est. Remai

Epoch: 100%|██████████████████████████████████████████████████████| 3/3 [4:36:08<00:00, 5522.87s/it]

F1 Validation Accuracy:  81.56934306569343
Flat Validation Accuracy:  77.87456445993031





#### Prediction on Test Set

In [121]:
test_df = pd.read_csv('./data/dev_test_set_u.csv')
print('Null values: ', test_df.isnull().values.any()) #should not be any null sentences or labels
test_df.head()

Null values:  False


Unnamed: 0,id,comp_name,sentence
0,07e45bf1b3a39e5d,engie,Our customers benefit from our energy-efficien...
1,07e45bf1b3a39e5d,engie,ENGIE are recruiting for a Mobile Contract Sup...
2,07e45bf1b3a39e5d,engie,"This is a permanent, full time role working 40..."
3,07e45bf1b3a39e5d,engie,"On offer is a salary of £35,000 - £37,500 depe..."
4,07e45bf1b3a39e5d,engie,Safe Supervision of all personnel including th...


In [123]:
dev_test = pd.read_csv('./data/dev_test_set_u.csv')
test_df = dev_test.iloc[:2000, :]
print('Null values: ', test_df.isnull().values.any()) #should not be any null sentences or labels
test_df.head()

Null values:  False


Unnamed: 0,id,comp_name,sentence
0,07e45bf1b3a39e5d,engie,Our customers benefit from our energy-efficien...
1,07e45bf1b3a39e5d,engie,ENGIE are recruiting for a Mobile Contract Sup...
2,07e45bf1b3a39e5d,engie,"This is a permanent, full time role working 40..."
3,07e45bf1b3a39e5d,engie,"On offer is a salary of £35,000 - £37,500 depe..."
4,07e45bf1b3a39e5d,engie,Safe Supervision of all personnel including th...


In [None]:
#test_df = test_df[~test_df[test_label_cols].eq(-1).any(axis=1)] #remove irrelevant rows/comments with -1 values
#test_df['one_hot_labels'] = list(test_df[test_label_cols].values)
#test_df.head()

In [76]:
# Gathering input data
#test_labels = list(test_df.one_hot_labels.values)
#test_comments = list(test_df.comment_text.values)
test_sentences = list(test_df.sentence.values)

In [124]:
# Encoding input data
test_encodings = tokenizer.batch_encode_plus(test_sentences, return_token_type_ids=True, padding=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [125]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
#test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
#test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_data = TensorDataset(test_inputs, test_masks, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [127]:
counter = 0
def predict(sentence):
    global counter
    test_encodings = tokenizer.encode_plus(sentence, return_token_type_ids=True, padding=True)
    b_input_ids = torch.tensor([test_encodings['input_ids']])
    b_input_mask = torch.tensor([test_encodings['token_type_ids']])
    b_token_types = torch.tensor([test_encodings['attention_mask']])
    
    with torch.no_grad():
        # Forward pass
        outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()
        prediction = list(pred_label[0]).index(max(list(pred_label[0]))) 
        
        counter += 1
        if (counter % 10) == 0: 
            print(counter)
        return prediction

In [140]:
dev_test.shape[0]

78653

In [128]:
model.eval()
test_df['model_most_likely'] = test_df['sentence'].apply(lambda x: predict(x))

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300
1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500
1510
1520
1530
1540
1550
1560
1570
1580
1590
1600
1610
1620
1630
1640
1650
1660
1670
1680
1690
1700
1710
1720
1730
1740
1750
1760
1770
1780
1790
1800
1810
1820
1830
1840
1850
1860
1870
1880
1890
1900
1910
1920
1930
1940
1950
1960
1970
1980
1990
2000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['model_most_likely'] = test_df['sentence'].apply(lambda x: predict(x))


In [144]:
a = dev_test.shape[0]
merged = dev_test.merge(test_df[['sentence', 'model_most_likely']], on='sentence', how='outer')
b = merged.shape[0]
print(a == b)
c = merged[~merged['model_most_likely'].isnull()].shape[0]
print(c == 2000)

True
True
2000


In [145]:
merged.to_csv('./data/dev_test_set_u_prelabelled.csv', index=False)

In [86]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
    #batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_token_types = batch
    with torch.no_grad():
        # Forward pass
        outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()

        #b_labels = b_labels.to('cpu').numpy()
        sentences = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for ids in b_input_ids]
        preds = [list(pred).index(max(list(pred))) for pred in pred_label]
        for s, p in zip(sentences, preds):
            print(('-----\n'
                  'Sentence:\n'
                  '{}\n'
                  '-----\n'
                  'Most likely class: {}\n'
                  '-----\n').format(s, p))
            true_label = input('Please enter correct class: ')
            print('-----\n')
            true_labels.append(list(one_hot_encode(int(true_label))))
            print(true_labels)
        
    add_to_df
        
    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

-----
Sentence:
Purpose and overall relevance to the organization
-----
Most likely class: 0
-----



KeyboardInterrupt: Interrupted by user

In [None]:
pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

#### Create Output dataframe

In [None]:
idx2label = dict(zip(range(18),label_cols))
print(idx2label)

In [None]:
# Getting indices of where boolean one hot vector true_bools is True so we can use idx2label to gather label names
true_label_idxs, pred_label_idxs=[],[]
for vals in true_bools:
    true_label_idxs.append(np.where(vals)[0].flatten().tolist())
for vals in pred_bools:
    pred_label_idxs.append(np.where(vals)[0].flatten().tolist())

In [None]:
# Decoding input ids to comment text
comment_texts = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

In [None]:
# Converting lists to df
comparisons_df = pd.DataFrame({'comment_text': comment_texts, 'true_labels': true_label_texts, 'pred_labels':pred_label_texts})
comparisons_df.to_csv('comparisons.csv')
comparisons_df.head()

#### Optimize Threshold

In [None]:
# Gathering vectors of label names using idx2label
true_label_texts, pred_label_texts = [], []
for vals in true_label_idxs:
    if vals:
    true_label_texts.append([idx2label[val] for val in vals])
    else:
    true_label_texts.append(vals)

for vals in pred_label_idxs:
    if vals:
    pred_label_texts.append([idx2label[val] for val in vals])
    else:
    pred_label_texts.append(vals)

In [None]:
# Calculate Accuracy - maximize F1 accuracy by tuning threshold values. First with 'macro_thresholds' on the order of e^-1 then with 'micro_thresholds' on the order of e^-2

macro_thresholds = np.array(range(1,10))/10

f1_results, flat_acc_results = [], []
for th in macro_thresholds:
    pred_bools = [pl>th for pl in pred_labels]
    test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
    test_flat_accuracy = accuracy_score(true_bools, pred_bools)
    f1_results.append(test_f1_accuracy)
    flat_acc_results.append(test_flat_accuracy)

best_macro_th = macro_thresholds[np.argmax(f1_results)] #best macro threshold value

micro_thresholds = (np.array(range(10))/100)+best_macro_th #calculating micro threshold values

f1_results, flat_acc_results = [], []
for th in micro_thresholds:
    pred_bools = [pl>th for pl in pred_labels]
    test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
    test_flat_accuracy = accuracy_score(true_bools, pred_bools)
    f1_results.append(test_f1_accuracy)
    flat_acc_results.append(test_flat_accuracy)

best_f1_idx = np.argmax(f1_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_f1_idx])
print('Test F1 Accuracy: ', f1_results[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_f1_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_f1_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label_cols)
pickle.dump(clf_report_optimized, open('classification_report_optimized.txt','wb'))
print(clf_report_optimized)