In [1]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Step 1: Install the required libraries
! pip install transformers 

import pickle
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from transformers import BertTokenizer,BertForSequenceClassification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from torch import cuda
device = torch.device('cuda' if cuda.is_available() else 'cpu')

print(f"Current device: {device}")

Current device: cuda


In [5]:
# Step 2: Load the fine-tuned BERT model and its tokenizer

model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/finetuned_bert",num_labels=6)
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/finetuned_bert', do_lower_case=True)

In [22]:
# Step 3: Load and preprocess your CSV dataset
import pandas as pd
predict_data = pd.read_csv("/content/drive/MyDrive/preprocessed_Reddit_Data_1.csv")

In [23]:
columns_to_keep = ['Title']
predict_data = predict_data[columns_to_keep]

In [24]:
class_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label_df = pd.DataFrame(columns=class_names)
test_data = pd.concat([predict_data, label_df], axis=1)
test_data[class_names] = -1  # Set label column values to -1
test_data = test_data.replace(-1, 0)  # Change -1 to 0 in the entire DataFrame

cols = test_data.columns
label_cols = list(cols[1:])

test_data = test_data[~test_data[class_names].eq(-1).any(axis=1)]  # Remove irrelevant rows/comments with -1 values
test_data['labels'] = list(test_data[class_names].values)

In [25]:
test_data

Unnamed: 0,Title,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels
0,UkrainianConflict Discussion Megathread,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
1,Zelenskyy survives over 12 assassination attem...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
2,In the first round of presidential elections i...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
3,"A further 20,000 Ukrainian recruits will be tr...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
4,"Zelensky: ""Tanks, APCs and artillery are exact...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...
971,Hundreds of US military vehicles arrive in Dut...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
972,BREAKING: Poland will deliver a company of Leo...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
973,Russian airline aircraft suffer massive breakd...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
974,"The Russian Federation declared that it ""has t...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [26]:
# Gathering input data
test_labels = list(test_data.labels.values)
test_comments = list(test_data.Title.values)

In [27]:
# Encoding input data
test_encodings = tokenizer.batch_encode_plus(test_comments,max_length=128,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [28]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)
# Save test dataloader
torch.save(test_dataloader,'test_data_loader')

In [29]:
# Test

model.to(device)

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  tokenized_texts.append(b_input_ids)
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==0 for tl in true_labels]

In [30]:
pred_bools = [pl>0.40 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')

Test F1 Accuracy:  0.8663246539541187
Test Flat Accuracy:  0.039959016393442626 



In [31]:
idx2label = dict(zip(range(6),label_cols))
print(idx2label)

{0: 'toxic', 1: 'severe_toxic', 2: 'obscene', 3: 'threat', 4: 'insult', 5: 'identity_hate'}


In [32]:
import numpy as np

# Getting indices of where boolean one hot vector true_bools is True so we can use idx2label to gather label names
true_label_idxs, pred_label_idxs=[],[]
for vals in true_bools:
  true_label_idxs.append(np.where(vals)[0].flatten().tolist())
for vals in pred_bools:
  pred_label_idxs.append(np.where(vals)[0].flatten().tolist())

In [33]:
# Gathering vectors of label names using idx2label
true_label_texts, pred_label_texts = [], []
for vals in true_label_idxs:
  true_label_texts.append([idx2label[val] if val in idx2label else None for val in vals])

for vals in pred_label_idxs:
  pred_label_texts.append([idx2label[val] if val in idx2label else None for val in vals])

In [34]:
# Decoding input ids to reviews
reviews = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

In [35]:
# Converting lists to df
comparisons_df = pd.DataFrame({'reviews': reviews,'pred_labels':pred_label_texts})
comparisons_df.to_csv('/content/drive/MyDrive/test_predictions.csv')
comparisons_df

Unnamed: 0,reviews,pred_labels
0,ukrainianconflict discussion megathread,"[toxic, severe_toxic, threat, insult, identity..."
1,zelenskyy survives over 12 assassination attem...,"[toxic, severe_toxic, threat, insult, identity..."
2,in the first round of presidential elections i...,"[toxic, severe_toxic, threat, identity_hate]"
3,"a further 20 , 000 ukrainian recruits will be ...","[toxic, severe_toxic, threat, identity_hate]"
4,"zelensky : "" tanks , apcs and artillery are ex...","[toxic, severe_toxic, threat, insult, identity..."
...,...,...
971,hundreds of us military vehicles arrive in dut...,"[toxic, severe_toxic, threat, insult, identity..."
972,breaking : poland will deliver a company of le...,"[toxic, severe_toxic, threat, insult, identity..."
973,russian airline aircraft suffer massive breakd...,"[toxic, severe_toxic, threat, identity_hate]"
974,"the russian federation declared that it "" has ...","[toxic, severe_toxic, threat, identity_hate]"
