In [None]:
# Mount Google Drive at /content/gdrive; the project folder used below lives there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Work from the course folder on Drive; the relative paths in later cells are resolved against it.
%cd "/content/gdrive/MyDrive/CSC 583 Text Retrieval"

/content/gdrive/.shortcut-targets-by-id/1rxvOfGO8GUhjrZK8B0UxXwAOPf-cjgvi/CSC 583 Text Retrieval


In [None]:
import pandas as pd

In [None]:
# Dependencies: transformers (tokenizer, model, Trainer) and datasets (F1 metric loader).
# NOTE(review): versions are unpinned; the last saved run of this cell installed
# transformers 4.28.1 — consider pinning once the target runtime is confirmed.
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [None]:
# Load the tokenizer published with the 'vinai/bertweet-base' checkpoint; it is the
# module-level `tokenizer` that create_dataset() uses to encode tweets.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base', use_fast=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Load Test dataset

In [None]:
# Path of the annotated test set, relative to the current working directory.
# NOTE(review): the last saved run of this cell ended in FileNotFoundError — confirm the
# file name (possibly a missing extension) before relying on the cells below.
DATA  = "./Dataset/annotated-100-tweets"
df = pd.read_csv(DATA)
df

FileNotFoundError: ignored

In [None]:
# Keep only the tweet text and its hate_speech annotation column.
df = df[['tweet', 'hate_speech']]
df

Unnamed: 0,tweet,hate_speech
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0
...,...,...
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,0
24779,"you've gone and broke the wrong heart baby, an...",0
24780,young buck wanna eat!!.. dat nigguh like I ain...,0
24781,youu got wild bitches tellin you lies,0


In [None]:
#create custom dataset 
import torch
from torch.utils.data import Dataset
class TweetDataset(Dataset):
    """Map-style dataset pairing per-example encoder features with a label.

    ``encodings`` is a mapping from feature name to an indexable sequence and
    ``labels`` is an indexable sequence; both are indexed with the same ``idx``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is taken from the labels, not from the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoded feature at ``idx`` plus its ``'label'``."""
        sample = dict((name, column[idx]) for name, column in self.encodings.items())
        sample.update(label=self.labels[idx])
        return sample

In [None]:
MAX_LENGTH = 128
def create_dataset(dataframe, label_column=None, max_length=MAX_LENGTH):
  """Tokenize the ``tweet`` column of ``dataframe`` and wrap it in a ``TweetDataset``.

  Each tweet is encoded on its own with the module-level ``tokenizer``, requesting
  ``padding='max_length'`` and ``truncation=True`` at ``max_length`` tokens.

  Args:
    dataframe: frame with a ``tweet`` column holding the raw text.
    label_column: optional name of a column holding the gold labels. When omitted
      (the default) every row receives the placeholder label 0, which is only
      meaningful for pure inference.
    max_length: token length passed to the tokenizer; defaults to ``MAX_LENGTH``.

  Returns:
    A ``TweetDataset`` with one entry per row of ``dataframe``.
  """
  inputs = {
          "input_ids":[],
          "attention_mask":[]
        }

  for sent in dataframe['tweet'].values.tolist():
    tokenized_input = tokenizer(sent, max_length=max_length, padding='max_length', truncation=True)
    inputs["input_ids"].append(torch.tensor(tokenized_input["input_ids"]))
    inputs["attention_mask"].append(torch.tensor(tokenized_input["attention_mask"]))

  if label_column is None:
    # Placeholder labels: one zero per row.
    labels = torch.tensor([0]*dataframe.shape[0])
  else:
    labels = torch.tensor(dataframe[label_column].values.tolist())

  return TweetDataset(inputs, labels)

# Encode the test dataframe. As called here, create_dataset assigns a placeholder
# label of 0 to every row rather than the hate_speech annotation.
test_dataset = create_dataset(df)


## Load Model

In [None]:
# Load the saved class weights onto whichever device this runtime actually has, so the
# tensor deserializes on a CPU-only machine even if it was saved from a GPU session,
# and sits on the GPU when one is available.
class_weights = torch.load(
    './NN_class_weights.t',
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

In [None]:
from sklearn.utils import compute_class_weight
import torch.nn as nn
from transformers import Trainer, TrainingArguments
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
def get_class_weights(dataframe,LABEL_COLUMN, num_labels=2):
  """computes the class weight and returns a tensor to account for class imbalance

  Weights are scikit-learn's ``'balanced'`` class weights computed from the values in
  ``dataframe[LABEL_COLUMN]``.

  Args:
    dataframe: frame containing the label column.
    LABEL_COLUMN: name of the column holding the class ids.
    num_labels: number of classes the model predicts (default 2). Class ids in
      ``range(num_labels)`` that never occur in the data get a weight of 1.

  Returns:
    A float tensor with ``num_labels`` entries, moved to the module-level ``device``.
  """
  # Imported locally so the function does not rely on a later cell having imported numpy.
  import numpy as np

  y = torch.tensor(dataframe[LABEL_COLUMN].values.tolist()).numpy()
  classes = np.unique(y)
  balanced = compute_class_weight(class_weight='balanced', classes=classes, y=y)
  weight_by_class = dict(zip(classes, balanced))
  # class_weight 1 for unseen labels
  total_class_weights = [weight_by_class.get(i, 1) for i in range(num_labels)]
  return torch.tensor(total_class_weights, dtype=torch.float).to(device)

def create_custom_trainer(class_weights):
  """creates custom trainer that accounts for class imbalance

  Args:
    class_weights: tensor of per-class weights handed to ``nn.CrossEntropyLoss``.

  Returns:
    A ``Trainer`` subclass (the class itself, not an instance) whose ``compute_loss``
    uses cross-entropy weighted by ``class_weights``.
  """
  class CustomTrainer(Trainer):
      def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
          # num_items_in_batch is accepted and ignored: newer transformers releases
          # pass it to compute_loss, older ones never supply it.
          labels = inputs.get("labels")
          # forward pass
          outputs = model(**inputs)
          logits = outputs.get("logits")
          # compute custom loss; the weights are moved to the logits' device so a
          # weight tensor living on another device does not break the loss call
          loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
          loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
          return (loss, outputs) if return_outputs else loss
  return CustomTrainer
# Build the Trainer subclass bound to the class weights loaded in the previous cell.
CustomTrainer = create_custom_trainer(class_weights)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
import numpy as np
from datasets import load_metric
# Module-level F1 metric object; compute_metrics() below calls its .compute().
f1_metric =load_metric("f1")
def compute_metrics(eval_pred):
  """Compute F1 for a ``(logits, labels)`` evaluation pair.

  Args:
    eval_pred: a pair ``(logits, labels)``; the predicted class of each row is the
      argmax of ``logits`` along axis 1.

  Returns:
    Whatever ``f1_metric.compute`` returns for those predictions and references.
  """
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=1)
  # f1_metric is the module-level metric. It must never be assigned inside this
  # function: any assignment (even an unreachable one) makes the name local and turns
  # this call into an UnboundLocalError.
  return f1_metric.compute(predictions=predictions, references=labels)


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
num_labels = 2 # binary classification
path = "./my_models/improved_bertweet"
# Let the checkpoint's own config select the architecture instead of forcing the BERT
# class. If the checkpoint was saved as BERT this loads the same class as before; if
# it was saved with a different architecture, the matching class is used.
# NOTE(review): the tokenizer in this notebook is 'vinai/bertweet-base' — confirm which
# architecture the checkpoint at `path` was fine-tuned with.
model = AutoModelForSequenceClassification.from_pretrained(path)

In [None]:
# Prediction-only configuration: training disabled, prediction enabled, and an
# evaluation batch size of 16 per device. Trainer output goes to ./classifier_prediction.
from transformers import Trainer, TrainingArguments
output_dir="./classifier_prediction"
test_args = TrainingArguments(
    output_dir = output_dir,
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
)

# No train/eval datasets are attached here; the dataset is supplied when predict() is called.
test_trainer = CustomTrainer( 
    model=model,
    args=test_args,
    compute_metrics=compute_metrics
)

## Apply the model to the test dataset
### The test dataset is the annotated set of 100 tweets

In [None]:
# Run the trainer over the encoded test set; `output.predictions` is consumed by the
# cells that follow.
# NOTE(review): the last saved run of this cell was interrupted (KeyboardInterrupt).
output = test_trainer.predict(test_dataset)
output

KeyboardInterrupt: ignored

In [None]:
import torch.nn.functional as F
import numpy as np
# Derive class probabilities (softmax over the last axis) and hard labels (argmax over
# axis 1) from the prediction array returned by the trainer.
predictions = output.predictions
probabilities = F.softmax(torch.from_numpy(predictions), dim=-1)
pred_labels = np.argmax(predictions, axis=1)

In [None]:
# Save the prediction result: the array in output.predictions (not the argmax labels)
# is written to a .npy file in the working directory.
import numpy as np
np.save('./classifier_100_tweets_test_dataset_prediction.npy', output.predictions) # save

In [None]:
# calculate precision and recall
from sklearn.metrics import classification_report
# Gold labels come from the same dataframe (and therefore the same row order) that
# test_dataset was built from, so they line up with pred_labels.
# NOTE(review): assumes `hate_speech` holds the class ids the model predicts — confirm
# against the annotation scheme.
gold_labels = df['hate_speech'].values
print(classification_report(gold_labels, pred_labels))