In [None]:
# Mount Google Drive at /content/gdrive (Colab-only helper).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
%cd "/content/gdrive/MyDrive/CSC 583 Text Retrieval"

In [None]:
import pandas as pd

## Prepare Model

In [None]:
import os

# CUDA-related environment switches; kept ahead of the torch import, as in the original ordering.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_USE_CUDA_DSA': '1',
    'CUDA_VISIBLE_DEVICES': '0',
})

import torch

print(torch.__version__)
print(torch.version.cuda)

# Select the compute device: CUDA when it is available, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
!pip install transformers
!pip install datasets

### Loading the dataset used to fine-tune the model

In [None]:
# CSV with the labelled tweets used for fine-tuning; the commented-out path is an alternative file.
DATA2  = "./Dataset/old_data_kaggle.csv"
#DATA2  = "./Dataset/NN_improve_tweets.csv"
df2 = pd.read_csv(DATA2)
df2

In [None]:
# Keep only the text and label columns.
# The commented-out selection presumably matches the alternative CSV above — confirm.
df2 = df2[['tweet','label']]
#df2 = df2[['text','hate']]

In [None]:
# Define the label mapping
# NOTE(review): label_map is not referenced by any other visible cell; kept for reference.
label_map = {
    'normal': 0,
    'offensive': 1,
    'hateful': 2
}

# Resolve the column names from the frame itself so this cell works with either
# column selection made above ('tweet'/'label' or 'text'/'hate') and keeps the
# names that downstream cells look up.
_text_col = 'text' if 'text' in df2.columns else 'tweet'
_label_col = 'hate' if 'hate' in df2.columns else 'label'

# Collapse the labels to binary: 1 and 2 become 1, every other value becomes 0.
output_df = pd.DataFrame({
    _text_col: df2[_text_col].tolist(),
    _label_col: df2[_label_col].isin([1, 2]).astype(int).tolist(),
})
print(output_df)
df2 = output_df

In [None]:
import numpy as np

# Split the data into training (60%), validation (20%) and test (20%) sets.
# The frame is shuffled once with a fixed seed and then cut positionally; plain
# iloc slices avoid routing a DataFrame through np.split.
shuffled_df = df2.sample(frac=1, random_state=42)
train_end = int(.6*len(df2))
dev_end = int(.8*len(df2))
train_df = shuffled_df.iloc[:train_end]
dev_df = shuffled_df.iloc[train_end:dev_end]
test_df = shuffled_df.iloc[dev_end:]
print(train_df.shape, dev_df.shape, test_df.shape)

In [None]:
#create custom dataset 
import torch
from torch.utils.data import Dataset
class TweetDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    `encodings` maps a feature name to an indexable collection of per-example
    values; `labels` is an indexable collection aligned with those values.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather the idx-th value of every encoding field, then attach the label.
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['label'] = self.labels[idx]
        return item

In [None]:
# Token length that create_dataset pads/truncates every tweet to.
MAX_LENGTH = 128
from transformers import AutoTokenizer
# Tokenizer of the 'vinai/bertweet-base' checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base', use_fast=True)

# Names of the text and label columns passed to create_dataset below.
TEXT_COL = "tweet"
LABEL_COL = "label"

# Alternative column names (disabled):
# TEXT_COL = "text"
# LABEL_COL = "hate"

def create_dataset(dataframe, TEXT_COL, LABEL_COL):
  """Tokenize the text column of `dataframe` and wrap the result in a TweetDataset.

  Every text is tokenized on its own, padded/truncated to MAX_LENGTH; the label
  column is converted into a single tensor.
  """
  id_tensors = []
  mask_tensors = []
  for text in dataframe[TEXT_COL].values.tolist():
    encoded = tokenizer(text, max_length=MAX_LENGTH, padding='max_length', truncation=True)
    id_tensors.append(torch.tensor(encoded["input_ids"]))
    mask_tensors.append(torch.tensor(encoded["attention_mask"]))

  # Same two fields, in the same order, that TweetDataset hands back per example.
  inputs = {"input_ids": id_tensors, "attention_mask": mask_tensors}
  labels = torch.tensor(dataframe[LABEL_COL].values.tolist())
  return TweetDataset(inputs, labels)

# Build one dataset per split, in train / dev / test order.
train_dataset, dev_dataset, test_dataset = (
    create_dataset(split_df, TEXT_COL, LABEL_COL)
    for split_df in (train_df, dev_df, test_df)
)
print(test_dataset)


In [None]:
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

### Hyperparameter tuning

In [None]:
!pip install optuna
!pip install datasets

In [None]:
from sklearn.utils import compute_class_weight
import torch.nn as nn
from transformers import Trainer, TrainingArguments
# Select the compute device: CUDA when it is available, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
def get_class_weights(dataframe, LABEL_COLUMN, num_labels=2):
  """Compute 'balanced' class weights for a label column to offset class imbalance.

  Args:
    dataframe: frame holding the training examples.
    LABEL_COLUMN: name of the column with the class ids.
    num_labels: number of classes to produce a weight for (default 2). Ids in
      range(num_labels) that never occur in the data get weight 1.

  Returns:
    A float tensor with `num_labels` entries, moved to the notebook-level `device`.
  """
  labels = torch.tensor(dataframe[LABEL_COLUMN].values.tolist()).numpy()
  present_classes = np.unique(labels)
  class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=labels)
  class_weight_dict = dict(zip(present_classes.tolist(), class_weights))
  # class_weight 1 for unseen labels
  total_class_weights = [class_weight_dict.get(i, 1) for i in range(num_labels)]
  return torch.tensor(total_class_weights, dtype=torch.float).to(device)

def create_custom_trainer(class_weights):
  """Build a Trainer subclass whose loss is class-weighted cross-entropy.

  Args:
    class_weights: tensor of per-class weights handed to nn.CrossEntropyLoss.

  Returns:
    The CustomTrainer class itself (not an instance).
  """
  class CustomTrainer(Trainer):
      def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
          # **kwargs absorbs extra keyword arguments (e.g. num_items_in_batch)
          # that newer transformers releases pass to compute_loss.
          labels = inputs.get("labels")
          # forward pass
          outputs = model(**inputs)
          logits = outputs.get("logits")
          # compute custom loss; the weights are moved to the logits' device so
          # the loss never mixes tensors from different devices
          loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
          loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
          return (loss, outputs) if return_outputs else loss
  return CustomTrainer
# Weight the loss using the same label column the datasets were built from,
# so switching LABEL_COL does not leave this call pointing at a stale column name.
class_weights = get_class_weights(train_df, LABEL_COL)
CustomTrainer = create_custom_trainer(class_weights)

In [None]:
from datasets import load_metric
f1_metric =load_metric("f1")
def compute_metrics(eval_pred):
  """Return the F1 metric for a (logits, labels) evaluation pair."""
  logits, labels = eval_pred
  # The predicted class is the index of the largest logit in each row.
  predicted_classes = np.argmax(logits, axis=1)
  scores = f1_metric.compute(predictions=predicted_classes, references=labels)
  return scores

In [None]:
import sklearn
from sklearn.metrics import accuracy_score   
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

import datetime
from datetime import datetime
import optuna

num_labels = 2 # binary classification
# NOTE(review): 'vinai/bertweet-base' is distributed as a RoBERTa-style checkpoint; loading it
# through a Bert* class may leave encoder weights newly initialised. Check the load-time
# warnings and confirm whether AutoModelForSequenceClassification is the intended loader.
model = BertForSequenceClassification.from_pretrained('vinai/bertweet-base',num_labels = num_labels)


def objective(trial):
    """Optuna objective: fine-tune once with sampled hyperparameters and score on the dev set.

    Args:
        trial: the optuna trial used to sample the hyperparameters.

    Returns:
        The F1 value computed on `dev_dataset` after training.
    """
    # Define hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 1e-5,1e-3, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3,3)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32, 64])
    per_device_eval_batch_size = per_device_train_batch_size

    output_dir = "./results_old/"+str(datetime.now())
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        push_to_hub=False,
        logging_dir="./logs",
    )

    # Start every trial from the pretrained checkpoint. Sharing one module-level
    # model would let each trial continue from the weights left behind by the
    # previous trial, so the trials would not be comparable.
    trial_model = BertForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=num_labels)

    # Train the model
    trainer = CustomTrainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    output = trainer.predict(dev_dataset)
    predictions = np.argmax(output.predictions, axis=1)
    f1 =  f1_metric.compute(predictions=predictions, references=output.label_ids)['f1']
    print(f1)
    return f1

# Run the search: 10 trials, maximising the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Report how many trials ran and the best one found.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

### Train the model with the best-performing hyperparameters

In [None]:
import datetime
from datetime import datetime
from transformers import Trainer, TrainingArguments
from datasets import load_metric
f1_metric =load_metric("f1")
def compute_metrics(eval_pred):
  """Return the F1 metric for a (logits, labels) evaluation pair."""
  logits, labels = eval_pred
  # The predicted class is the index of the largest logit in each row.
  predicted_classes = np.argmax(logits, axis=1)
  scores = f1_metric.compute(predictions=predicted_classes, references=labels)
  return scores
# Define the training arguments
learning_rate = 2e-5
epochs = 3
batch_size = 16

# Checkpoints are written to a directory named after the current timestamp.
output_dir = f"./results_old/{datetime.now()}"

training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='./logs',
    do_train=True,
    do_eval=True,
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=10,
    # Evaluate every 500 steps; at the end, reload the checkpoint with the best 'f1'.
    evaluation_strategy='steps',
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

num_labels = 2 # binary classification
model = BertForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=num_labels)

# Define the trainer (class-weighted loss via CustomTrainer)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

### Load the fine-tuned model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load the fine-tuned model from a saved checkpoint (no tokenizer is loaded in this cell).
num_labels = 2 # binary classification
# Checkpoint directory of one specific earlier run; it has to be updated after re-training.
path = "./results_old/2023-05-02 14:23:11.142440/checkpoint-3500"
model = BertForSequenceClassification.from_pretrained(path)

In [None]:
# Inference-only configuration: no training, prediction enabled, eval batches of 16.
test_args = TrainingArguments(
    output_dir="./prediction-results",
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=16,
)

# Trainer wrapping `model`, with the F1 metric function attached.
test_trainer = Trainer(model=model, args=test_args, compute_metrics=compute_metrics)

In [None]:
# Evaluate through `test_trainer`, which wraps the checkpoint loaded above. The
# training-time `trainer` only exists if the training cell ran in this kernel
# session, so relying on it would make this cell depend on hidden state.
results = test_trainer.evaluate(test_dataset)
results

In [None]:
# get precision and recall scores
import torch.nn.functional as F
# Predict with the trainer built around the loaded checkpoint rather than the
# training-time `trainer`, which may not exist in a fresh kernel.
output = test_trainer.predict(test_dataset)
probabilities = F.softmax(torch.from_numpy(output.predictions), dim=-1)
pred_labels = np.argmax(output.predictions, axis=1)

# get the gold labels of the test dataset, read from the same column the datasets
# were built from; any value other than 0 or 1 is folded into class 1
gold_labels = [x if x in (0, 1) else 1 for x in test_df[LABEL_COL].values.tolist()]

In [None]:
from sklearn.metrics import classification_report
# Print the classification report comparing the predictions with the gold labels.
print(classification_report(gold_labels, pred_labels))

## Apply the fine-tuned model to extract hate-speech tweets and generate a query

In [None]:
# CSV that the fine-tuned model is applied to in the cells below.
DATA  = "./Dataset/davidson_data.csv"
df = pd.read_csv(DATA)
df

In [None]:
# Keep only the tweet text and the 'hate_speech' column; every other column is dropped here.
df = df[['tweet', 'hate_speech']]
df

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

num_labels = 2 # binary classification
# Alternative model directory (disabled):
# path = "/content/gdrive/MyDrive/CSC 583 Text Retrieval/my_models/improved_bertweet" # recent dataset finetuned 
# Absolute Drive path of the model directory loaded below.
path = "/content/gdrive/MyDrive/CSC 583 Text Retrieval/combined_1_1_finetuned"
model = BertForSequenceClassification.from_pretrained(path)


In [None]:
# apply the model on the original dataset
MAX_LENGTH = 128
def create_dataset(dataframe, TEXT_COL='tweet', LABEL_COL=None):
  """Tokenize the text column of `dataframe` and wrap the result in a TweetDataset.

  This redefinition stays call-compatible with the three-argument version used
  for the training splits, so re-running an earlier cell after this one still works.

  Args:
    dataframe: frame holding the tweets.
    TEXT_COL: name of the text column (default 'tweet').
    LABEL_COL: name of the label column; when None, every example gets the
      placeholder label 0 (prediction-only use).

  Returns:
    A TweetDataset with 'input_ids', 'attention_mask' and the labels.
  """
  inputs = {
          "input_ids":[],
          "attention_mask":[]
        }

  sents = dataframe[TEXT_COL].values.tolist()
  for sent in sents:
    tokenized_input = tokenizer(sent,max_length=MAX_LENGTH, padding='max_length', truncation = True)
    inputs["input_ids"].append(torch.tensor(tokenized_input["input_ids"]))
    inputs["attention_mask"].append(torch.tensor(tokenized_input["attention_mask"]))

  if LABEL_COL is None:
    # No gold labels are needed for prediction; use zeros as placeholders.
    labels = torch.tensor([0]*dataframe.shape[0])
  else:
    labels = torch.tensor(dataframe[LABEL_COL].values.tolist())

  return TweetDataset(inputs, labels)

test_dataset = create_dataset(df)

In [None]:
# Compute predictions using Trainer
from transformers import Trainer, TrainingArguments
import numpy as np

output_dir = "./prediction"

# Inference-only configuration: no training, prediction enabled, eval batches of 16.
test_args = TrainingArguments(
    output_dir=output_dir,
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=16,
)
test_trainer = Trainer(model=model, args=test_args)
output = test_trainer.predict(test_dataset)

# save prediction result
#np.save('./combined_new_dataset_finetuned_davidson_prediction.npy', output.predictions) # save

In [None]:
import torch.nn.functional as F
# Softmax over the last axis turns the raw predictions into per-class probabilities.
probabilities = F.softmax(torch.from_numpy(output.predictions), dim=-1)
# Predicted class per row: index of the largest raw prediction.
pred_labels = np.argmax(output.predictions, axis=1)

In [None]:
# Per-row maximum probability; the result exposes `.values`, which the next cell sorts.
high_prob = torch.max(probabilities, dim = 1)
print(high_prob)

In [None]:
# Sort the per-row maxima in descending order; `index` holds the original row of each sorted value.
# NOTE(review): the name `sorted` shadows the Python builtin for the rest of the session.
# generate_hatespeech_query reads this name, so renaming it requires changing both places.
sorted, index = high_prob.values.sort(descending=True)
print(sorted, index) # we know the index of the tweets that have high prob

In [None]:
# query generation
def generate_hatespeech_query(index, probs=None, texts=None, labels=None, max_tweets=10):
  """Concatenate the most confidently predicted class-1 tweets into one query string.

  Args:
    index: 1-D tensor of tweet positions, ordered by descending confidence.
    probs: tensor of confidences in the same (sorted) order as `index`;
      defaults to the notebook-level `sorted` tensor.
    texts: sequence of tweet strings; defaults to df['tweet'].
    labels: predicted class per tweet position; defaults to the notebook-level
      `pred_labels`.
    max_tweets: maximum number of tweets to put into the query (default 10).

  Returns:
    The selected tweets with every token starting with '@' removed, each tweet
    followed by a single space.
  """
  index_val = index.numpy().tolist()
  # `sorted` here is the notebook-level tensor of sorted confidences, not the builtin.
  sorted_val = (sorted if probs is None else probs).numpy().tolist()
  test_dataset_text = df['tweet'].values.tolist() if texts is None else list(texts)
  predicted = pred_labels if labels is None else labels

  hate_speech_query = ""
  cnt = 0
  # index_val[rank] is a tweet position and sorted_val[rank] is that tweet's
  # confidence: both come from the same sort, so the confidence is looked up
  # by rank, not by tweet position.
  for rank, i in enumerate(index_val):
    if cnt >= max_tweets:
      break
    if predicted[i] != 1:
      continue

    print(test_dataset_text[i], predicted[i], sorted_val[rank])
    processed_tweet = " ".join(tok for tok in test_dataset_text[i].split() if not tok.startswith('@'))
    print(processed_tweet)
    hate_speech_query += processed_tweet + " "
    cnt += 1

  return hate_speech_query

# Build the query string from the tweets ranked by `index` (most confident first).
improved_query = generate_hatespeech_query(index)


In [None]:
# Persist the generated query. The context manager closes the file even if the
# write fails, and the explicit encoding keeps non-ASCII tweet text intact
# regardless of the platform default.
with open("./combined_1_1_hatespeech_query", "w", encoding="utf-8") as f:
    f.write(improved_query)

In [None]:
# get the gold labels of the test dataset
gold_labels = []
for x in df['label'].values.tolist():
  if x == 0 or x == 1:
    gold_labels.append(x)
  else:
    gold_labels.append(1)

In [None]:
# get precision and recall score
from sklearn.metrics import classification_report
# Print the classification report comparing the predictions with the gold labels.
print(classification_report(gold_labels, pred_labels))