In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install multimodal-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting multimodal-transformers
  Downloading multimodal_transformers-0.2a0-py3-none-any.whl (22 kB)
Collecting networkx~=2.6.3
  Downloading networkx-2.6.3-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy~=1.21.6
  Downloading numpy-1.21.6-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm~=4.64.1
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy~=1.7.3
  Downloading scipy-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.8 MB)
[2K     [90m━━━━━━━━━━━━━

In [4]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)
from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

logging.basicConfig(level=logging.INFO)
os.environ['COMET_MODE'] = 'DISABLED'

In [5]:
import torch

if torch.cuda.is_available():    
    print("GPU Available, using GPU \n")
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

GPU Available, using GPU 



In [6]:
import pandas as pd

dataset_dir_path = 'drive/MyDrive/BTP/intentconan.csv'
columns = ["CounterSpeech", "HateSpeech" , "Length"]
data_df = pd.read_csv(dataset_dir_path , usecols = columns)
# train_df['Aggressive'] = train_df['Aggressive'].apply(lambda x: x-1)
data_df['Length'] = data_df['Length'].apply(lambda x: x-1)
data_df.dropna(inplace=True)
# data_df['Relevance'] = data_df['Relevance'].apply(lambda x: 1.0 if x == 2.0 or x == 3.0 else (2.0 if x == 4.0 or x == 5.0 else 0.0))
data_df.head(100)

Unnamed: 0,HateSpeech,CounterSpeech,Length
0,Maybe the UN could talk to those asian and afr...,The US is the second most polluting country in...,1.0
1,Maybe the UN could talk to those asian and afr...,Doesn't everyone on the planet have a responsi...,1.0
2,Maybe the UN could talk to those asian and afr...,The world would be a better place if people we...,0.0
3,Maybe the UN could talk to those asian and afr...,"You're right, ocean pollution is one of the ma...",1.0
4,‘Juice (Jews) are worse that nukes.’ Caption t...,Anti-semitism is a serious problem that we nee...,1.0
...,...,...,...
95,Pakistan: 202 Million Nigeria: 193 Million Ban...,These groups tend to be a minority in Britian....,1.0
96,"Took this in Wanstead, east London. An area wi...",There are more than enough empty homes in this...,2.0
97,"Took this in Wanstead, east London. An area wi...","Is it due to immigration, or the lack of affor...",1.0
98,"Took this in Wanstead, east London. An area wi...",There are so many empty homes in our country t...,2.0


In [7]:
data_df.isnull().sum().sum()

0

In [8]:
# train_df['Aggressive'].value_counts()
# data_df['Relevance'].value_counts()

In [9]:
counterspeech = data_df['CounterSpeech'].values
hatespeech = data_df['HateSpeech'].values
# rel_labels = data_df['Relevance'].values
# print(len(rel_labels), len(counterspeech), len(hatespeech))

In [10]:
train_df, val_df, test_df = np.split(data_df.sample(frac=1), [int(.8*len(data_df)), int(.9 * len(data_df))])
print('Num examples train-val-test')
print(len(train_df), len(val_df), len(test_df))
train_df.to_csv('drive/MyDrive/BTP/train.csv')
val_df.to_csv('drive/MyDrive/BTP/val.csv')
test_df.to_csv('drive/MyDrive/BTP/test.csv')

from google.colab import files

# files.download('train.csv')
# files.download('val.csv')
# files.download('test.csv')

Num examples train-val-test
4894 612 612


In [11]:
@dataclass
class ModelArguments:
  """
  Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
  """

  model_name_or_path: str = field(
      metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
  )
  config_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
  )
  tokenizer_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
  )
  cache_dir: Optional[str] = field(
      default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
  )


@dataclass
class MultimodalDataTrainingArguments:
  """
  Arguments pertaining to how we combine tabular features
  Using `HfArgumentParser` we can turn this class
  into argparse arguments to be able to specify them on
  the command line.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  column_info_path: str = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: dict = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='ohe',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="classification",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  use_simple_classifier: str = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

  def __post_init__(self):
      assert self.column_info != self.column_info_path
      if self.column_info is None and self.column_info_path:
          with open(self.column_info_path, 'r') as f:
              self.column_info = json.load(f)

In [12]:
text_cols = ['HateSpeech', 'CounterSpeech']
# cat_cols = ['Clothing ID', 'Division Name', 'Department Name', 'Class Name']
# numerical_cols = ['Relevance']

column_info_dict = {
    'text_cols': text_cols,
    'label_col': 'Length',
    'label_list': [0.0,1.0,2.0]
}


model_args = ModelArguments(
    model_name_or_path='bert-base-uncased'
)

data_args = MultimodalDataTrainingArguments(
    data_path='drive/MyDrive/BTP/',
    column_info=column_info_dict,
    task='classification'
)

training_args = TrainingArguments(
    output_dir="drive/MyDrive/BTP/logs/model_name",
    logging_dir="drive/MyDrive/BTP/logs/runs",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    evaluation_strategy = "steps",
    logging_steps=25,
)

set_seed(training_args.seed)

In [13]:
tokenizer_path_or_name = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
)

Specified tokenizer:  bert-base-uncased


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
data_df.columns

Index(['HateSpeech', 'CounterSpeech', 'Length'], dtype='object')

In [15]:
# Get Datasets
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    folder_path = data_args.data_path,
    text_cols = data_args.column_info['text_cols'],
    tokenizer = tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list=data_args.column_info['label_list'],
    categorical_cols=None,
    numerical_cols=None,
    categorical_encode_type=None,
    numerical_transformer_method='none',
    sep_text_token_str=tokenizer.sep_token,
)

In [16]:
num_labels = len(np.unique(train_dataset.labels))
num_labels

5

In [17]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
tabular_config = TabularConfig(num_labels=num_labels,
                               **vars(data_args))
config.tabular_config = tabular_config

In [18]:
model = AutoModelWithTabular.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir
    )

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertWithTabular: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['tabular_

In [19]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)

def calc_classification_metrics(p: EvalPrediction):
  # print(p.predictions[0])
  # print(len(p.predictions[0]))
  # print('-0-------------------')
  # print(p.predictions[1], type(p.predictions[0]), type(p.predictions[1]), len(p.predictions), type(p.predictions))

  # for i in range(len(p.predictions)):
  #   p.predictions[i] = torch.from_numpy(np_array)
  # pred2 = torch.tensor(p.predictions)
  # pred2 = [torch.from_numpy(item).float() for item in p.predictions]
  pred_labels = np.argmax(p.predictions[0], axis=1)
  # print(pred_labels)
  pred_scores = softmax(p.predictions[0], axis=1)
  print(pred_scores)
  labels = p.label_ids
  labels = [int(x) for x in labels]
  # print('p.label_ids \n', p.label_ids)
  # if len(np.unique(labels)) == 5: 
  #     # roc_auc_pred_score = roc_auc_score(labels, pred_scores, multi_class='ovo')
  #     precisions, recalls, thresholds = precision_recall_curve(labels,
  #                                                               pred_scores)
  #     fscore = (2 * precisions * recalls) / (precisions + recalls)
  #     fscore[np.isnan(fscore)] = 0
  #     ix = np.argmax(fscore)
  #     threshold = thresholds[ix].item()
  #     # pr_auc = auc(recalls, precisions)
  #     tn, fp, fn, tp = confusion_matrix(labels, pred_labels, labels=[0.0, 1.0, 2.0, 3.0, 4.0]).ravel()
  #     result = {
  #         'threshold': threshold,
  #               'recall': recalls[ix].item(),
  #               'precision': precisions[ix].item(), 'f1': fscore[ix].item(),
  #               'tn': tn.item(), 'fp': fp.item(), 'fn': fn.item(), 'tp': tp.item()
  #               }
  # else:
  acc = (pred_labels == labels).mean()
  f1 = f1_score(y_true=labels, y_pred=pred_labels, average="macro")
  result = {
      "acc": acc,
      "f1": f1,
      "acc_and_f1": (acc + f1) / 2,
      "mcc": matthews_corrcoef(labels, pred_labels)
  }

  return result

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_classification_metrics,
)

In [21]:
%%time
trainer.train()



Step,Training Loss,Validation Loss,Acc,F1,Acc And F1,Mcc
25,0.774,0.619308,0.779412,0.30398,0.541696,0.077203


[[0.15136507 0.80955225 0.02275084 0.00730105 0.00903071]
 [0.12684508 0.8476517  0.01617693 0.00398277 0.00534335]
 [0.1043324  0.86896044 0.01681679 0.00408836 0.00580192]
 ...
 [0.27916595 0.63994724 0.03681823 0.02371411 0.02035454]
 [0.25706556 0.6788743  0.03049039 0.01722822 0.01634142]
 [0.07171448 0.8949645  0.02167799 0.00527036 0.00637266]]


KeyboardInterrupt: ignored

In [22]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [23]:
%tensorboard --logdir ./logs/runs --port=6006

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/tensorboard/compat/__init__.py", line 42, in tf
    from tensorboard.compat import notf  # noqa: F401
ImportError: cannot import name 'notf' from 'tensorboard.compat' (/usr/local/lib/python3.9/dist-packages/tensorboard/compat/__init__.py)

During handling of the above exception, another exception occurred:

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/tensorboard/compat/__init__.py", line 42, in tf
    from tensorboard.compat import notf  # noqa: F401
ImportError: cannot import name 'notf' from 'tensorboard.compat' (/usr/local/lib/python3.9/dist-packages/tensorboard/compat/__init__.py)

During handling of the above exception, another exception occurred:

RuntimeError: module compiled against API version 0xf but 