In [1]:
pip install multimodal-transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)
from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

logging.basicConfig(level=logging.INFO)
os.environ['COMET_MODE'] = 'DISABLED'

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

if torch.cuda.is_available():    
    print("GPU Available, using GPU \n")
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

GPU Available, using GPU 



In [4]:
import pandas as pd

dataset_dir_path = 'intentconan.csv'
columns = ["CounterSpeech", "Intent", "Length"]
data_df = pd.read_csv(dataset_dir_path, usecols=columns)
# data_df['Aggressive'] = data_df['Aggressive'].apply(lambda x: x-1)
# data_df['Relevance'] = data_df['Relevance'].apply(lambda x: x-1)
# data_df['Complexity'] = data_df['Complexity'].apply(lambda x: x-1)
data_df['Length'] = data_df['Length'].apply(lambda x: x-1)
# data_df['Suggest'] = data_df['Suggest'].apply(lambda x: x-1)
# data_df.drop(columns=["Annotator"], inplace=True)
data_df.dropna(inplace=True)

data_df.head(100)

Unnamed: 0,Intent,CounterSpeech,Length
0,Informative,The US is the second most polluting country in...,1.0
1,Questioning,Doesn't everyone on the planet have a responsi...,1.0
2,Denouncing,The world would be a better place if people we...,0.0
3,Positive,"You're right, ocean pollution is one of the ma...",1.0
4,Informative,Anti-semitism is a serious problem that we nee...,1.0
...,...,...,...
95,Positive,These groups tend to be a minority in Britian....,1.0
96,Informative,There are more than enough empty homes in this...,2.0
97,Questioning,"Is it due to immigration, or the lack of affor...",1.0
98,Denouncing,There are so many empty homes in our country t...,2.0


In [5]:
data_df.isnull().sum().sum()

0

In [6]:
# train_df['Aggressive'].value_counts()
# data_df['Relevance'].value_counts()

In [7]:
counterspeech = data_df['CounterSpeech'].values
# hatespeech = data_df['HateSpeech'].values
# rel_labels = data_df['Relevance'].values
# print(len(rel_labels), len(counterspeech), len(hatespeech))

In [8]:
train_df, val_df, test_df = np.split(data_df.sample(frac=1), [int(.8*len(data_df)), int(.9*len(data_df))])
print('Num examples train-val-test')
print(len(train_df), len(val_df), len(test_df))
train_df.to_csv('train.csv')
val_df.to_csv('val.csv')
test_df.to_csv('test.csv')

# from google.colab import files

# files.download('train.csv')
# files.download('val.csv')
# files.download('test.csv')

Num examples train-val-test
4894 612 612


In [9]:
@dataclass
class ModelArguments:
  """
  Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
  """

  model_name_or_path: str = field(
      metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
  )
  config_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
  )
  tokenizer_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
  )
  cache_dir: Optional[str] = field(
      default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
  )


@dataclass
class MultimodalDataTrainingArguments:
  """
  Arguments pertaining to how we combine tabular features
  Using `HfArgumentParser` we can turn this class
  into argparse arguments to be able to specify them on
  the command line.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  column_info_path: str = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: dict = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='ohe',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="classification",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  use_simple_classifier: str = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

  def __post_init__(self):
      assert self.column_info != self.column_info_path
      if self.column_info is None and self.column_info_path:
          with open(self.column_info_path, 'r') as f:
              self.column_info = json.load(f)

In [10]:
text_cols = ['CounterSpeech','Intent']
# cat_cols = ['Clothing ID', 'Division Name', 'Department Name', 'Class Name']
# numerical_cols = ['Relevance', 'Aggresive', 'Complexity', 'Length']

column_info_dict = {
    'text_cols': text_cols,
    # 'num_cols': numerical_cols,
    'label_col': 'Length',
    'label_list': [0.0,1.0,2.0]
}


model_args = ModelArguments(
    model_name_or_path='bert-base-uncased'
)

data_args = MultimodalDataTrainingArguments(
    data_path='.',
    column_info=column_info_dict,
    task='classification'
)

training_args = TrainingArguments(
    output_dir="content/codes/logs/model_name",
    logging_dir="content/codes/logs/runs",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    evaluation_strategy = "steps",
    logging_steps=25,
)

set_seed(training_args.seed)

In [11]:
tokenizer_path_or_name = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
)

Specified tokenizer:  bert-base-uncased


In [12]:
data_df.columns

Index(['Intent', 'CounterSpeech', 'Length'], dtype='object')

In [13]:
# Get Datasets
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    folder_path = data_args.data_path,
    text_cols = data_args.column_info['text_cols'],
    tokenizer = tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list=data_args.column_info['label_list'],
    categorical_cols=None,
    numerical_cols=None,
    categorical_encode_type=None,
    numerical_transformer_method='none',
    sep_text_token_str=tokenizer.sep_token,
)

INFO:multimodal_transformers.data.data_utils:0 categorical columns
INFO:multimodal_transformers.data.data_utils:0 numerical columns
INFO:multimodal_transformers.data.load_data:Text columns: ['Intent', 'CounterSpeech']
INFO:multimodal_transformers.data.load_data:Raw text example: Questioning [SEP] Would you rather live under the fear of your beliefs or in a real democracy?
INFO:multimodal_transformers.data.data_utils:0 categorical columns
INFO:multimodal_transformers.data.data_utils:0 numerical columns
INFO:multimodal_transformers.data.load_data:Text columns: ['Intent', 'CounterSpeech']
INFO:multimodal_transformers.data.load_data:Raw text example: Informative [SEP] Migrants are humans, and are entitled to the same rights as others, such as to ask for hospitality.
INFO:multimodal_transformers.data.data_utils:0 categorical columns
INFO:multimodal_transformers.data.data_utils:0 numerical columns
INFO:multimodal_transformers.data.load_data:Text columns: ['Intent', 'CounterSpeech']
INFO:mult

In [14]:
num_labels = len(np.unique(train_dataset.labels))
num_labels

4

In [15]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
tabular_config = TabularConfig(num_labels=num_labels,
                            #    numerical_feat_dim=train_dataset.numerical_feats.shape[1],
                               **vars(data_args))
config.tabular_config = tabular_config

In [16]:
model = AutoModelWithTabular.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertWithTabular: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['tabular_

In [17]:
import numpy as np
from scipy.special import softmax
from scipy import stats
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)

def calc_classification_metrics(p: EvalPrediction):
  pred_labels = np.argmax(p.predictions[0], axis=1)
  # print(pred_labels)
  pred_scores = softmax(p.predictions[0], axis=1)
  print(pred_scores)
  labels = p.label_ids
  labels = [int(x) for x in labels]
  p_corel = stats.pearsonr(pred_labels, labels) 
  acc = (pred_labels == labels).mean()
  f1 = f1_score(y_true=labels, y_pred=pred_labels, average="macro")
  result = {
      "acc": acc,
      "f1": f1,
      "avg": (acc + f1) / 2,
      "mcc": matthews_corrcoef(labels, pred_labels),
      "Pearson correlation" : p_corel
  }

  return result

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_classification_metrics,
)

In [19]:
%%time
trainer.train()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjapneet50[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss


../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [6,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./logs/runs --port=6006