In [None]:
!pip install multimodal-transformers
!pip install --upgrade accelerate
!pip install transformers accelerate

In [2]:
# Clone the roberta-base repository from the Hugging Face Hub into ./roberta-base
# (the clone output reports several GiB of downloaded content).
# NOTE(review): later cells load the model by the name 'roberta-base'; presumably
# that resolves to this local directory — confirm this clone is actually required.
!git clone https://huggingface.co/roberta-base

Cloning into 'roberta-base'...
remote: Enumerating objects: 81, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 78[K
Unpacking objects: 100% (81/81), 1.63 MiB | 6.12 MiB/s, done.
Filtering content: 100% (5/5), 2.61 GiB | 129.53 MiB/s, done.


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path and the results path used
# by later cells both live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from dataclasses import dataclass, field
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)

import sklearn.metrics as metrics
from sklearn import metrics
from math import sqrt

from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import RobertaWithTabular
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics
from sklearn import metrics
from math import sqrt

# Process-wide setup, run once before any model or trainer is created.
# NOTE(review): COMET_MODE=DISABLED presumably switches off the Comet experiment
# tracker integration — confirm against the Comet/transformers documentation.
os.environ.update(COMET_MODE='DISABLED')
# Emit INFO-level (and above) log records from the root logger.
logging.basicConfig(level=logging.INFO)

In [5]:
# Location of the Excel corpus on the mounted Google Drive; it is read with
# pd.read_excel in the following cell.
DATA_PATH = '/content/drive/MyDrive/Bert_complex/corpus/Complex.xlsx'
# Alternative input kept for reference (disabled). NOTE(review): a .tsv file would
# not be readable by pd.read_excel — the loader would need to change as well.
#DATA_PATH = 'Prueba.tsv'

In [6]:
# Load the whole corpus into one DataFrame. Later cells read the 'sentence' and
# 'token' columns from it, plus the feature and label columns listed in
# column_info_dict.
df = pd.read_excel(DATA_PATH)

In [7]:
# Remove every double-quote and single-quote character from the sentences.
# regex=False states explicitly that the patterns are literal characters, so the
# result does not depend on the pandas-version-specific default of `regex`.
df['sentence'] = (
    df['sentence']
    .str.replace('"', '', regex=False)
    .str.replace("'", '', regex=False)
)

In [8]:
# Build the text input for the model: "<lower-cased sentence> </s> <lower-cased token>".
# Column-wise string operations replace the original row-by-row apply(axis=1),
# which constructed a Series object for every row.
# .map(str) mirrors the original str(value) conversion element by element, so
# non-string values (including missing ones) are rendered exactly as before.
df['sentence_token'] = (
    df['sentence'].map(str).str.lower()
    + ' </s> '
    + df['token'].map(str).str.lower()
)

In [9]:
# Shuffle the corpus once and cut it 80 % / 10 % / 10 % into train / val / test.
# The shuffle is seeded explicitly: set_seed() is only called in a later cell, so
# without random_state every run would produce a different split.
SPLIT_SEED = 42  # arbitrary fixed value; change it to draw a different split
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)

# Same cut points as before, expressed as positional slices instead of
# np.split(DataFrame), which relies on the deprecated DataFrame.swapaxes.
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

print('Num ejemplos train-val-test')
print(len(train_df), len(val_df), len(test_df))

# Written to the working directory (index included), which is the folder later
# passed to load_data_from_folder via data_path='.'.
train_df.to_csv('train.csv')
val_df.to_csv('val.csv')
test_df.to_csv('test.csv')

Num ejemplos train-val-test
5971 746 747


In [10]:
@dataclass
class MultimodalDataTrainingArguments:
  """
  Settings describing the tabular data and how it is combined with text.

  Each field carries a `help` string (and, where relevant, `choices`) in its
  metadata so that `HfArgumentParser` can turn this class into argparse
  arguments that may be specified on the command line.

  `data_path` is the only required field; every other field has a default.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  # Optional: defaults to None when the column description is given inline
  # through `column_info` instead of a json file.
  column_info_path: Optional[str] = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: Optional[dict] = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns; '
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='none',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="regression",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  # Boolean flag (the default is False); annotated as bool so argument parsers
  # built from the annotations treat it as a flag rather than free text.
  use_simple_classifier: bool = field(default=False,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

In [11]:
# Checkpoint name used for the tokenizer, the config and the model weights.
model_name = 'roberta-base'

# Role of each dataset column: free text, numerical features, regression target.
# NOTE(review): 'Part_of_speech' is listed as numerical — confirm it is numerically
# encoded in the spreadsheet and not a categorical label.
column_info_dict = {
    'text_cols': [
        'corpus',
        'sentence_token',
    ],
    'num_cols': [
        'abs_frecuency',
        'rel_frecuency',
        'length',
        'number_syllables',
        'token_possition',
        'number_token_sentences',
        'number_synonyms',
        'number_hyponyms',
        'number_hypernyms',
        'Part_of_speech',
        'freq_relative_word_before',
        'freq_relative_word_after',
        'len_word_before',
        'len_word_after',
        'mtld_diversity',
        'propn',
        'aux',
        'verb',
        'adp',
        'noun',
        'nn',
        'sym',
        'num',
    ],
    'label_col': ['complexity'],
}

# Other combine_feat_method values that were tried in this notebook (disabled):
#   'text_only'
#   'concat'
#   'individual_mlps_on_cat_and_numerical_feats_then_concat'
#   'gating_on_cat_and_num_feats_then_sum'
#   'weighted_feature_sum_on_transformer_cat_and_numerical_feats'
data_args = MultimodalDataTrainingArguments(
    task='regression',
    column_info=column_info_dict,
    combine_feat_method='attention_on_cat_and_numerical_feats',
    data_path='.',  # folder holding train.csv / val.csv / test.csv written earlier
)

# NOTE(review): output_dir contains the literal text "model_name", not the value
# of the model_name variable — confirm that is intended.
# NOTE(review): logging_steps / eval_steps are step counts, while both strategies
# are set to 'epoch'; presumably the step counts have no effect — verify against
# the TrainingArguments documentation.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./logs/model_name",
    overwrite_output_dir=True,
    logging_dir="./logs/runs",
    # what to run
    do_train=True,
    do_eval=True,
    # schedule
    num_train_epochs=100,
    per_device_train_batch_size=32,
    # reporting cadence
    evaluation_strategy='epoch',
    eval_steps=5,
    logging_strategy='epoch',
    logging_steps=16,
)

# Seed the random number generators with the seed stored on training_args.
set_seed(training_args.seed)

In [12]:
# Load the tokenizer that belongs to the checkpoint named in model_name; it is
# handed to load_data_from_folder below to tokenize the text columns.
tokenizer = AutoTokenizer.from_pretrained(model_name)
print('Specified tokenizer: ', model_name)

Specified tokenizer:  roberta-base


In [13]:
# Get Datasets
# Loads the train / val / test splits from the folder in data_args.data_path
# (the CSV files written by the split cell) and returns one dataset per split.
# The numerical transformer is read from data_args instead of being repeated as a
# literal, so preprocessing cannot drift from the value that is later forwarded
# to TabularConfig through **vars(data_args).
# NOTE(review): label_col is passed as a one-element list (['complexity']);
# confirm load_data_from_folder accepts a list here.
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info['text_cols'],
    tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list=None,
    # No categorical features are used in this experiment.
    categorical_cols=None,
    numerical_transformer_method=data_args.numerical_transformer_method,
    numerical_cols=data_args.column_info['num_cols'],
    # Same separator text that is embedded when 'sentence_token' is built.
    sep_text_token_str=' </s> ',
    categorical_encode_type=None
)

  loglike = -n_samples / 2 * np.log(x_trans.var())


In [14]:
# Start from the stock configuration of the chosen checkpoint.
config = AutoConfig.from_pretrained(model_name)

# Width of the numerical feature matrix produced for the training split.
num_numerical_features = train_dataset.numerical_feats.shape[1]

# One output (num_labels=1) for the regression target. No categorical feature
# dimension is supplied because no categorical columns were loaded. Every field of
# data_args is forwarded as a keyword argument via vars().
tabular_config = TabularConfig(
    num_labels=1,
    numerical_feat_dim=num_numerical_features,
    **vars(data_args),
)

# Attach the tabular settings to the transformer config handed to the model.
config.tabular_config = tabular_config

In [15]:
# Instantiate RoBERTa with the tabular-feature head configured in
# config.tabular_config, starting from the pretrained checkpoint weights.
# The load log below reports that the language-model head weights are unused and
# that the classifier / tabular layers are newly initialized, i.e. they are
# trained from scratch during fine-tuning.
model = RobertaWithTabular.from_pretrained(
        model_name,
        config=config
    )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaWithTabular: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaWithTabular were not initialized from the model checkpoint at roberta-base and are newly initialized: ['tabular_classifier.bn.1.running_mean', 'classifier.out_proj.weight', 'tabular_classifier.layers.1.weight', 'tabular_combiner.num_bn.weight', 'tabular_classifier.bn.2.running_mean

In [16]:
def calc_regression_metrics(p: EvalPrediction):
    """Compute regression metrics for the Trainer's evaluation loop.

    Args:
        p: evaluation output exposing `predictions` and `label_ids`.
            `predictions` may be a tuple/list of arrays, in which case the
            first element is taken as the model's predicted values, or a
            single array of predicted values.

    Returns:
        dict with the keys "MAE", "MSE", "RMSE" and "R2".
    """
    raw_predictions = p.predictions
    # Only index into the container when there is one; indexing a bare array
    # with [0] would silently keep just the first sample's prediction.
    if isinstance(raw_predictions, (tuple, list)):
        raw_predictions = raw_predictions[0]

    # Flatten to 1-D. Unlike np.squeeze, ravel keeps a one-sample batch as a
    # length-1 vector instead of collapsing it to a 0-d scalar.
    # Assumes a single regression target per sample (num_labels=1).
    preds = np.ravel(raw_predictions)
    labels = np.ravel(p.label_ids)

    mse = metrics.mean_squared_error(labels, preds)
    return {
        "MAE": metrics.mean_absolute_error(labels, preds),
        "MSE": mse,
        "RMSE": sqrt(mse),
        'R2': metrics.r2_score(labels, preds)
    }

In [17]:
# Wire model, hyper-parameters and data together. The validation split is used
# for the evaluations during training; the test split is kept aside and only
# evaluated in the final cell. calc_regression_metrics supplies the
# MAE / MSE / RMSE / R2 values reported alongside the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_regression_metrics
)

In [18]:
# Display the feature-combination method in use so it is recorded right above
# the training log.
data_args.combine_feat_method

'attention_on_cat_and_numerical_feats'

In [19]:
%%time
# Run fine-tuning; %%time reports the cost of the cell (about 2.5 hours of wall
# time in the recorded run).
trainer.train()



Epoch,Training Loss,Validation Loss,Mae,Mse,Rmse,R2
1,0.0942,0.088537,0.269366,0.088537,0.297551,-4.006981
2,0.0809,0.07582,0.246546,0.07582,0.275355,-3.287842
3,0.07,0.063517,0.222328,0.063517,0.252027,-2.592085
4,0.0606,0.07471,0.245045,0.07471,0.273331,-3.225044
5,0.0519,0.083382,0.257076,0.083382,0.28876,-3.71549
6,0.0433,0.038349,0.168737,0.038349,0.195829,-1.16873
7,0.0348,0.024551,0.133139,0.024551,0.156688,-0.388434
8,0.0298,0.02883,0.146916,0.02883,0.169795,-0.630442
9,0.0247,0.023234,0.128556,0.023234,0.152427,-0.313942
10,0.0204,0.015762,0.103506,0.015762,0.125548,0.108596


CPU times: user 2h 34min 56s, sys: 56.3 s, total: 2h 35min 53s
Wall time: 2h 35min 28s


TrainOutput(global_step=18700, training_loss=0.010169368800632456, metrics={'train_runtime': 9328.5439, 'train_samples_per_second': 64.008, 'train_steps_per_second': 2.005, 'total_flos': 1.1548062831115843e+17, 'train_loss': 0.010169368800632456, 'epoch': 100.0})

In [20]:
# Results folder on Google Drive, named "<model>-Multimodal-<combine method>".
# Only the part after the last '/' of each name is used, so a namespaced
# identifier such as "org/model" does not introduce an extra directory level.
model_tag = model_name.split('/')[-1]
fusion_tag = data_args.combine_feat_method.split('/')[-1]
OUTPUT_PATH = f'/content/drive/MyDrive/Bert_complex/results/{model_tag}-Multimodal-{fusion_tag}'

In [21]:
# Persist the fine-tuned model under OUTPUT_PATH on the mounted Drive.
trainer.save_model(OUTPUT_PATH)

In [22]:
# Final evaluation on the held-out test split; the returned metric names carry an
# 'eval_' prefix (eval_MAE, eval_MSE, eval_RMSE, eval_R2).
trainer.evaluate(test_dataset)

{'eval_loss': 0.008763696067035198,
 'eval_MAE': 0.07236076388233618,
 'eval_MSE': 0.008763696872535316,
 'eval_RMSE': 0.09361461890396881,
 'eval_R2': 0.5238264590790844,
 'eval_runtime': 4.3119,
 'eval_samples_per_second': 173.241,
 'eval_steps_per_second': 21.8,
 'epoch': 100.0}