In [None]:
!pip install multimodal-transformers
!pip install --upgrade accelerate
!pip install transformers accelerate

In [None]:
!pip list

In [2]:
!git clone https://huggingface.co/xlm-roberta-large

Cloning into 'xlm-roberta-large'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 50 (delta 2), reused 0 (delta 0), pack-reused 40[K
Unpacking objects: 100% (50/50), 5.86 MiB | 3.50 MiB/s, done.
Filtering content: 100% (6/6), 11.39 GiB | 153.61 MiB/s, done.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from dataclasses import dataclass, field
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)

import sklearn.metrics as metrics
from sklearn import metrics
from math import sqrt


# from scipy.special import softmax
# from sklearn.metrics import (
#     f1_score,
#     matthews_corrcoef
# )

from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import RobertaWithTabular
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics
from sklearn import metrics
from math import sqrt

logging.basicConfig(level=logging.INFO)
os.environ['COMET_MODE'] = 'DISABLED'

In [5]:
DATA_PATH = '/content/drive/MyDrive/CompLex - Multimodal/corpus/Complex.xlsx'
# DATA_PATH = "corpus/Complex.xlsx"

In [6]:
df = pd.read_excel(DATA_PATH)
#df

In [None]:
df

In [None]:
#label_encoder = LabelEncoder()
#id2label = list(set(df['complexity'].to_numpy()))
#df['complexity'] = label_encoder.fit_transform(df['complexity'])

In [7]:
df['sentence'] = df['sentence'].str.replace('"', '').str.replace("'", '')

In [8]:
df['sentence_token'] = df.apply(
      lambda x: str(x['sentence']).lower()+ ' </s> ' + str(x['token']).lower(),
      axis=1)

In [None]:
df['sentence_token'].values

In [None]:
df

In [10]:
train_df, val_df, test_df = np.split(df.sample(frac=1), [int(.8*len(df)), int(.9 * len(df))])
print('Num ejemplos train-val-test')
print(len(train_df), len(val_df), len(test_df))
train_df.to_csv('train.csv')
val_df.to_csv('val.csv')
test_df.to_csv('test.csv')

Num ejemplos train-val-test
5971 746 747


In [None]:
train_df

In [None]:
#id2label

In [11]:
@dataclass
class MultimodalDataTrainingArguments:
  """
  Arguments pertaining to how we combine tabular features
  Using `HfArgumentParser` we can turn this class
  into argparse arguments to be able to specify them on
  the command line.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  column_info_path: str = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: dict = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='none',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="regression",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  use_simple_classifier: str = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

In [12]:
model_name = 'xlm-roberta-large'

column_info_dict = {
    'text_cols': ['corpus','sentence_token'],
    'num_cols': ['abs_frecuency','rel_frecuency','length','number_syllables','token_possition','number_token_sentences','number_synonyms','number_hyponyms','number_hypernyms','Part_of_speech','freq_relative_word_before','freq_relative_word_after','len_word_before','len_word_after','mtld_diversity','propn','aux','verb','adp','noun','nn','sym','num'],
    'label_col': ['complexity']
}

data_args = MultimodalDataTrainingArguments(
    data_path='.',
    combine_feat_method='attention_on_cat_and_numerical_feats',
    column_info=column_info_dict,
    task='regression',
)

training_args = TrainingArguments(
    output_dir="./logs/model_name",
    logging_dir="./logs/runs",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=32,
    num_train_epochs=50,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    logging_steps=16,
    eval_steps=5
)

set_seed(training_args.seed)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
print('Specified tokenizer: ', model_name)

Specified tokenizer:  xlm-roberta-large


In [14]:
# Get Datasets
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info['text_cols'],
    tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list = None,
    categorical_cols = None,
    numerical_transformer_method = 'yeo_johnson',
    numerical_cols=data_args.column_info['num_cols'],
    sep_text_token_str='</s>',
    categorical_encode_type = None
)

  loglike = -n_samples / 2 * np.log(x_trans.var())


In [None]:
# num_labels = len(np.unique(train_dataset.labels))
# num_labels

In [15]:
config = AutoConfig.from_pretrained(model_name)
tabular_config = TabularConfig(num_labels=1,
                               #cat_feat_dim=train_dataset.cat_feats.shape[1],
                               numerical_feat_dim=train_dataset.numerical_feats.shape[1],
                               **vars(data_args))
config.tabular_config = tabular_config

In [16]:
model = RobertaWithTabular.from_pretrained(
        model_name,
        config=config
    )

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing RobertaWithTabular: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaWithTabular were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.weight', 'tabular_combiner.bias', 'tabular_combiner.num_bn.bias', 'tabular_combiner.weight_

In [17]:
data_args.combine_feat_method

'attention_on_cat_and_numerical_feats'

In [None]:
# def calc_classification_metrics(p: EvalPrediction):
#   pred_labels = np.argmax(p.predictions[0], axis=1)
#   pred_scores = softmax(p.predictions[0], axis=1)[:, 1]
#   labels = p.label_ids

#   acc = (pred_labels == labels).mean()
#   f1 = f1_score(y_true=labels, y_pred=pred_labels, average='micro')
#   result = {
#       "acc": acc,
#       "f1": f1,
#       "acc_and_f1": (acc + f1) / 2,
#       "mcc": matthews_corrcoef(labels, pred_labels)
#   }

#   return result

In [18]:
def calc_regression_metrics(p: EvalPrediction):
    predictions = p.predictions[0]
    preds = np.squeeze(predictions)
    labels = np.squeeze(p.label_ids)
    mse = metrics.mean_squared_error(labels, preds)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(labels, preds)
    return {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        'R2': metrics.r2_score(labels, preds)
    }

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_regression_metrics
)

In [20]:
%%time
trainer.train()



Epoch,Training Loss,Validation Loss,Mae,Mse,Rmse,R2
1,0.0264,0.015991,0.099128,0.015991,0.126455,0.137177
2,0.0182,0.015002,0.094911,0.015002,0.122482,0.190543
3,0.0158,0.016349,0.093753,0.016349,0.127862,0.117867
4,0.0157,0.015855,0.099691,0.015855,0.125915,0.144523
5,0.0161,0.015122,0.092614,0.015122,0.122973,0.184041
6,0.0163,0.014934,0.092063,0.014933,0.122203,0.194226
7,0.0156,0.016842,0.104608,0.016842,0.129776,0.091257
8,0.015,0.014853,0.092389,0.014853,0.121874,0.198556
9,0.0153,0.018301,0.111409,0.018301,0.13528,0.012545
10,0.0151,0.014717,0.092332,0.014717,0.121315,0.205895


CPU times: user 2h 49min 30s, sys: 49min 46s, total: 3h 39min 17s
Wall time: 3h 40min 54s


TrainOutput(global_step=9350, training_loss=0.014941449165344239, metrics={'train_runtime': 13254.6252, 'train_samples_per_second': 22.524, 'train_steps_per_second': 0.705, 'total_flos': 1.7614671513173072e+17, 'train_loss': 0.014941449165344239, 'epoch': 50.0})

In [21]:
OUTPUT_PATH = '/content/drive/MyDrive/CompLex - Multimodal/results/' + model_name.split('/')[-1] + '-Multimodal-' + data_args.combine_feat_method.split('/')[-1]
trainer.save_model(OUTPUT_PATH)

**EVALUAMOS EL MODELO**

In [22]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.014011547900736332,
 'eval_MAE': 0.0909172170920975,
 'eval_MSE': 0.01401154776059342,
 'eval_RMSE': 0.1183703837984545,
 'eval_R2': 0.2319131179651306,
 'eval_runtime': 5.3818,
 'eval_samples_per_second': 138.801,
 'eval_steps_per_second': 17.466,
 'epoch': 50.0}

**Generamos Predicciones**

In [None]:
# predictions = trainer.predict(test_dataset)

# # Obtener las predicciones y las etiquetas reales del objeto de predicción
# pred_labels = predictions.predictions[0]
# true_labels = predictions.label_ids

# def calc_regression_metrics(predictions, labels):
#     preds = np.squeeze(predictions)
#     labels = np.squeeze(labels)
#     mse = metrics.mean_squared_error(labels, preds)
#     rmse = sqrt(mse)
#     mae = metrics.mean_absolute_error(labels, preds)
#     return {
#         "mse": mse,
#         "rmse": rmse,
#         "mae": mae,
#     }

# # Evaluar el rendimiento del modelo en las predicciones
# evaluation = calc_regression_metrics(pred_labels, true_labels)

# # Imprimir los resultados de la evaluación
# print("MSE:", evaluation["mse"])
# print("RMSE:", evaluation["rmse"])
# print("MAE:", evaluation["mae"])

MSE: 0.04903765554811612
RMSE: 0.22144447509052043
MAE: 0.18595963293479548
