https://huggingface.co/hackathon-pln-es/electricidad-small-discriminator-finetuned-clasificacion-comentarios-suicidas

https://huggingface.co/mrm8488/electricidad-small-discriminator

https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb

In [7]:
# Instalamos transformers
!pip install transformers

In [8]:
# Importamos las librerías necesarias
import numpy as np
import pandas as pd
import re
import torch
import transformers
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch.utils.data import Dataset
from transformers import (ElectraForSequenceClassification, ElectraTokenizerFast,
                          InputFeatures, Trainer, TrainingArguments)

# Show the installed transformers version. A bare expression is only displayed
# when it is the last statement of a notebook cell, so print it explicitly.
print(transformers.__version__)
# Disable the Weights & Biases integration via its environment switch.
os.environ["WANDB_DISABLED"] = "true"

In [9]:
# Transfer learning: load the pretrained ELECTRA discriminator with a
# two-label sequence-classification head, plus its matching fast tokenizer.
checkpoint = "mrm8488/electricidad-small-discriminator"

model = ElectraForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

tokenizer = ElectraTokenizerFast.from_pretrained(
    checkpoint,
    model_max_length=512,
    do_lower_case=True,
)

In [10]:
# Load the cleaned CSV into `data` and keep a random subsample of it.
data = pd.read_csv("../input/depresi/limpio (1).csv")
# Cap the sample size at the number of available rows (a plain sample(50000)
# raises on smaller files) and fix random_state so that re-running the
# notebook selects the same rows.
data = data.sample(n=min(50000, len(data)), random_state=123)
print(data.shape)
# Class balance of the target column, most frequent class first.
print(data['depresion'].value_counts(sort=True))
# Preview goes last: a notebook only renders the final expression of a cell.
data.head(10)

In [11]:
# Keep only the rows whose text is longer than 15 characters.
long_enough = data['texto'].str.len() > 15
data = data[long_enough]

In [12]:
# Preview the first ten rows of the current frame.
data.head(n=10)

In [13]:
# Encode the boolean target as 0/1 integers.
# `assign` builds a new frame instead of writing a column into a filtered
# slice (which triggers SettingWithCopyWarning), and the explicit astype
# guarantees an integer label dtype regardless of how `replace` infers the
# result dtype. A non-convertible value (e.g. NaN) fails loudly here rather
# than leaking into training as a label.
data = data.assign(
    depresion=data['depresion'].replace({True: 1, False: 0}).astype(int)
)

In [14]:
# Preview the first five rows of the current frame.
data.head(n=5)

In [15]:
# Features (the raw text column) and target (the 'depresion' column).
X, y = data['texto'], data['depresion']


In [16]:
# Hold out 25% of the samples for evaluation; the split is stratified on y
# and uses a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

In [17]:
# Convert the pandas Series of each split into plain Python lists.
training_sentences, training_labels = X_train.to_list(), y_train.to_list()
validation_sentences, validation_labels = X_test.to_list(), y_test.to_list()

In [18]:
# Drop the X and y names; only the train/test splits are used from here on.
del X
del y


In [19]:
class TrainerDataset(Dataset):
    """Dataset that tokenizes every text up front and serves ``InputFeatures``.

    Args:
        inputs: Sequence of raw texts.
        targets: Sequence of labels, indexed in parallel with ``inputs``.
        tokenizer: Callable invoked once as
            ``tokenizer(inputs, padding=True, truncation=True)``; its result
            must be indexable by ``'input_ids'``, ``'token_type_ids'`` and
            ``'attention_mask'``.
    """

    def __init__(self, inputs, targets, tokenizer):
        self.inputs, self.targets, self.tokenizer = inputs, targets, tokenizer

        # The whole corpus is tokenized in a single call at construction time;
        # __getitem__ only indexes into the stored result.
        self.tokenized_inputs = tokenizer(inputs, truncation=True, padding=True)

    def __len__(self):
        """Return the number of input texts."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``InputFeatures`` (encodings plus label) for sample ``idx``."""
        encoded = self.tokenized_inputs
        feature_fields = {
            key: encoded[key][idx]
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        return InputFeatures(label=self.targets[idx], **feature_fields)

In [20]:
# Wrap the training and validation splits in TrainerDataset instances
# (each one tokenizes its texts on construction).
train_dataset = TrainerDataset(
    inputs=training_sentences,
    targets=training_labels,
    tokenizer=tokenizer,
)
eval_dataset = TrainerDataset(
    inputs=validation_sentences,
    targets=validation_labels,
    tokenizer=tokenizer,
)

In [21]:
# Fix the random seeds for reproducibility and configure the training run.
np.random.seed(123)
torch.manual_seed(123)

training_args = TrainingArguments(
    output_dir="/kaggle/working/model_electra",
    num_train_epochs=8,  # train for 8 epochs
    learning_rate=0.00001,  # with a learning rate of 1e-5
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=32,
    # The Trainer reseeds the RNGs from this argument (default 42), which
    # would override the manual seeds above; pass the same seed explicitly.
    seed=123,
    # NOTE(review): this flag is not train-only -- confirm that dropping the
    # last incomplete batch during evaluation is intended as well.
    dataloader_drop_last=True,  # Make sure all batches are of equal size
)


def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the argmax over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result tuple is (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }


# Build the Trainer from the model, the training arguments, both datasets
# and the metrics callback.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [54]:
# Train (fine-tune) the model; as the last expression of the cell, the value
# returned by train() is displayed by the notebook.
trainer.train()

In [None]:
# Run the trainer's evaluation and keep what it returns in model_result;
# the bare name on the last line makes the notebook display it.
model_result = trainer.evaluate()
model_result

In [None]:
# Save the trained model through the trainer (no output path is passed here,
# so the trainer's configured output directory is presumably used).
trainer.save_model()

In [None]:
from transformers import pipeline

In [None]:
# Build a text-classification pipeline from the model saved under the
# training output directory and the tokenizer of the base checkpoint.
model_name = "/kaggle/working/model_electra"
tokenizer_name = 'mrm8488/electricidad-small-discriminator'
cls = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=tokenizer_name,
)
# Try the model on a sample sentence and show only the predicted label.
cls("No puedo más")[0]['label']

In [None]:
# Try another sentence: show the label of the first prediction.
cls("Hoy me encuentro fatal")[0]['label']

In [None]:
# One more test sentence: show the label of the first prediction.
cls("No te olvides de sacar a pasear a tu mascota")[0]['label']