In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, DataCollatorWithPadding, pipeline
from datasets import load_metric, Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the Amazon reviews dataset from its CSV file.
reviews_csv = 'Data/amazon_reviews.csv'
data = pd.read_csv(reviews_csv)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [5]:
# Drop the columns that are not needed for sentiment classification.
#
# This is a non-mutating reassignment that selects columns by name and uses
# errors='ignore', so re-running the cell is harmless. The previous positional,
# in-place drop of data.columns[0] removed 'overall' on a second run and then
# raised KeyError on the already-deleted names.
# NOTE(review): assumes the first CSV column is the unnamed saved index that
# pandas labels 'Unnamed: <n>' - confirm against the file header.
unnamed_index_columns = [name for name in data.columns if str(name).startswith('Unnamed')]
irrelevant_columns = ['reviewerName', 'helpful_yes', 'helpful_no', 'reviewTime', 'day_diff',
                      'score_pos_neg_diff', 'score_average_rating']
data = data.drop(columns=unnamed_index_columns + irrelevant_columns, errors='ignore')

In [6]:
# Discard every row that contains a missing value; the notebook author
# considers those rows a negligible share of the full dataset.
data = data.dropna(axis=0, how='any')

In [7]:
# Derive a sentiment class from the star rating:
# negative (0) for 1-2 stars, neutral (1) for 3 stars, positive (2) for 4-5 stars.

# Boolean masks, one per sentiment class, in the same order as the choices below.
conditions = [
    data['overall'].isin([1, 2]),
    data['overall'] == 3,
    data['overall'].isin([4, 5])
]

# Numeric class ids and their human-readable names (index-aligned).
unique_sentiments = [0, 1, 2]
unique_label_sentiments = ['negative', 'neutral', 'positive']

# np.select uses `default` for rows that match none of the conditions.
# The implicit default is the integer 0, which (a) would silently label an
# unexpected rating as "negative" and (b) cannot be promoted together with
# string choices on NumPy 2, raising TypeError. Explicit, type-matching
# sentinels avoid both problems.
data['sentiment'] = np.select(conditions, unique_sentiments, default=-1)
data['sentiment_label'] = np.select(conditions, unique_label_sentiments, default='unknown')

# Fail early instead of feeding an invalid class id to the model later on.
assert (data['sentiment'] >= 0).all(), "found ratings outside 1-5; cannot assign a sentiment"

data.head(5)

Unnamed: 0,overall,reviewText,total_vote,wilson_lower_bound,sentiment,sentiment_label
0,4.0,No issues.,0,0.0,2,positive
1,5.0,"Purchased this for my device, it worked as adv...",0,0.0,2,positive
2,4.0,it works as expected. I should have sprung for...,0,0.0,2,positive
3,5.0,This think has worked out great.Had a diff. br...,0,0.0,2,positive
4,5.0,"Bought it with Retail Packaging, arrived legit...",0,0.0,2,positive


In [8]:
# Pull each column of interest out of the DataFrame as a plain Python list.
ratings, texts, sentiments, sentiments_labels = (
    data[column_name].tolist()
    for column_name in ('overall', 'reviewText', 'sentiment', 'sentiment_label')
)

In [9]:
# Sanity check: print the entry at index 1 (the second review) of every list.
for column_values in (ratings, texts, sentiments, sentiments_labels):
    print(column_values[1])

5.0
Purchased this for my device, it worked as advertised. You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.
2
positive


In [10]:
# Wrap the review texts and their sentiment ids in a Hugging Face `Dataset`
# with `text` and `label` columns.
reviews_dataset = Dataset.from_dict(
    dict(
        text = texts,
        label = sentiments
    )
)

# Hold out 20% of the reviews for evaluation. The seed is fixed so that the
# split (and therefore every metric reported below) is the same on each run;
# without it the split was re-drawn randomly every time the cell executed.
SPLIT_SEED = 42
data_dict = reviews_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Inspect the first training example.
data_dict['train'][0]

{'text': "I bought the SanDisk Ultra 64 GB for my new Samsung Galaxy S4. Being that it's the faster type of memory I like to think it reads and writes fast. And I feel like it does, but it's in my phone so who knows. Now for your highend DSLR you'll notice a difference and be very pleased.",
 'label': 2}

In [11]:
# Fetch the pretrained fast tokenizer for the DistilBERT checkpoint used below.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples, with truncation.

    Args:
        examples: Mapping that exposes the raw review texts under ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``truncation=True``.
    """
    batch_texts = examples["text"]
    encoded_batch = tokenizer(batch_texts, truncation=True)
    return encoded_batch

In [13]:
# Tokenize both splits; batched=True hands the function many examples per call.
tokenized_data = data_dict.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 3931/3931 [00:00<00:00, 7699.80 examples/s]
Map: 100%|██████████| 983/983 [00:00<00:00, 11987.66 examples/s]


In [14]:
# Display the first tokenized training example (text, label and tokenizer outputs).
tokenized_data['train'][0]

{'text': "I bought the SanDisk Ultra 64 GB for my new Samsung Galaxy S4. Being that it's the faster type of memory I like to think it reads and writes fast. And I feel like it does, but it's in my phone so who knows. Now for your highend DSLR you'll notice a difference and be very pleased.",
 'label': 2,
 'input_ids': [101,
  1045,
  4149,
  1996,
  5472,
  20573,
  11087,
  4185,
  16351,
  2005,
  2026,
  2047,
  19102,
  9088,
  1055,
  2549,
  1012,
  2108,
  2008,
  2009,
  1005,
  1055,
  1996,
  5514,
  2828,
  1997,
  3638,
  1045,
  2066,
  2000,
  2228,
  2009,
  9631,
  1998,
  7009,
  3435,
  1012,
  1998,
  1045,
  2514,
  2066,
  2009,
  2515,
  1010,
  2021,
  2009,
  1005,
  1055,
  1999,
  2026,
  3042,
  2061,
  2040,
  4282,
  1012,
  2085,
  2005,
  2115,
  2152,
  10497,
  16233,
  20974,
  2017,
  1005,
  2222,
  5060,
  1037,
  4489,
  1998,
  2022,
  2200,
  7537,
  1012,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [15]:
# Collator that assembles examples into batches and pads the tokenized texts
# to a common length.
# NOTE(review): presumably the padding is applied per batch (to the longest
# sequence in that batch) rather than across the whole dataset - confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# Class-id <-> class-name mappings (0: negative, 1: neutral, 2: positive).
# Both directions are passed to the config at load time; previously only
# id2label was patched afterwards, which left label2id at its defaults and
# therefore inconsistent with id2label in the saved config.
id2label = dict(enumerate(unique_label_sentiments))
label2id = {label: class_id for class_id, label in id2label.items()}

# Load DistilBERT with a classification head sized for the sentiment classes.
distilBert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_label_sentiments),
    id2label = id2label,
    label2id = label2id
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Confirm that every class id resolves to the expected label name.
for class_id in (0, 1, 2):
    print(distilBert_model.config.id2label[class_id])

negative
neutral
positive


In [19]:
# The trainer reports only the loss by default, so accuracy is added as an
# extra evaluation metric.
metric = load_metric("accuracy", trust_remote_code=True)


def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits (taken over the last axis) against the reference labels.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [23]:
# Number of passes over the training split.
epochs = 5

# Options for the training run, collected first and then handed to
# TrainingArguments in one go.
training_options = dict(
    output_dir="Results",
    num_train_epochs=epochs,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    load_best_model_at_end=True,
    # Warm-up steps for the learning-rate scheduler.
    # NOTE(review): this value is derived from the number of training
    # examples, not from the number of optimizer steps - confirm it is the
    # intended warm-up length.
    warmup_steps=len(tokenized_data['train']) // 3,
    weight_decay=0.05,
    logging_steps=1,
    log_level='info',
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy='epoch',
    save_strategy='epoch',
)
training_args = TrainingArguments(**training_options)

# Trainer wiring: model, run options, both dataset splits, the accuracy
# callback and the padding collator.
trainer = Trainer(
    model=distilBert_model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
# Evaluate once before training to record the baseline metrics.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 983
  Batch size = 10
100%|██████████| 99/99 [01:52<00:00,  1.13s/it]


{'eval_loss': 1.2258497476577759,
 'eval_accuracy': 0.054933875890132246,
 'eval_runtime': 113.6068,
 'eval_samples_per_second': 8.653,
 'eval_steps_per_second': 0.871}

In [None]:
# Run the training loop.
trainer.train()

In [None]:
# Evaluate again after training.
trainer.evaluate()

In [None]:
# Try the model on a short, made-up review.
pipe = pipeline(
    "text-classification",
    model=distilBert_model,
    tokenizer=tokenizer,
)
pipe('without problems')

In [None]:
# Persist the model and its tokenizer side by side.
# The model is saved through the trainer, and the tokenizer now goes to the
# trainer's own output directory. It was previously written to a hard-coded
# Google Drive path ("/content/drive/..."), which is unrelated to the relative
# paths used everywhere else in this notebook and left the saved model without
# its tokenizer next to it.
trainer.save_model()
tokenizer.save_pretrained(trainer.args.output_dir)