# IMDB

In [1]:
from datasets import load_dataset
imdb = load_dataset("imdb")
imdb["test"][0]

  from .autonotebook import tqdm as notebook_tqdm


{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [3]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import evaluate
accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [6]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
for param in model.base_model.parameters():
    param.requires_grad = False

In [12]:
from torchinfo import summary

summary(model, col_names=("num_params", "trainable"))

Layer (type:depth-idx)                                  Param #                   Trainable
DistilBertForSequenceClassification                     --                        Partial
├─DistilBertModel: 1-1                                  --                        False
│    └─Embeddings: 2-1                                  --                        False
│    │    └─Embedding: 3-1                              (23,440,896)              False
│    │    └─Embedding: 3-2                              (393,216)                 False
│    │    └─LayerNorm: 3-3                              (1,536)                   False
│    │    └─Dropout: 3-4                                --                        --
│    └─Transformer: 2-2                                 --                        False
│    │    └─ModuleList: 3-5                             (42,527,232)              False
├─Linear: 1-2                                           590,592                   True
├─Linear: 1-3                 

In [None]:
trainer.train()

# Chatbot

In [99]:
import pandas as pd

df = pd.read_csv("category.csv", sep=";")
df.head()

Unnamed: 0,text,label
0,Hola,saludo
1,Cómo estás,saludo
2,como estas,saludo
3,holaaaa,saludo
4,holap,saludo


In [100]:
id2label = {0: "saludo", 1: "despedida", 2: "equipo"}
label2id = {"saludo": 0, "despedida": 1, "equipo": 2}

df['label'] = df['label'].replace(label2id)

  df['label'] = df['label'].replace(label2id)


In [101]:
sentence = df.iloc[122]['text']
sentence

'¿Quiénes están en su equipo?'

In [102]:
def apply_lemma(sentence):
    doc = nlp(sentence)
    lemma = [w.lemma_ for w in doc if w.pos_ != 'PUNCT']
    return ' '.join(lemma).lower()

df['text'] = df['text'].apply(apply_lemma)

In [103]:
df.iloc[122]['text']

'quién estar en su equipo'

In [104]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, stratify=df["label"])

In [106]:
from datasets import Dataset
train_ds = Dataset.from_pandas(train, split="train", preserve_index=False)
test_ds = Dataset.from_pandas(test, split="test", preserve_index=False)

In [107]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = train_ds.map(preprocess_function, batched=True)


Map: 100%|██████████| 132/132 [00:00<00:00, 35716.93 examples/s]

Map: 100%|██████████| 132/132 [00:00<00:00, 32052.81 examples/s]


In [108]:
test_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 34
})

In [109]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import evaluate
accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [110]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-multilingual-cased", num_labels=3, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="model_ckp",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [111]:
for param in model.base_model.parameters():
    param.requires_grad = False

In [112]:
from torchinfo import summary
summary(model, col_names=("num_params", "trainable"))

Layer (type:depth-idx)                                  Param #                   Trainable
DistilBertForSequenceClassification                     --                        Partial
├─DistilBertModel: 1-1                                  --                        False
│    └─Embeddings: 2-1                                  --                        False
│    │    └─Embedding: 3-1                              (91,812,096)              False
│    │    └─Embedding: 3-2                              (393,216)                 False
│    │    └─LayerNorm: 3-3                              (1,536)                   False
│    │    └─Dropout: 3-4                                --                        --
│    └─Transformer: 2-2                                 --                        False
│    │    └─ModuleList: 3-5                             (42,527,232)              False
├─Linear: 1-2                                           590,592                   True
├─Linear: 1-3                 

In [113]:
trainer.train()

  0%|          | 0/60 [03:48<?, ?it/s]

                                      

[A[A                               
  0%|          | 0/60 [03:17<?, ?it/s]        
[A
[A

{'eval_loss': 1.0894174575805664, 'eval_accuracy': 0.3484848484848485, 'eval_runtime': 0.1382, 'eval_samples_per_second': 955.177, 'eval_steps_per_second': 21.709, 'epoch': 1.0}



                                      
[A                                           

  0%|          | 0/60 [03:19<?, ?it/s]
[A
[A

{'eval_loss': 1.084415316581726, 'eval_accuracy': 0.3484848484848485, 'eval_runtime': 0.1163, 'eval_samples_per_second': 1135.39, 'eval_steps_per_second': 25.804, 'epoch': 2.0}



                                      
[A                                           

  0%|          | 0/60 [03:20<?, ?it/s]
[A
[A

{'eval_loss': 1.079811692237854, 'eval_accuracy': 0.3484848484848485, 'eval_runtime': 0.1173, 'eval_samples_per_second': 1125.237, 'eval_steps_per_second': 25.574, 'epoch': 3.0}



                                      
[A                                            

  0%|          | 0/60 [03:21<?, ?it/s]
[A
[A

{'eval_loss': 1.075578212738037, 'eval_accuracy': 0.3484848484848485, 'eval_runtime': 0.1172, 'eval_samples_per_second': 1126.489, 'eval_steps_per_second': 25.602, 'epoch': 4.0}



                                      
[A                                            

  0%|          | 0/60 [03:23<?, ?it/s]
[A
[A

{'eval_loss': 1.0716314315795898, 'eval_accuracy': 0.3560606060606061, 'eval_runtime': 0.1197, 'eval_samples_per_second': 1102.81, 'eval_steps_per_second': 25.064, 'epoch': 5.0}



                                      
[A                                            

  0%|          | 0/60 [03:25<?, ?it/s]
[A
[A

{'eval_loss': 1.068103551864624, 'eval_accuracy': 0.3560606060606061, 'eval_runtime': 0.1321, 'eval_samples_per_second': 999.522, 'eval_steps_per_second': 22.716, 'epoch': 6.0}



                                      
[A                                            

  0%|          | 0/60 [03:28<?, ?it/s]
[A
[A

{'eval_loss': 1.0647814273834229, 'eval_accuracy': 0.3560606060606061, 'eval_runtime': 0.132, 'eval_samples_per_second': 999.897, 'eval_steps_per_second': 22.725, 'epoch': 7.0}



                                      
[A                                            

  0%|          | 0/60 [03:29<?, ?it/s]
[A
[A

{'eval_loss': 1.061734676361084, 'eval_accuracy': 0.3560606060606061, 'eval_runtime': 0.1179, 'eval_samples_per_second': 1120.011, 'eval_steps_per_second': 25.455, 'epoch': 8.0}



                                      
[A                                            

  0%|          | 0/60 [03:30<?, ?it/s]
[A
[A

{'eval_loss': 1.0588958263397217, 'eval_accuracy': 0.36363636363636365, 'eval_runtime': 0.119, 'eval_samples_per_second': 1109.042, 'eval_steps_per_second': 25.205, 'epoch': 9.0}



                                      
[A                                            

  0%|          | 0/60 [03:32<?, ?it/s]
[A
[A

{'eval_loss': 1.056265950202942, 'eval_accuracy': 0.4015151515151515, 'eval_runtime': 0.1174, 'eval_samples_per_second': 1124.156, 'eval_steps_per_second': 25.549, 'epoch': 10.0}



                                      
[A                                            

  0%|          | 0/60 [03:33<?, ?it/s]
[A
[A

{'eval_loss': 1.0538983345031738, 'eval_accuracy': 0.4090909090909091, 'eval_runtime': 0.1183, 'eval_samples_per_second': 1116.179, 'eval_steps_per_second': 25.368, 'epoch': 11.0}



                                      
[A                                            

  0%|          | 0/60 [03:35<?, ?it/s]
[A
[A

{'eval_loss': 1.051780104637146, 'eval_accuracy': 0.42424242424242425, 'eval_runtime': 0.1216, 'eval_samples_per_second': 1085.323, 'eval_steps_per_second': 24.666, 'epoch': 12.0}



                                      
[A                                            

  0%|          | 0/60 [03:36<?, ?it/s]
[A
[A

{'eval_loss': 1.0499203205108643, 'eval_accuracy': 0.45454545454545453, 'eval_runtime': 0.117, 'eval_samples_per_second': 1128.496, 'eval_steps_per_second': 25.648, 'epoch': 13.0}



                                      
[A                                            

  0%|          | 0/60 [03:37<?, ?it/s]
[A
[A

{'eval_loss': 1.048315405845642, 'eval_accuracy': 0.4772727272727273, 'eval_runtime': 0.1169, 'eval_samples_per_second': 1129.161, 'eval_steps_per_second': 25.663, 'epoch': 14.0}



                                      
[A                                            

  0%|          | 0/60 [03:39<?, ?it/s]
[A
[A

{'eval_loss': 1.046942114830017, 'eval_accuracy': 0.5378787878787878, 'eval_runtime': 0.1177, 'eval_samples_per_second': 1121.694, 'eval_steps_per_second': 25.493, 'epoch': 15.0}



                                      
[A                                            

  0%|          | 0/60 [03:40<?, ?it/s]
[A
[A

{'eval_loss': 1.0458159446716309, 'eval_accuracy': 0.553030303030303, 'eval_runtime': 0.1173, 'eval_samples_per_second': 1125.601, 'eval_steps_per_second': 25.582, 'epoch': 16.0}



                                      
[A                                            

  0%|          | 0/60 [03:41<?, ?it/s]
[A
[A

{'eval_loss': 1.0449113845825195, 'eval_accuracy': 0.5606060606060606, 'eval_runtime': 0.1196, 'eval_samples_per_second': 1103.857, 'eval_steps_per_second': 25.088, 'epoch': 17.0}



                                      
[A                                            

  0%|          | 0/60 [03:43<?, ?it/s]
[A
[A

{'eval_loss': 1.0442523956298828, 'eval_accuracy': 0.5606060606060606, 'eval_runtime': 0.1193, 'eval_samples_per_second': 1106.646, 'eval_steps_per_second': 25.151, 'epoch': 18.0}



                                      
[A                                            

  0%|          | 0/60 [03:44<?, ?it/s]
[A
[A

{'eval_loss': 1.04384446144104, 'eval_accuracy': 0.5606060606060606, 'eval_runtime': 0.1211, 'eval_samples_per_second': 1089.852, 'eval_steps_per_second': 24.769, 'epoch': 19.0}



                                      
[A                                            

  0%|          | 0/60 [03:46<?, ?it/s]
[A
[A

{'eval_loss': 1.0436853170394897, 'eval_accuracy': 0.5606060606060606, 'eval_runtime': 0.1202, 'eval_samples_per_second': 1097.994, 'eval_steps_per_second': 24.954, 'epoch': 20.0}


                                      
100%|██████████| 60/60 [00:29<00:00,  2.01it/s]

{'train_runtime': 29.8246, 'train_samples_per_second': 88.518, 'train_steps_per_second': 2.012, 'train_loss': 1.0698511759440104, 'epoch': 20.0}





TrainOutput(global_step=60, training_loss=1.0698511759440104, metrics={'train_runtime': 29.8246, 'train_samples_per_second': 88.518, 'train_steps_per_second': 2.012, 'total_flos': 9718933795128.0, 'train_loss': 1.0698511759440104, 'epoch': 20.0})

# Testing

In [118]:
from transformers import AutoTokenizer

text = "como se llama su equipo?"

tokenizer = AutoTokenizer.from_pretrained("model_ckp/checkpoint-60")
inputs = tokenizer(text, return_tensors="pt")

from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("model_ckp/checkpoint-60")
with torch.no_grad():
    logits = model(**inputs).logits

print(logits)
print(torch.softmax(logits,dim=1))

predicted_class_id = logits.argmax().item()

model.config.id2label[predicted_class_id]

tensor([[ 0.0919, -0.0365, -0.0486]])
tensor([[0.3638, 0.3200, 0.3162]])


'saludo'