In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers



In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_metric
import random


In [3]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
  for i in range(cycles):
    new_df = old_df.sample(frac=1).reset_index(drop=True)
  return new_df

In [4]:
import datasets
import random
from transformers import AutoTokenizer

data = {'text': [], 'class': []}


with open(r'water_requests.txt', 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        line = line.replace('...', ',')
        line = line.lstrip('"').rstrip('"')
        data['text'].append(line)
        data['class'].append('water')


with open(r'not_water_requests.txt', 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        line = line.replace('...', ',')
        line = line.lstrip('"').rstrip('"')
        data['text'].append(line)
        data['class'].append('others')


your_dataset = datasets.Dataset.from_dict(data)


In [5]:
t = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
template = "This example is {}."

id2labels = ["water", "others"]


def create_input_sequence(sample):
    text = sample["text"]
    label = sample["class"][0]
    contradiction_label = random.choice([x for x in id2labels if x != label])

    encoded_sequence = t(
        text*2, [template.format(label), template.format(contradiction_label)])
    encoded_sequence["labels"] = [1, 0]
    encoded_sequence["input_sentence"] = t.batch_decode(
        encoded_sequence.input_ids)

    return encoded_sequence

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
your_dataset = your_dataset.train_test_split(test_size=0.2)
train_ds = your_dataset["train"]
test_ds = your_dataset["test"]

In [7]:
train_dataset = train_ds.map(
    create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "text"])
test_dataset = test_ds.map(create_input_sequence, batched=True,
                           batch_size=1, remove_columns=["class", "text"])

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

In [8]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

In [9]:
from transformers import BartTokenizerFast
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

In [10]:
def compute_metrics(p: EvalPrediction):
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(preds, axis = 1)
  result = {}
  result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
  result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
  return result

In [11]:
model_directory = r'/content'

In [12]:
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(id2labels), ignore_mismatched_sizes = True)

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
  output_dir = model_directory,      # Output directory
  num_train_epochs = 32,             # Total number of training epochs
  per_device_train_batch_size = 16,  # Batch size per device during training
  per_device_eval_batch_size = 64,   # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.01,               # Strength of weight decay
)

trainer = Trainer(
  model = model,                     # The instantiated model to be trained
  args = training_args,              # Training arguments, defined above
  compute_metrics = compute_metrics, # A function to compute the metrics
  train_dataset = train_dataset,     # Training dataset
  eval_dataset = test_dataset,       # Evaluation dataset
  tokenizer = tokenizer              # The tokenizer that was used
)

In [14]:
trainer.evaluate()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  metric_acc = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.6143875122070312,
 'eval_accuracy': 0.7068965517241379,
 'eval_f1': 0.6887626262626263,
 'eval_runtime': 6.145,
 'eval_samples_per_second': 18.877,
 'eval_steps_per_second': 0.325}

In [15]:
trainer.train()

Step,Training Loss
500,0.0309


TrainOutput(global_step=928, training_loss=0.016635230480209554, metrics={'train_runtime': 431.4708, 'train_samples_per_second': 34.116, 'train_steps_per_second': 2.151, 'total_flos': 776922466067760.0, 'train_loss': 0.016635230480209554, 'epoch': 32.0})

In [16]:
trainer.evaluate()

{'eval_loss': 1.333910518042103e-06,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 2.8975,
 'eval_samples_per_second': 40.034,
 'eval_steps_per_second': 0.69,
 'epoch': 32.0}

In [17]:
from transformers import pipeline

In [18]:
classifier = pipeline("zero-shot-classification", model = model, tokenizer = tokenizer, device = 0)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [19]:
sequences = 'I need water'

In [20]:
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'I need water',
 'labels': ['water', 'others'],
 'scores': [0.9999986886978149, 1.3524949054044555e-06]}

In [25]:
sequences = 'I am thirsty'

In [26]:
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'I am thirsty',
 'labels': ['water', 'others'],
 'scores': [0.9981848001480103, 0.001815195893868804]}

In [27]:
sequences = 'I am hungry'

In [28]:
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'I am hungry',
 'labels': ['others', 'water'],
 'scores': [0.5975476503372192, 0.402452290058136]}