In [1]:
# Mount Google Drive at /content/drive; the data and model paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
! pip install datasets transformers sentencepiece

In [None]:
import pickle

import transformers

# Show which transformers release is installed, for reproducibility.
print(transformers.__version__)

In [None]:
# Folder on the mounted Drive that holds the CSV data and receives the saved model files.
task_path = '/content/drive/My Drive/Supernova-NLP/intent classification/intent classification'
# Token length every sentence is padded or truncated to by the tokenizer calls below.
max_length = 128

## Loading the dataset

In [None]:
import datasets
from datasets import load_metric
from datasets import DatasetDict, Dataset, ClassLabel
import pandas as pd
import matplotlib.pyplot as plt

Read the sentences from the CSV file

In [None]:
# Load the two relevant columns and keep only the rows that carry an intent label.
df_sentences = (
    pd.read_csv(f'{task_path}/sentiment-data.csv')
    .loc[lambda frame: frame['INTENT'].notna(), ["INTENT", "SENTENCES"]]
)

In [None]:
# Histogram of the intent column, to inspect how the classes are distributed.
fig, ax = plt.subplots()
ax.hist(df_sentences['INTENT'])
plt.show()

Make datasets.Dataset object and split into two sets

In [None]:
# The distinct intents, in order of first appearance, form the label vocabulary.
classes = df_sentences['INTENT'].unique().tolist()
classLabel = ClassLabel(names=classes, num_classes=len(classes))

In [None]:
import os

# Persist the label mapping so the demo section can turn predicted ids back into intent names.
# The models folder may not exist yet at this point, so create it before opening the file.
os.makedirs(f'{task_path}/models', exist_ok=True)
with open(f'{task_path}/models/classLabel.pickle', 'wb') as handle:
    pickle.dump(classLabel, handle)

In [None]:
# Build the dataset: raw sentence text, integer-encoded intent, and the original row index.
dataset = Dataset.from_dict({
    'sentence': df_sentences['SENTENCES'],
    'label': classLabel.str2int(df_sentences['INTENT']),
    'idx': df_sentences.index,
})

In [None]:
# Attach the existing ClassLabel feature to the integer label column.
# class_encode_column would convert the ids to strings and rebuild the classes in
# lexicographic order ('0', '1', '10', '11', '2', ...), so with more than ten intents
# the stored ids would no longer match the classLabel mapping pickled for inference.
dataset = dataset.cast_column('label', classLabel)

In [None]:
# Keep 90% of the rows for training and hold out the rest for evaluation.
# A fixed seed makes the split (and therefore the reported accuracy) reproducible across runs.
train_size = 0.9
dataset = dataset.train_test_split(train_size=train_size, seed=42)

In [None]:
# Display both splits to check their columns and sizes.
dataset['train'], dataset['test']

In [None]:
# Look at one example from the test split.
dataset['test'][1]

Load Metric. The metric is an instance of [`datasets.Metric`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Metric):

In [None]:
# Accuracy metric, used by compute_metrics during evaluation.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases — confirm the installed version still provides it.
metric = load_metric("accuracy")

In [None]:
# Display the metric object.
metric

## Preprocessing the data

Import tokenizer for XLM-Roberta

In [None]:
from transformers import XLMRobertaTokenizer

# Tokenizer matching the xlm-roberta-base checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

You can directly call this tokenizer on one sentence or a pair of sentences:

In [None]:
#tokenizer("Hello, this one sentence!", "And this sentence goes with it.", truncation=True)

All preprocessing is done by this function.

In [None]:
def preprocess_function(examples):
    """Tokenize the 'sentence' field of a batch of examples.

    Every sentence is padded or truncated to exactly `max_length` tokens,
    so all encoded examples have the same length.
    """
    sentences = examples['sentence']
    return tokenizer(
        sentences,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Tokenize both splits in batches; the on-disk cache is bypassed so the work is always redone.
encoded_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
    load_from_cache_file=False,
)

In [None]:
# Return PyTorch tensors from the dataset, but leave them on the CPU: the Trainer moves
# each batch to the model's device itself, and tensors that already live on the GPU
# cannot be pinned by the DataLoader's memory-pinning step.
encoded_dataset.set_format(type='torch')

## Fine-tuning the model

Import Model and Trainer

In [None]:
from transformers import (
    Trainer,
    TrainingArguments,
    XLMRobertaConfig,
    XLMRobertaForSequenceClassification,
)

# Stock xlm-roberta-base configuration with one output per intent class.
config = XLMRobertaConfig.from_pretrained(
    'xlm-roberta-base',
    num_labels=classLabel.num_classes,
)

In [None]:
# Load the pretrained xlm-roberta-base weights; only the classification head is newly
# initialised. Constructing the class from a config alone creates a model with entirely
# random weights, which would mean training from scratch instead of fine-tuning.
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', config=config)
model.to(device)

In [None]:
! pip install wandb

In [None]:
import wandb
# Authenticate with Weights & Biases; training below is configured with report_to='wandb'.
wandb.login()

In [None]:
metric_name = "accuracy"
batch_size = 16

# Checkpoints are written to {task_path}/models. Evaluation runs once per epoch, and the
# checkpoint with the best value of `metric_name` is reloaded when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` —
# confirm against the installed version.
args = TrainingArguments(
    f"{task_path}/models",
    evaluation_strategy = "epoch",
    # load_best_model_at_end requires checkpoints to be saved on the same schedule as
    # evaluation; the default save strategy is step-based and does not match "epoch".
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    weight_decay=0.05,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to='wandb'
)

In [None]:
import numpy as np
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    `eval_pred` unpacks into (predictions, labels); the predicted class of each
    example is the argmax over axis 1 of the predictions.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Wire the model, training arguments, tokenized splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Force a fresh Weights & Biases login from the shell.
!wandb login --relogin

In [None]:
# Start the Weights & Biases run before training.
# NOTE(review): the project and entity values look like placeholders — replace them with real ones.
wandb.init(project="projecName", entity="username")

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

We can check with the `evaluate` method that our `Trainer` did reload the best model properly (if it was not the last one):

In [None]:
# Evaluate the model currently held by the trainer on the test split.
trainer.evaluate()

In [None]:

 # Save to task_path; the demo section later loads the model and tokenizer from this folder.
 trainer.save_model(task_path)

## Hyperparameter search

The `Trainer` supports hyperparameter search using [optuna](https://optuna.org/) or [Ray Tune](https://docs.ray.io/en/latest/tune/). For this last section you will need either of those libraries installed, just uncomment the line you want on the next cell and run it.

In [None]:
#@title
# ! pip install optuna
# ! pip install ray[tune]

During hyperparameter search, the `Trainer` will run several trainings, so it needs to have the model defined via a function (so it can be reinitialized at each new run) instead of just having it passed. We just use the same model setup as before:

In [None]:
#@title
def model_init():
    """Return a fresh XLM-RoBERTa classifier for one hyperparameter-search trial.

    The encoder is loaded from the pretrained xlm-roberta-base checkpoint so every
    trial fine-tunes the same starting weights; building the class from a config
    alone would start each trial from random weights instead.
    """
    config = XLMRobertaConfig.from_pretrained('xlm-roberta-base')
    config.num_labels = classLabel.num_classes
    return XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', config=config)

And we can instantiate our `Trainer` like before:

In [None]:
#@title
# Trainer for the hyperparameter search: a new model per trial via model_init,
# trained on one of five shards of the training split.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset['train'].shard(num_shards=5, index=1),
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

The method we call this time is `hyperparameter_search`. Note that it can take a long time to run on the full dataset for some tasks. You can look for good hyperparameters on a portion of the training dataset by replacing the `train_dataset` line above with:
```python
train_dataset = encoded_dataset["train"].shard(index=1, num_shards=10) 
```
for 1/10th of the dataset. Then you can run a full training on the best hyperparameters picked by the search.

In [None]:
#@title
# Run 10 search trials and keep the one with the highest objective value.
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

The `hyperparameter_search` method returns a `BestRun` object, which contains the value of the objective that was maximized (by default the sum of all metrics) and the hyperparameters used for that run.

In [None]:
#@title
# Display the result of the search.
best_run

You can customize the objective to maximize by passing along a `compute_objective` function to the `hyperparameter_search` method, and you can customize the search space by passing a `hp_space` argument to `hyperparameter_search`. See this [forum post](https://discuss.huggingface.co/t/using-hyperparameter-search-in-trainer/785/10) for some examples.

To reproduce the best training, just set the hyperparameters in your `TrainingArguments` before creating a `Trainer`:

In [None]:
#@title
# Copy the winning hyperparameters onto the trainer's arguments, then train again.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

trainer.train()

# Demo

In [None]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer

Load model, tokenizer and label mapping

In [None]:
# Reload the fine-tuned model and its tokenizer from the folder they were saved to.
model_folder = f'{task_path}'
model = XLMRobertaForSequenceClassification.from_pretrained(model_folder)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_folder)

# Restore the label mapping pickled before training.
# Only unpickle files this notebook wrote itself: pickle.load can execute arbitrary code.
label_file = f'{task_path}/models/classLabel.pickle'
with open(label_file, 'rb') as handle:
    classLabel = pickle.load(handle)

In [None]:
# Look at one held-out example.
# NOTE(review): assumes the test split has more than 540 rows — confirm for the current data.
dataset['test'][540]

Enter sentence here

In [None]:
# Sentence to classify in the demo cells below.
demo_sentence = "6 სექტემებრის მონაცმებით 2514 ადმიანი დაინფიცირდა"

Tokenize sentence and predict label with model

In [None]:
# Encode the demo sentence with the same padding and truncation settings used for training.
tokenized = tokenizer(
    demo_sentence,
    padding='max_length',
    truncation=True,
    max_length=max_length,
)
# Wrapping each sequence in a list adds a leading batch dimension of size 1.
input_ids = torch.tensor([tokenized['input_ids']], dtype=torch.long)
attention_mask = torch.tensor([tokenized['attention_mask']], dtype=torch.float)

In [None]:
# Display the token ids tensor.
input_ids

In [None]:
# Display the attention mask tensor.
attention_mask

In [None]:
# Classify each sentence of the batch separately and collect the predicted class ids.
outputs = []
with torch.no_grad():  # inference only, so no gradients are needed
    # zip pairs every row of token ids with its attention mask. Iterating the bare tuple
    # (input_ids, attention_mask) would instead try to unpack each whole tensor into two names.
    for ids_row, mask_row in zip(input_ids, attention_mask):
        # unsqueeze(0) restores the batch dimension that row-wise iteration removed
        row_logits = model(
            input_ids=ids_row.unsqueeze(0),
            attention_mask=mask_row.unsqueeze(0),
        ).logits
        label_idx = row_logits.argmax()
        outputs.append(label_idx)

In [None]:
# Run the whole batch through the model and take the index of the largest logit.
logits = model.forward(input_ids, attention_mask).logits
label_idx = logits.argmax()
label_idx

In [None]:
# Translate the predicted class id back into its intent name.
# (The unfinished `for i in` header that used to open this cell was a syntax error.)
label = classLabel.int2str([int(label_idx)])
label

In [None]:
  # Map the predicted class id back to its intent name, then display it.
  label = classLabel.int2str([label_idx])
label