In [None]:
!pip install datasets
!pip install transformers

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only), then switch into the project
# folder so the relative paths used by later cells (e.g. './train.csv') resolve there.
# The mount must happen first: the target directory lives inside the mounted drive.
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/Colab Notebooks/Multi Indent")

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Read the four input tables from the current working directory.
# The order of the paths must stay aligned with the names on the left-hand side.
train_df, test_df, intents_df, user_queries_df = map(
    pd.read_csv,
    ('./train.csv', './test.csv', './intents.csv', './user_queries.csv'),
)

In [None]:
# Paragraph based on:
# https://www.kaggle.com/code/cbentes/intent-classification-with-tartunlp-estbert

from datasets import Dataset #https://huggingface.co/docs/transformers/preprocessing

# query_id -> lower-cased query text
query_map = {
    query_id: text.lower()
    for query_id, text in zip(user_queries_df['query_id'], user_queries_df['query'])
}
# intent_id -> intent name
intent_map = dict(zip(intents_df['intent_id'], intents_df['intent']))

# Every distinct intent id gets a fixed position (sorted order) in the label vector;
# index_intent is the reverse lookup from position back to intent id.
intent_index = dict(
    (intent_id, position)
    for position, intent_id in enumerate(sorted(intents_df.intent_id.unique()))
)
index_intent = {position: intent_id for intent_id, position in intent_index.items()}
# Length of the multi-hot label vector
n_intend = intents_df.intent_id.nunique()

def get_encoded_label(intents):
    """Turn an iterable of intent ids into a multi-hot list.

    The result has ``n_intend`` entries, all 0 except the positions that
    ``intent_index`` assigns to the given ids, which are set to 1.

    Raises:
        KeyError: if an id is not present in ``intent_index``.
    """
    encoded = [0 for _ in range(n_intend)]
    for position in map(intent_index.__getitem__, intents):
        encoded[position] = 1
    return encoded

def get_decoded_label(label):
    """Return the intent ids for every truthy position of a multi-hot sequence.

    Positions are translated back to ids through ``index_intent``; the ids come
    out in ascending position order.
    """
    decoded = []
    for position, flag in enumerate(label):
        if flag:
            decoded.append(index_intent[position])
    return decoded

In [None]:
# Paragraph based on:
# https://www.kaggle.com/code/cbentes/intent-classification-with-tartunlp-estbert

# Fixed seed so the train/validation split - and therefore every validation number
# reported further down - is the same on each Restart & Run All.
SPLIT_SEED = 42

# One row per training query: the (lower-cased) query text and its multi-hot intent
# vector. `intents` holds whitespace-separated intent ids.
_data = pd.DataFrame([{
    'text': query_map[row.query_id],
    'label': get_encoded_label(row.intents.split())
} for _, row in train_df.iterrows()])

# 80/20 hold-out: sample 80% of the rows for training, the remainder is validation.
_train = _data.sample(int(0.8*_data.shape[0]), random_state=SPLIT_SEED)
_validation = _data[~_data.index.isin(_train.index)]

# Drop the original row index before conversion so it is not carried into the datasets.
train = Dataset.from_pandas(_train.reset_index(drop=True)) # https://huggingface.co/docs/datasets/loading
validation = Dataset.from_pandas(_validation.reset_index(drop=True)) # https://huggingface.co/docs/datasets/loading

# Test queries carry no labels, only the text to classify.
_test = pd.DataFrame([{
    'text': query_map[row.query_id]
} for _, row in test_df.iterrows()])

test = Dataset.from_pandas(_test)

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [None]:
# Checkpoint to fine-tune. The commented-out lines are alternative checkpoints;
# the trailing numbers look like the score each one reached
# (NOTE(review): the metric is not recorded anywhere in this notebook - confirm).
MODEL = "tartuNLP/EstBERT"
# MODEL = "xlm-roberta-base" #0.7
# MODEL = "xlm-roberta-large-finetuned-conll03-english" #0.7
# MODEL = "bert-base-multilingual-uncased"
# MODEL = "tartuNLP/EstBERT_Morph_128"
# MODEL = "tartuNLP/EstBERT_NER_v2" #0.7
# MODEL = "tartuNLP/EstBERT_Morph_128" #0.78
# MODEL = "tartuNLP/EstBERT_XPOS_128"
# MODEL = "tartuNLP/EstBERT_UPOS_128"

# Load the tokenizer published with the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/534 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/config.json
Model config BertConfig {
  "_name_or_path": "tartuNLP/EstBERT",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}



Downloading:   0%|          | 0.00/410k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/config.json
Model config BertConfig {
  "_name_or_path": "tartuNLP/EstBERT",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids":

In [None]:
# Paragraph based on:
# https://www.kaggle.com/code/cbentes/intent-classification-with-tartunlp-estbert
# and
# https://huggingface.co/docs/transformers/preprocessing
def tokenize_function(dataset, max_length=512):
    """Tokenize the ``text`` column of a (batched) dataset slice.

    Args:
        dataset: mapping with a ``"text"`` entry, as passed by ``Dataset.map``.
        max_length: upper bound on the token sequence length. This tokenizer
            has no predefined maximum (the saved output shows the library
            warning that it falls back to no truncation), so without an
            explicit value ``truncation=True`` does nothing. The default of 512
            matches ``max_position_embeddings`` in the saved model config.

    Returns:
        The tokenizer output for the batch, padded to the longest sequence in
        the batch and truncated to ``max_length`` tokens.
    """
    return tokenizer(
        dataset["text"],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

# Tokenize each split in turn (train, validation, test) and drop the raw text
# column, which is not needed once the token ids exist.
tokenized_train, tokenized_validation, tokenized_test = (
    split.map(tokenize_function, batched=True).remove_columns('text')
    for split in (train, validation, test)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_metric

In [None]:
# Paragraph based on:
# https://www.kaggle.com/code/cbentes/intent-classification-with-tartunlp-estbert
# and 
# https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

# Training configuration handed to the Trainer below.
training_args = TrainingArguments(
    # Directory (relative to the current working directory) for Trainer output
    output_dir="test_trainer",
    # Run an evaluation pass after every epoch (one validation-loss row per epoch in the log)
    evaluation_strategy="epoch",
    # With 140 training examples and batch size 16 this comes to 1152 optimization steps
    num_train_epochs=128,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Keep everything local: no upload to the Hub, no experiment-tracker reporting
    push_to_hub=False,
    report_to="none",
    optim="adamw_torch"
)

def encoded_from_pred(pred, threshold=0.5):
    """Convert one row of raw model logits into a multi-hot list of 0/1.

    The model is configured for multi-label classification, where each output
    is an independent logit whose probability is ``sigmoid(logit)``. The earlier
    version thresholded ``exp(logit)`` instead, which flags every logit above
    ``log(0.5)`` (about -0.69) rather than above 0 and so over-predicts intents.

    Args:
        pred: 1-D sequence/array of logits, one per intent.
        threshold: probability cut-off, strictly between 0 and 1. A position is
            set to 1 when its sigmoid probability is strictly greater.

    Returns:
        list[int]: 1 where the predicted probability exceeds ``threshold``, else 0.

    Raises:
        ValueError: if ``threshold`` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError("threshold must be strictly between 0 and 1")
    # sigmoid(x) > t  <=>  x > log(t / (1 - t)). Comparing in logit space gives
    # the same decision without calling exp() on large-magnitude logits.
    logit_cutoff = np.log(threshold / (1.0 - threshold))
    return [1 if x > logit_cutoff else 0 for x in np.asarray(pred)]

# Build a sequence-classification model on top of the pretrained checkpoint, with one
# output per intent. The checkpoint's masked-LM head weights are not used (see the
# loader message in the output), so the classification head is trained from scratch here.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, 
    num_labels=n_intend, 
    # Multi-label setup: a query may carry several intents at once
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
  )

# Wire model, configuration and the tokenized splits together. No compute_metrics is
# passed, so evaluation reports loss (plus runtime statistics) only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation
  )

PyTorch: setting up devices
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/config.json
Model config BertConfig {
  "_name_or_path": "tartuNLP/EstBERT",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--tartuNLP--EstBERT/snapshots/ea615e186cd9a402edb90b7cfacfdcdc79893736/pytorch_model.bin
Some weights of the model checkpoint at tartuNLP/EstBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificatio

In [None]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 140
  Num Epochs = 128
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1152
  Number of trainable parameters = 124464414


Epoch,Training Loss,Validation Loss
1,No log,0.37938
2,No log,0.283718
3,No log,0.245981
4,No log,0.227324
5,No log,0.211456
6,No log,0.197828
7,No log,0.184171
8,No log,0.175243
9,No log,0.169216
10,No log,0.161703


***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  Num examples = 35
  Batch size = 16
***** Running Evaluation *****
  N

TrainOutput(global_step=1152, training_loss=0.05273294635117054, metrics={'train_runtime': 340.9284, 'train_samples_per_second': 52.562, 'train_steps_per_second': 3.379, 'total_flos': 782952174950400.0, 'train_loss': 0.05273294635117054, 'epoch': 128.0})

In [None]:
# Evaluate the trained model on the validation split; the metrics dict is the cell result.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 35
  Batch size = 16


{'eval_loss': 0.12067262083292007,
 'eval_runtime': 0.0737,
 'eval_samples_per_second': 474.687,
 'eval_steps_per_second': 40.687,
 'epoch': 128.0}

In [None]:
# Paragraph based on:
# https://www.kaggle.com/code/cbentes/intent-classification-with-tartunlp-estbert

# Validation split: put the reference intents next to the predicted ones so they
# can be inspected together.
val_pred = trainer.predict(tokenized_validation)

val_df = pd.DataFrame({
    'reference': _validation.label.apply(
        lambda encoded: ' '.join(get_decoded_label(encoded))
    ),
    'predict': [
        ' '.join(get_decoded_label(encoded_from_pred(logits)))
        for logits in val_pred.predictions
    ],
})

# Test split: logits -> multi-hot vectors -> space-separated intent ids.
predictions = trainer.predict(tokenized_test)

predictions_indexes = list(map(encoded_from_pred, predictions.predictions))
intents = [' '.join(get_decoded_label(encoded)) for encoded in predictions_indexes]

submission_df = pd.DataFrame({'query_id': test_df.query_id.values, 'intents': intents})

# File name derived from the checkpoint id, with '/' replaced so it is a valid file name.
name = MODEL.replace('/', '_')
submission_df.to_csv(f'{name}.csv', index=False)

***** Running Prediction *****
  Num examples = 35
  Batch size = 16


***** Running Prediction *****
  Num examples = 75
  Batch size = 16
