### Framework imports

In [6]:
from noronha.tools.publish import Publisher
from noronha.tools.shortcuts import data_path, tmp_path

### Application imports

In [7]:
import torch
import numpy as np 
import pandas as pd
import joblib

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Loading the Dataset

In [8]:
# Load both ATIS intent splits in one call; the result is indexed by the
# 'train' and 'test' keys given in data_files. Files are decoded as ISO-8859-1.
# Only this combined object is consumed further down: the training/eval
# dataset objects used by the trainer are built later from tokenized texts,
# so no separate per-file loads are needed here.
# NOTE(review): paths are relative to the working directory -- confirm the
# CSVs live next to the notebook rather than under a 'datasets/' folder.
dataset = load_dataset(
    'csv',
    data_files={'train': 'atis_intents_train.csv', 'test': 'atis_intents_test.csv'},
    encoding="ISO-8859-1",
)

### Data Labeling

In [9]:
# Pull the raw utterances and their intent names out of each split.
train_split = dataset["train"]
test_split = dataset["test"]

train_texts = [row["text"] for row in train_split]
train_labels = [row["label"] for row in train_split]

test_texts = [row["text"] for row in test_split]
test_labels = [row["label"] for row in test_split]

# Intent class names handled by the classifier.
label_names = [
    'atis_flight',
    'atis_airfare',
    'atis_ground_service',
    'atis_airline',
    'atis_abbreviation',
    'atis_aircraft',
    'atis_flight_time',
    'atis_quantity',
    'atis_airport',
    'atis_distance',
    'atis_city',
    'atis_ground_fare',
    'atis_capacity',
    'atis_flight_no',
    'atis_meal',
    'atis_restriction',
    'atis_cheapest',
]

# Hold out 20% of the training examples as a dev set; the fixed random_state
# keeps the shuffle (and therefore the split) reproducible.
split_parts = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)
train_texts, dev_texts, train_labels, dev_labels = split_parts


### Useful functions

In [None]:
class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded features with one label per example.

    Args:
        encodings: mapping from feature name to an indexable collection that
            holds one entry per example (tensors or nested lists).
        labels: sequence of labels aligned with the entries of ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus its ``'label'`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['label'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning on every call;
        ``clone().detach()`` is the recommended way to copy a tensor, so it is
        used when the encodings already hold tensors.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

In [None]:
def compute_metrics(pred):
    """Compute classification accuracy from an evaluation prediction object.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose arg-max over the last axis is taken
            as the predicted class).

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    return dict(accuracy=accuracy_score(pred.label_ids, pred.predictions.argmax(-1)))

In [None]:
def label_to_num(word_list):
    """Encode each item as the rank of its first appearance in ``word_list``.

    Distinct values are numbered 0, 1, 2, ... in the order they are first
    seen, so the numbering depends on the order of the input: two different
    lists are numbered independently of each other.

    Args:
        word_list: iterable of hashable values.

    Returns:
        list of ints, one per input item.
    """
    first_seen = {}
    # setdefault stores len(first_seen) -- the next unused id -- only when the
    # word is new; otherwise it returns the id assigned earlier.
    return [first_seen.setdefault(word, len(first_seen)) for word in word_list]

### Defining the model & the Tokenizer

In [None]:
model_id = "albert-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The classification head gets one output per entry in label_names.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(label_names))

# Tokenize each split; padding/truncation make every example in a split the
# same length so the result can be returned as PyTorch tensors.
train_texts_encoded = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
dev_texts_encoded = tokenizer(dev_texts, padding=True, truncation=True, return_tensors="pt")
test_texts_encoded = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# One shared name -> id table for every split. Numbering each split on its own
# (by order of first appearance) would give the same intent different ids in
# train, dev and test, which makes the evaluation metrics meaningless.
label_to_id = {name: idx for idx, name in enumerate(label_names)}


def encode_labels(labels, split):
    """Translate intent names to ids via label_to_id, failing loudly on unknown names."""
    unknown = sorted({label for label in labels if label not in label_to_id}, key=str)
    if unknown:
        raise ValueError(f"{split} split contains labels missing from label_names: {unknown}")
    return [label_to_id[label] for label in labels]


# New names (rather than overwriting train_labels etc.) keep this cell
# re-runnable: the string labels stay available as input.
train_label_ids = encode_labels(train_labels, "train")
dev_label_ids = encode_labels(dev_labels, "dev")
test_label_ids = encode_labels(test_labels, "test")

train_dataset = ClassificationDataset(train_texts_encoded, train_label_ids)
dev_dataset = ClassificationDataset(dev_texts_encoded, dev_label_ids)
test_dataset = ClassificationDataset(test_texts_encoded, test_label_ids)

### Training Arguments

In [None]:
# Per-device training batch size; also used to derive the warmup length so the
# two values cannot drift apart.
TRAIN_BATCH_SIZE = 16

training_args = TrainingArguments(
    # Checkpoints are written here; they are needed for load_best_model_at_end.
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=64,
    # Number of training examples divided by the batch size.
    warmup_steps=len(train_dataset) // TRAIN_BATCH_SIZE,
    weight_decay=0.01,
    logging_dir='./logs',
    # Evaluate and checkpoint on the same 50-step schedule:
    # load_best_model_at_end requires the save and evaluation strategies to
    # match, so save_strategy cannot be "no" here.
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=10,
    load_best_model_at_end=True,
)

# The dev split drives the periodic evaluation; the test split is kept aside.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

### Training 

In [None]:
# Fine-tune the model with the configured trainer.
trainer.train()

# Score the trained model on the test split; the returned metrics are kept
# for publishing.
test_results = trainer.evaluate(eval_dataset=test_dataset)

### Model publish

In [None]:
# Serialize the trained model object to the location returned by
# tmp_path('model.pkl').
# NOTE(review): presumably the publisher picks the file up from there -- confirm.
model_file = tmp_path('model.pkl')
joblib.dump(model, model_file)

# Publish, attaching the test-set metrics as details.
publish = Publisher()
publish(details={'metrics': test_results})