In [None]:
!pip install transformers datasets nlp

# Fine-tuning a model on a text classification task

In [None]:
# Key looked up in `task_to_keys` to choose which dataset columns get tokenized.
task = "test"
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
model_checkpoint = "dbmdz/electra-base-ukrainian-cased-discriminator"
# Per-device batch size handed to TrainingArguments for training and evaluation alike.
batch_size = 8

## Loading the dataset

In [None]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import string
import re
import os

In [None]:
# Read the labelled training set, rename its columns to the sentence1 / sentence2 /
# label names used by the rest of the notebook, and name the index 'idx'.
data = (
    pd.read_csv('../input/newsclass01/train.csv', index_col=0)
    .rename(columns={'title': 'sentence1', 'text': 'sentence2', 'source': 'label'})
    .rename_axis(index='idx')
)
data

In [None]:
data.info()

In [None]:
# Read the unlabelled test set, drop the 'images' column, and apply the same column
# renaming and index name as the training frame.
test_data = (
    pd.read_csv('../input/newsclass01/test_without_target.csv', index_col=0)
    .drop(columns=['images'])
    .rename(columns={'title': 'sentence1', 'text': 'sentence2', 'source': 'label'})
    .rename_axis(index='idx')
)
test_data

In [None]:
# Features are every column except 'label' and 'images'; the target is the 'label' column.
X, y = data.drop(columns=['label', 'images']), data['label']

In [None]:
# Hold out 33% of the rows; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
import datasets
from datasets import Dataset, DatasetDict, load_metric

# Assemble the three splits consumed by the tokenizer and the Trainer.
# NOTE(review): 'train' is built from the full X / y rather than X_train / y_train, so
# every 'validation' row is also a training row. Later cells size the train features
# with len(data), so confirm the intent before changing this.
dataset = DatasetDict({
    'train': Dataset.from_pandas(pd.concat([X, y], axis=1), split='train'),
    'validation': Dataset.from_pandas(pd.concat([X_test, y_test], axis=1), split='validation'),
    'test': Dataset.from_pandas(test_data, split='test'),
})
dataset

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to `model_checkpoint`, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
tokenizer("Понад півтора мільйони пацієнтів із тяжкими", "Сніг та 20-градусні морози протримаються.")

In [None]:
# Maps a task name to the (first, second) column names fed to the tokenizer.
# A second entry of None is treated downstream as single-sentence input.
task_to_keys = {
    "test": ("sentence1", "sentence2"),
}

In [None]:
# Resolve the column names for the selected task and preview the first training row.
sentence1_key, sentence2_key = task_to_keys[task]
# Fetch the row once instead of materialising a whole column per printed field.
first_example = dataset['train'][0]
if sentence2_key is None:
    print(f"Text: {first_example[sentence1_key]}")
else:
    # For the pair task, sentence1 is the renamed 'title' column and sentence2 the
    # renamed 'text' column, so the labels follow that order.
    print(f"Title: {first_example[sentence1_key]}")
    print(f"Text: {first_example[sentence2_key]}")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples with the module-level `tokenizer`.

    The column named by `sentence1_key` is always passed; the column named by
    `sentence2_key` is passed as a second argument unless that key is None.
    Truncation is always enabled.

    Args:
        examples: Mapping from column name to the batch of values for that column.

    Returns:
        Whatever the tokenizer call returns.
    """
    columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        columns.append(examples[sentence2_key])
    return tokenizer(*columns, truncation=True)

In [None]:
preprocess_function(dataset['train'][:5])

In [None]:
# Apply `preprocess_function` to every split of `dataset` in batched mode.
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset

In [None]:
encoded_dataset["validation"][0]

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Number of distinct values in the label column; passed to the model as `num_labels`.
num_labels = len(data['label'].unique())
# Key of the score (as returned by `compute_metrics`) used as `metric_for_best_model`.
metric_name = "f1"
# Name of the split in `encoded_dataset` used as the evaluation set.
validation_key = "validation"

In [None]:
# Load the checkpoint as a sequence-classification model configured for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model

In [None]:
# Training configuration for the ELECTRA run; "test-Electra" is the first positional
# argument (the directory later cells list and zip checkpoints from).
# NOTE(review): load_best_model_at_end=True is combined with per-epoch evaluation but no
# explicit save strategy. Some transformers releases require the save and evaluation
# strategies to match — confirm against the installed version.
args = TrainingArguments(
    "test-Electra",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,  # "f1", the key returned by compute_metrics
)

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        pred: Two-item iterable unpacked as (scores, labels). The predicted class
            of each row is the arg-max of `scores` along axis 1.

    Returns:
        A dict with the single key 'f1'.
    """
    scores, gold = pred
    predicted = np.argmax(scores, axis=1)
    return {'f1': f1_score(gold, predicted, average='macro')}

In [None]:
# model_electra = AutoModelForSequenceClassification.from_pretrained('../input/xlmroberta3/electra/test-RoBERTa/checkpoint-2023', num_labels=num_labels)
# model_electra

In [None]:
# Trainer for the ELECTRA model: trains on the encoded 'train' split, evaluates on the
# split named by `validation_key`, and scores each evaluation with `compute_metrics`.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
res = trainer.predict(encoded_dataset['test'])
res

In [None]:
df_preds = pd.DataFrame(res.predictions, index=test_data.index)
df_preds

In [None]:
df_preds.to_csv('preds-electra-6.csv')

In [None]:
# Submission frame: one 'Predicted' column holding the arg-max class of each test row,
# indexed by the test ids under the name 'Id'.
df_res = pd.DataFrame(
    {'Predicted': np.argmax(res.predictions, axis=1)},
    index=test_data.index.rename('Id'),
)
df_res

In [None]:
df_res.to_csv('./submission-ukr-electra-base-6.csv')

In [None]:
# Switch to a previously fine-tuned checkpoint for the second training run.
model_checkpoint = '../input/xlmroberta3/test-RoBERTa/checkpoint-16179'
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# The token ids in `encoded_dataset` were produced by the tokenizer of the previous
# checkpoint. A model must be fed ids from its own vocabulary, so reload the tokenizer
# from the new checkpoint and re-encode every split before building the next Trainer.
# NOTE(review): assumes the checkpoint directory also contains the tokenizer files
# (Trainer writes them when it is given a tokenizer) — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Training configuration for the second run: a single epoch, with "test-RoBERTa-v2" as
# the first positional argument. This rebinds `args`, so later cells that use `args`
# see this configuration rather than the six-epoch one defined earlier.
args = TrainingArguments(
    "test-RoBERTa-v2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

# Separate Trainer for the reloaded `model`; it uses the same encoded splits, tokenizer
# and metric function as the first Trainer.
new_trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
model

In [None]:
new_trainer.train()

In [None]:
# Predict on the test split with the second Trainer.
res = new_trainer.predict(encoded_dataset['test'])

# Submission frame: arg-max class per test row, indexed by the test ids as 'Id'.
df_res = pd.DataFrame(
    {'Predicted': np.argmax(res.predictions, axis=1)},
    index=test_data.index.rename('Id'),
)

df_res.to_csv('./submission-xlm-roberta-large-4.csv')

In [None]:
features = new_trainer.predict(Dataset.from_dict(encoded_dataset['test'][:100])) # Dataset.from_dict({'input_ids': input_ids, 'attention_mask': attention_mask}) .remove_columns('label')
features

In [None]:
features.predictions.shape

In [None]:
def extract_features(data, size, count=1000):
    """Run `new_trainer.predict` over `data` in batches and collect axis-1 means.

    Args:
        data: Sliceable dataset. `data[a:b]` must return a column dict accepted by
            `Dataset.from_dict`, and `len(data)` must give the number of rows.
        size: Total shown as the denominator of the progress message; it is not
            used to bound the loop.
        count: Number of rows sent to `new_trainer.predict` per batch.

    Returns:
        A list with one array per batch: that batch's predictions averaged over
        axis 1, with the averaged axis kept at length 1.
    """
    total = len(data)
    res_features = []
    # range() stops exactly at the end of the data. The previous `len(data) < offset`
    # check let one more iteration through whenever the length was an exact multiple
    # of the batch size, which called predict on an empty slice.
    for offset in range(0, total, count):
        batch = Dataset.from_dict(data[offset:offset + count])
        features = new_trainer.predict(batch)
        res_features.append(np.apply_over_axes(np.mean, features.predictions, [1,]))
        print(f'{min(offset + count, total)}/{size}')
    return res_features

In [None]:
# Extract features for the test split here: no earlier cell defines `res_features`,
# so reading it directly fails when the notebook is run top to bottom.
res_features = extract_features(encoded_dataset['test'], len(test_data))
# Flatten each batch and join everything in one pass; np.append inside a loop re-copies
# the growing array on every iteration. The leading empty float64 array keeps the
# result dtype and the empty-input behaviour of the original accumulation.
test_features = np.concatenate([np.array([])] + [np.ravel(feat) for feat in res_features])
# One row per test example, 1024 values per row.
test_features = test_features.reshape((len(test_data), 1024))
test_features.shape

In [None]:
df_res = pd.DataFrame(test_features, index=test_data.index)
df_res

In [None]:
df_res.info()

In [None]:
df_res.to_csv('./test-features.csv')

In [None]:
res_features = extract_features(encoded_dataset['train'].remove_columns('label'), len(data))
res_features

In [None]:
# Flatten each batch in `res_features` and join them in one pass; np.append inside a
# loop re-copies the growing array on every iteration. The leading empty float64 array
# keeps the result dtype and the empty-input behaviour of the original accumulation.
train_features = np.concatenate([np.array([])] + [np.ravel(feat) for feat in res_features])
# One row per training example, 1024 values per row.
train_features = train_features.reshape((len(data), 1024))
train_features.shape

In [None]:
df_train = pd.DataFrame(train_features, index=data.index)
df_train

In [None]:
df_train.to_csv('./train-features.csv')

In [None]:
import shutil

# Copy the './test-RoBERTa' directory tree to a Google Drive folder.
# NOTE(review): '/content/drive' looks like a Colab mount while the data paths in this
# notebook are '../input/...', and no TrainingArguments here writes to './test-RoBERTa'
# (the output directories are "test-Electra" and "test-RoBERTa-v2") — confirm both paths.
shutil.copytree('./test-RoBERTa', '/content/drive/MyDrive/contest/models/RoBERTa-large')

In [None]:
! rm -rf ./test-RoBERTa/checkpoint-5393/trainer_state.json

In [None]:
! ls ./test-Electra

In [None]:
! cat ./test-Electra/checkpoint-12138/trainer_state.json

In [None]:
!zip -r ./electra-6.zip ./test-Electra/checkpoint-12138

<a href="./electra-6.zip"> Download File </a>

In [None]:
# NOTE(review): empty placeholder. `model_init` passes this value to `from_pretrained`,
# so set it to a real checkpoint before running the hyperparameter search.
model_checkpoint = ''

In [None]:
def model_init():
    """Return a freshly loaded sequence-classification model.

    Reads the module-level `model_checkpoint` and `num_labels` each time it is
    called, so every call loads the model anew from that checkpoint.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
    )
    return fresh_model

And we can instantiate our `Trainer` like before:

In [None]:
# Trainer built from `model_init` rather than a model instance. It rebinds `trainer` and
# uses whatever `args` currently refers to (the most recent TrainingArguments cell).
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

The method we call this time is `hyperparameter_search`. Note that it can take a long time to run on the full dataset for some of the tasks. You can try to find some good hyperparameters on a portion of the training dataset by replacing the `train_dataset` line above by:
```python
train_dataset = encoded_dataset["train"].shard(index=1, num_shards=10) 
```
for 1/10th of the dataset. Then you can run a full training on the best hyperparameters picked by the search.

In [None]:
# Run 10 search trials, keeping the run with the highest objective.
# NOTE(review): the search presumably needs an optional backend (e.g. optuna or Ray Tune)
# that the install cell of this notebook does not list — confirm it is available.
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

The `hyperparameter_search` method returns a `BestRun` object, which contains the value of the objective maximized (by default the sum of all metrics) and the hyperparameters it used for that run.

In [None]:
best_run

You can customize the objective to maximize by passing along a `compute_objective` function to the `hyperparameter_search` method, and you can customize the search space by passing a `hp_space` argument to `hyperparameter_search`. See this [forum post](https://discuss.huggingface.co/t/using-hyperparameter-search-in-trainer/785/10) for some examples.

To reproduce the best training, just set the hyperparameters in your `TrainingArguments` before creating a `Trainer`:

In [None]:
# Copy every hyperparameter of the best trial onto the trainer's arguments, then retrain.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

trainer.train()

Don't forget to [upload your model](https://huggingface.co/transformers/model_sharing.html) on the [🤗 Model Hub](https://huggingface.co/models). You can then use it only to generate results like the one shown in the first picture of this notebook!

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session