# Predict selected data using fine-tuned models

In [1]:
import os

# The Hugging Face libraries resolve their cache directories when they are
# first imported, so these variables must be set BEFORE `transformers` and
# `datasets` are imported; setting them afterwards has no effect.
# NOTE: the path is relative, i.e. resolved against the kernel's working directory.
os.environ['TRANSFORMERS_CACHE'] = 'data/volume_1/cache_hf'
os.environ['HF_HOME'] = 'data/volume_1/cache_hf'

import aiohttp

import pandas as pd
import numpy as np
import torch

import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from datasets import load_dataset

from imblearn.over_sampling import RandomOverSampler

In [2]:
# Fix the seed of every random number generator used in this notebook so that
# repeated runs are comparable.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Pick the compute device: report the CPU fallback first, otherwise describe
# the available GPU(s) and select CUDA.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print(f'There are {gpu_count} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: GeForce RTX 2080 Ti


## Load tokenizer and set parameters

In [23]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Maximum number of tokens per sequence passed to the tokenizer/model.
MAX_LENGTH = 512
# Backward-compatible alias: the constant was originally spelled `MAX_LENGHT`;
# keep that name so any cell still referring to it continues to work.
MAX_LENGHT = MAX_LENGTH
# Directory holding the labeled train/test splits (absolute, machine-specific path).
DATA_DIR = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split/"

# For tokenizer
# Hugging Face checkpoint identifier from which the tokenizer is loaded.
checkpoint = "wietsedv/bert-base-dutch-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30073, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Predict

In [24]:
# Decade and energy type select BOTH the data file and the fine-tuned model,
# so changing them here keeps the two in sync.
DECADE = "1960"
TYPE = "coal"

# Selected articles for this decade/type, read as a DataFrame.
selected = pd.read_csv(f"/home/leonardovida/data/volume_1/data-histaware/4-selected-data/{DECADE}/{DECADE}s_{TYPE}_labeled_full_0.95.csv")

# Fine-tuned classifier for the same decade/type. The directory name is derived
# from DECADE/TYPE instead of being hard-coded, so the model can no longer
# silently mismatch the data.
# NOTE(review): assumes every fine-tuning output directory is named
# "<DECADE>s_<TYPE>" like the 1960s coal one - confirm for other decades/types.
# `local_files_only=True` loads from disk without contacting the Hugging Face hub.
model = AutoModelForSequenceClassification.from_pretrained(
    f"/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs/{DECADE}s_{TYPE}",
    local_files_only=True,
)

In [4]:
# Load the selected-data CSV for the configured decade/type through the
# `datasets` CSV builder; `split="train"` is requested so a single dataset
# split is returned.
dataset = load_dataset(
    'csv',
    data_files=f"/home/leonardovida/data/volume_1/data-histaware/4-selected-data/{DECADE}/{DECADE}s_{TYPE}_labeled_full_0.95.csv",
    split="train",
)

Using custom data configuration default-89cd6e46ba25f4dd
Reusing dataset csv (/home/leonardovida/.cache/huggingface/datasets/csv/default-89cd6e46ba25f4dd/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [16]:
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. It must contain the key ``"input_ids"``, which
            determines the dataset length.
        labels: Optional per-example labels (list, numpy array, tensor, ...).
            Leave as ``None`` (or pass an empty sequence) for inference-only
            data; the items then carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when a non-empty label sequence was supplied."""
        # An explicit None/length check is used instead of `if self.labels:`
        # because the truth value of a multi-element numpy array or tensor is
        # ambiguous and raises ValueError.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return the features (and label, if available) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [18]:
# Tokenize the "p" column of the dataset, padding the batch and truncating to
# the configured maximum length (the MAX_LENGHT constant from the parameters
# cell) instead of repeating the literal 512.
dataset_tokenized = tokenizer(dataset["p"], padding=True, truncation=True, max_length=MAX_LENGHT)

In [20]:
# Wrap the tokenized inputs in the torch dataset; no labels are supplied
# because this data is only used for prediction.
dataset_prediction = Dataset(encodings=dataset_tokenized, labels=None)

In [27]:
# Wrap the fine-tuned model in a Trainer (default arguments) purely to run
# batched inference.
pred_trainer = Trainer(model)

# `predict` returns a named tuple (predictions, label_ids, metrics). Only the
# predictions are needed; reading the field by name avoids unpacking into `_`,
# which would rebind IPython's last-output variable and stop it from updating.
prediction_output = pred_trainer.predict(dataset_prediction)
y_preds = prediction_output.predictions

In [30]:
# Pick, for every row of the predictions, the column index with the highest
# value (presumably the per-class scores of each example - confirm), then
# display how many examples were predicted.
y_pred = np.asarray(y_preds).argmax(axis=1)
y_pred.shape[0]

4626