In [1]:
import pandas as pd
import numpy as np

from itertools import product

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

from models.Wd_Xlm_T import Wd_Xlm_T
from models.TweetDataset import TweetDataset

import transformers 

from utils.utils import (load_test_data, target_features, numerical_features,
                         categorical_features, features, MAX_LEN, create_dataset, get_results_df)

In [2]:
# Directory handed to load_test_data() to locate the test set.
DATA_PATH = "./data/"
# Prefix under which fine-tuned model checkpoints are stored.
CHECKPOINT_PATH = "./checkpoints/"

# Number of samples per DataLoader batch during inference.
BATCH_SIZE = 32

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [3]:
# Load the test set through the project helper.
test_df = load_test_data(DATA_PATH)

# Ground-truth matrix: one row per test sample, one column per entry of
# target_features, as a plain NumPy array for the metric computation.
ground_truths = test_df.loc[:, target_features].to_numpy()

In [4]:
# Both the config and the weights come from the same saved checkpoint
# directory, so build the path once.
checkpoint_dir = CHECKPOINT_PATH + "best_valid_loss"

# from_pretrained is a classmethod: call it on the class rather than on a
# throwaway default-constructed XLMRobertaConfig() instance.
config = transformers.XLMRobertaConfig.from_pretrained(checkpoint_dir)

# dim_features is the count of extra input features fed alongside the text;
# `features` here must be the feature-name list imported from utils.utils.
model = Wd_Xlm_T.from_pretrained(
    checkpoint_dir,
    config=config,
    dim_features=len(numerical_features + features),
    dim_hidden=[512, 256],
)

# Module.to() moves the parameters in place and returns the module, so the
# cell also displays the model architecture.
model.to(DEVICE)

Wd_Xlm_T(
  (roberta): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [5]:
# Tokenizer is loaded by model id (not from the local checkpoint directory).
tokenizer_name = "cardiffnlp/twitter-xlm-roberta-base"
tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(tokenizer_name)

# Wrap the test frame and tokenizer into the dataset consumed by the DataLoader.
dataset = create_dataset(test_df, tokenizer)

In [10]:
# Switch to evaluation mode (e.g. turns off the Dropout layers) before scoring.
model.eval()

# One CPU tensor of sigmoid scores per batch, in dataset order.
predictions = []

# shuffle=False / drop_last=False keep every sample, in order, so the
# predictions can later be compared row-by-row with the ground truths.
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE,
                        shuffle=False, drop_last=False)

# Inference only: disable autograd so no graph is built per batch
# (saves memory and time; outputs are unchanged).
with torch.no_grad():
    for data in tqdm(dataloader):
        input_ids = data['input_ids'].to(DEVICE)
        attention_mask = data['attention_mask'].to(DEVICE)
        # Named batch_features so the `features` list imported from
        # utils.utils is not overwritten by a tensor.
        batch_features = data['features'].to(DEVICE)

        logits = model(input_ids, attention_mask, batch_features)
        # Sigmoid maps each logit to an independent score in (0, 1);
        # move to CPU right away so GPU memory does not accumulate.
        predictions.append(torch.sigmoid(logits).cpu())

  0%|          | 0/87046 [00:00<?, ?it/s]

In [15]:
# Convert the per-batch CPU tensors to NumPy arrays.
# The isinstance guard makes this cell idempotent: on a re-run the entries are
# already ndarrays (which have no .numpy()) and are passed through unchanged.
predictions = [
    prediction.numpy() if isinstance(prediction, torch.Tensor) else prediction
    for prediction in predictions
]

In [19]:
# Join the per-batch arrays along the first (sample) axis into one matrix;
# np.concatenate uses axis 0 by default.
prediction_arr = np.concatenate(predictions)
print(prediction_arr.shape)

(1392727, 4)


In [21]:
# Metrics are computed row-by-row against the ground truths, so a differing
# row count would mean silently misaligned scores. Fail loudly instead.
assert len(prediction_arr) == len(ground_truths), (
    f"{len(prediction_arr)} predictions vs {len(ground_truths)} ground-truth rows"
)

# Per-target evaluation table built by the project helper.
results_df = get_results_df(prediction_arr, ground_truths)

results_df

Unnamed: 0,reply,retweet,retweet_comment,like
rce,13.75169,18.732629,3.144279,13.996705
avg_prec,0.142608,0.404583,0.014493,0.669621
