In [11]:
import os
# Show the directory from which the fine-tuned checkpoint is loaded.
# NOTE(review): `finetuning_config` is only assigned in the following cell, so
# this cell fails on a fresh kernel unless that cell has been run first.
checkpoint_path_parts = (finetuning_config["save_path"], "finetuned_model")
os.path.join(*checkpoint_path_parts)

'/data/language_models/pretrained_models_downstreaming/stanford_imdb/electra_small_discriminator_document_predictions/finetuned_model'

In [12]:
from datasets import load_dataset
import torch
from finlm.models import ElectraDocumentClassification
from finlm.dataset import FinetuningDocumentDataset
import re
import os
import json

# Location of the JSON file holding the fine-tuning configuration.
# (Despite the variable name, this is the config file, not a model directory.)
finetuning_model_path = "/data/language_models/pretrained_models_downstreaming/stanford_imdb/electra_small_discriminator_document_predictions/finetuning_config.json"
with open(finetuning_model_path, "r", encoding = "utf-8") as file:
    finetuning_config = json.load(file)


def model_loader(model_path, num_labels, classifier_dropout):
    """Load an ElectraDocumentClassification checkpoint.

    Parameters
    ----------
    model_path : path handed to ``ElectraDocumentClassification.from_pretrained``.
    num_labels : forwarded as ``num_labels``.
    classifier_dropout : forwarded as ``classifier_dropout``.

    Returns
    -------
    The object returned by ``from_pretrained``; ``num_sequence_attention_heads``
    is always passed as 1.
    """
    return ElectraDocumentClassification.from_pretrained(
        model_path,
        num_labels = num_labels,
        classifier_dropout = classifier_dropout,
        num_sequence_attention_heads = 1,
    )


# Always define `device`: the cells below call `.to(device)` and previously
# raised a NameError when no GPU was available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print("GPU seems to be unavailable.")
    device = torch.device("cpu")

# Load the dataset
dataset = load_dataset("stanfordnlp/imdb")

# A sentence boundary is one or more spaces that follow '.', '!' or '?'.
SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?]) +')
SHUFFLE_SEED = 42


def _shuffle_and_split(split):
    """Shuffle one dataset split and break every text into sentences.

    Returns a tuple ``(shuffled_split, documents, labels)`` where ``documents``
    holds one list of sentence strings per sample and ``labels`` holds the
    matching ``"label"`` values, both in the shuffled order.
    """
    # datasets must be shuffled, because they are sorted by label
    shuffled = split.shuffle(seed = SHUFFLE_SEED)
    documents, labels = [], []
    for sample in shuffled:
        documents.append(SENTENCE_BOUNDARY.split(sample["text"]))
        labels.append(sample["label"])
    return shuffled, documents, labels


# Split the dataset into training and test data
training_data, training_documents, training_labels = _shuffle_and_split(dataset["train"])
test_data, test_documents, test_labels = _shuffle_and_split(dataset["test"])

training_dataset = FinetuningDocumentDataset(
    documents = training_documents,
    labels = training_labels,
    tokenizer_path = finetuning_config["tokenizer_path"],
    sequence_length = finetuning_config["max_sequence_length"],
)
test_dataset = FinetuningDocumentDataset(
    documents = test_documents,
    labels = test_labels,
    tokenizer_path = finetuning_config["tokenizer_path"],
    sequence_length = finetuning_config["max_sequence_length"],
)

# Load the fine-tuned checkpoint; classifier dropout is set to 0.0 here.
model = model_loader(
    os.path.join(finetuning_config["save_path"], "finetuned_model"),
    finetuning_config["num_labels"],
    0.0
)

In [30]:
from torch.utils.data import DataLoader
from finlm.dataset import collate_fn_fixed_sequences

def collate_fn(samples):
    """Collate samples via collate_fn_fixed_sequences with the configured max_sequences."""
    return collate_fn_fixed_sequences(samples, max_sequences = finetuning_config["max_sequences"])


# One document per batch, unshuffled.
# NOTE(review): this rebinds `training_data`, which the setup cell used for the
# shuffled dataset split; from here on the name refers to the DataLoader.
training_data = DataLoader(
    training_dataset,
    batch_size = 1,
    shuffle = False,
    collate_fn = collate_fn,
)

In [86]:
# Inspect the first batch only. `i` is kept because the cells below use it to
# index `training_documents`.
# NOTE(review): matching batch 0 to training_documents[0] presumes the loader
# yields documents in their original order — confirm against the loader setup.
i = 0
batch = next(iter(training_data))

# Move every tensor the model consumes onto the target device.
inputs = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["label"].to(device)
sequence_mask = batch["sequence_mask"].to(device)

model.to(device)
# Inference only: switch to evaluation mode so that layers such as dropout do
# not make the inspected attention values vary between runs.
model.eval()

with torch.no_grad():
    model_output = model(
        input_ids = inputs,
        attention_mask = attention_mask,
        sequence_mask = sequence_mask,
        labels = labels,
    )

In [87]:
# Take entry [0, 0] of the two leading axes of the returned attentions and sum
# the remaining matrix over its first axis.
# NOTE(review): presumably the axes are (batch, head, query, key), in which case
# the sum runs over every query row, padded ones included — confirm.
first_attention_matrix = model_output.attentions[0, 0, :, :]
attention_aggregate = first_attention_matrix.sum(dim = 0).cpu().numpy()
attention_aggregate

array([3.17128  , 4.6234665, 7.7983274, 2.3347297, 1.9695232, 1.3553371,
       1.2834724, 1.3471363, 1.2095077, 1.414024 , 3.8016944, 1.6915021,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       ], dtype=float32)

In [88]:
import numpy as np
# Positions ordered from the largest to the smallest aggregated attention:
# argsort is ascending, so the result is reversed.
ascending_index = np.argsort(attention_aggregate)
sorted_index = ascending_index[::-1]
attention_aggregate[sorted_index]

array([7.7983274, 4.6234665, 3.8016944, 3.17128  , 2.3347297, 1.9695232,
       1.6915021, 1.414024 , 1.3553371, 1.3471363, 1.2834724, 1.2095077,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       ], dtype=float32)

In [89]:
# Sentences of the document at position `i`, in their original order.
inspected_sentences = training_documents[i]
inspected_sentences

['There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes.',
 'Profiler looks crispy, Fortier looks classic.',
 'Profiler plots are quite simple.',
 "Fortier's plot are far more complicated...",
 'Fortier looks more like Prime Suspect, if we have to spot similarities...',
 'The main character is weak and weirdo, but have "clairvoyance".',
 'People like to compare, to judge, to evaluate.',
 'How about just enjoying?',
 'Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!).',
 "Maybe it's the language, or the spirit, but I think this series is more English than American.",
 'By the way, the actors are really good and funny.',
 'The acting is not superficial at all...']

In [90]:
# List the document's sentences from highest to lowest aggregated attention.
# Only as many leading entries of `sorted_index` as the document has sentences
# are used.
# NOTE(review): presumably the remaining entries are padded positions — confirm.
ranked_positions = sorted_index[:len(training_documents[i])]
[training_documents[i][position] for position in ranked_positions]

['Profiler plots are quite simple.',
 'Profiler looks crispy, Fortier looks classic.',
 'By the way, the actors are really good and funny.',
 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes.',
 "Fortier's plot are far more complicated...",
 'Fortier looks more like Prime Suspect, if we have to spot similarities...',
 'The acting is not superficial at all...',
 "Maybe it's the language, or the spirit, but I think this series is more English than American.",
 'The main character is weak and weirdo, but have "clairvoyance".',
 'How about just enjoying?',
 'People like to compare, to judge, to evaluate.',
 'Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!).']