In [1]:
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

# by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

# decoder attention type can't be changed & will be "original_full"
# you can change `attention_type` (encoder only) to full attention like this:
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv", attention_type="original_full")

# you can change `block_size` & `num_random_blocks` like this:
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv", block_size=16, num_random_blocks=2)

text = "Replace me by any text you'd like."
inputs = tokenizer(text, return_tensors='pt')
prediction = model.generate(**inputs)
prediction = tokenizer.batch_decode(prediction)

Attention type 'block_sparse' is not possible if sequence_length: 11 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 144 with config.block_size = 16, config.num_random_blocks = 2.Changing attention type to 'original_full'...


In [2]:
prediction

['<s> we present a new method for the generation of non - abelian gauge fields from a non - abelian gauge field.<n> we show that the non - abelian gauge field can be efficiently generated from a non - abelian gauge field.<n> the method is based on the use of a non - linear coupling between the non - abelian and the abelian gauge field.<n> the non - abelian gauge field can be efficiently generated from a non - abelian gauge field. <n> @xmath0 non - abelian gauge field.<n> @xmath1 non - abelian gauge field.<n> @xmath2 non - abelian gauge field.<n> @xmath3 non - abelian gauge field.<n> @xmath4 non - abelian gauge field.<n> @xmath5 non - abelian gauge field.<n> @xmath6 non - abelian gauge field.<n> @xmath7 non - abelian gauge field.<n> @xmath8 non - abelian gauge field.<n> @xmath9 non - abelian gauge field.<n> @xmath10 non']

In [9]:
import torch

from datasets import load_dataset

train_dataset = load_dataset('csv', data_files={'train':'ArxivData/train.csv', 'val' : 'ArxivData/val.csv'})
#val_dataset = load_dataset('csv', data_files='ArxivData/val.csv')
#test_dataset = load_dataset('csv', data_files='ArxivData/test.csv')

Using custom data configuration default-4f4b4b5db5d9e3d9


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\user\.cache\huggingface\datasets\csv\default-4f4b4b5db5d9e3d9\0.0.0\e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:\Users\user\.cache\huggingface\datasets\csv\default-4f4b4b5db5d9e3d9\0.0.0\e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.


In [13]:
train_dataset['train'][0]

{'abstract': 'we consider the problem of utility maximization for investors with power utility functions. building on the earlier work larsen et al. (2016), we prove that the value of the problem is a frechet-differentiable function of the drift of the price process, provided that this drift lies in a suitable banach space.   we then study optimal investment problems with non-markovian driving processes. in such models there is no hope to get a formula for the achievable maximal utility. applying results of the first part of the paper we provide first order expansions for certain problems involving fractional brownian motion either in the drift or in the volatility. we also point out how asymptotic results can be derived for models with strong mean reversion.',
 'title': 'on optimal investment with processes of long or negative memory'}

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./arxivOutput',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset["train"],         # training dataset
    eval_dataset=train_dataset['val']             # evaluation dataset
)

trainer.train()

IndexError: Invalid key: 60664 is out of bounds for size 0

In [23]:
import pandas as pd

train_df = pd.read_csv('ArxivData/train.csv')
val_df = pd.read_csv('ArxivData/val.csv')
test_df = pd.read_csv('ArxivData/test.csv')

In [21]:
train_encodings = tokenizer(list(train_df['abstract']), truncation=True, padding=True)

In [None]:
train_encodings = tokenizer(list(train_df['abstract']), truncation=True, padding=True)
train_labels_encoding = tokenizer(list(train_df['title']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_df['abstract']), truncation=True, padding=True)
val_labels_encodings = tokenizer(list(val_df['title']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_df['abstract']), truncation=True, padding=True)

In [None]:
import torch

class ArxivDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ArxivDataset(train_encodings, train_labels)
val_dataset = ArxivDataset(val_encodings, val_labels)
#test_dataset = ArxivDataset(test_encodings, test_labels)

In [17]:
train_df

Unnamed: 0,abstract,title
0,we consider the problem of utility maximizatio...,on optimal investment with processes of long o...
1,in this paper we provide an explicit formula f...,boolean complexes for ferrers graphs
2,"kinesin-5, also known as eg5 in vertebrates is...",relative velocity of sliding of microtubules b...
3,we discuss the transition paths in a coupled b...,bifurcation of transition paths induced by cou...
4,two types of room temperature detectors of ter...,all-electric detectors of the polarization sta...
...,...,...
124995,scaling temporal dynamics in functional mri (f...,scale-free and multifractal time dynamics of f...
124996,in order to understand the role of space in ec...,the role of space in the exploitation of resou...
124997,we perform differential expression analysis of...,bnp-seq: bayesian nonparametric differential e...
124998,"in this short note, we first present a simple ...",a simple bijection between binary trees and co...
