In [1]:
from datasets import load_dataset

# Load the "setfit/20_newsgroups" dataset; later cells read the 'text' and
# 'label' columns of its 'train' and 'test' splits.
dataset = load_dataset(path="setfit/20_newsgroups")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
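# NOTE: everything in this cell is a disabled earlier draft of the preprocessing,
# kept for reference only; the same cleaning steps live in TextDataset.preprocess_text.
# import re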

# # Load tokenizer
# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# def preprocess_text(text):
#     # Lowercase the text
#     text = text.lower()
#     # Remove URLs
#     text = re.sub(r'http\S+|www.\S+', ' ', text)
#     # Remove emails
#     text = re.sub(r'\S*@\S*\s?', ' ', text)
#     # Remove special characters (keeping letters, numbers, and basic punctuation)
#     text = re.sub(r'[^a-z0-9,.!? ]', ' ', text)
#     return text

# def preprocess_data(examples):
#     # Apply text cleaning and preprocessing
#     examples["text"] = [preprocess_text(text) for text in examples["text"]]
#     # Tokenize the texts
#     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

# # Preprocess the dataset
# tokenized_datasets = dataset.map(preprocess_data, batched=True)

In [2]:
import torch
from torch.utils.data import Dataset
from avalanche.benchmarks.utils import AvalancheDataset
from transformers import BertTokenizer
import re

class TextDataset(Dataset):
    """Map-style dataset that yields ``(encoded_text, label)`` pairs.

    Nothing is tokenized up front: each text is cleaned with
    :meth:`preprocess_text` and tokenized when it is indexed.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        """
        Args:
            texts (list of str): List of text samples.
            labels (list of int): List of labels corresponding to the text samples.
            tokenizer (callable, optional): Tokenizer called as
                ``tokenizer(text, return_tensors='pt', padding='max_length',
                truncation=True, max_length=...)``. When omitted, the
                'bert-base-uncased' ``BertTokenizer`` is loaded. Passing one
                in lets several datasets share a single tokenizer instead of
                each loading its own copy.
            max_length (int, optional): Length every sample is padded or
                truncated to. Defaults to 512.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail at construction time rather than with an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    @staticmethod
    def preprocess_text(text):
        """Lowercase ``text`` and blank out URLs, e-mail addresses and symbols.

        Every removed span is replaced by a single space, so runs of spaces
        can appear in the result.

        Args:
            text (str): Raw text sample.

        Returns:
            str: Text containing only ``a-z``, ``0-9``, ``,.!?`` and spaces.
        """
        # Lowercase the text
        text = text.lower()
        # Remove URLs. The dot after "www" is escaped: unescaped it matched any
        # character (even a space), so a bare "www" swallowed the next word.
        text = re.sub(r'http\S+|www\.\S+', ' ', text)
        # Remove emails
        text = re.sub(r'\S*@\S*\s?', ' ', text)
        # Remove special characters (keeping letters, numbers, and basic punctuation)
        text = re.sub(r'[^a-z0-9,.!? ]', ' ', text)
        return text

    def __getitem__(self, idx):
        """Return the tokenized text and the label stored at position ``idx``.

        Returns:
            tuple: ``(encoded_text, label)`` where ``encoded_text`` is whatever
            the tokenizer returns for the cleaned text.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Clean first, then pad/truncate to a fixed length so samples can be batched.
        text = TextDataset.preprocess_text(text)
        # NOTE(review): with return_tensors='pt' and a single string, the recorded
        # notebook output shows 2-D tensors (a leading batch dimension of 1).
        # Left unchanged because downstream code may rely on that shape.
        encoded_text = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

        return encoded_text, label


# Raw text and label columns of each split.
train_text, train_labels = dataset['train']['text'], dataset['train']['label']
test_text, test_labels = dataset['test']['text'], dataset['test']['label']

# One TextDataset per split.
train_data, test_data = (
    TextDataset(train_text, train_labels),
    TextDataset(test_text, test_labels),
)

# Wrap each split for Avalanche and attach its labels as `targets`.
# NOTE(review): the recorded output of this cell shows a warning raised at both
# AvalancheDataset(...) calls — check which constructor the installed Avalanche
# version recommends.
avl_train_data = AvalancheDataset(train_data)
avl_train_data.targets = train_labels

avl_test_data = AvalancheDataset(test_data)
avl_test_data.targets = test_labels

  warn(
  avl_train_data = AvalancheDataset(train_data)
  avl_test_data = AvalancheDataset(test_data)


In [3]:
from avalanche.benchmarks.generators import nc_benchmark


# Seed for the class-to-experience shuffle. Without it, nc_benchmark assigns the
# classes to experiences in a different random order on every run, so results
# are not reproducible.
BENCHMARK_SEED = 42

# Split the classes over 5 experiences, each introducing new classes.
# NOTE(review): task_labels=True attaches a task label to every experience; if a
# class-incremental setup (no task identity) is intended, use task_labels=False
# — TODO confirm.
benchmark = nc_benchmark(
    train_dataset=avl_train_data,
    test_dataset=avl_test_data,  # Your Avalanche dataset
    n_experiences=5,  # Number of experiences
    task_labels=True,  # Indicate that you have task labels for each experience
    seed=BENCHMARK_SEED,  # Fixes the random class order across runs
)

# Take the first training experience as a sample to inspect.
train_stream = benchmark.train_stream
experience = train_stream[0]

# Its two main attributes: the task label and the dataset.
# NOTE(review): this rebinds `dataset`, which the first cell set to the result of
# load_dataset(...); re-running the TextDataset cell after this one would read
# the experience dataset instead. Consider a distinct name.
t_label, dataset = experience.task_label, experience.dataset

# Additional bookkeeping exposed by the experience, printed one value per line.
for attr_name in (
    "current_experience",
    "classes_in_this_experience",
    "classes_seen_so_far",
    "previous_classes",
    "future_classes",
    "origin_stream",
    "benchmark",
):
    print(getattr(experience, attr_name))

0
[9, 2, 3, 13]
[9, 2, 3, 13]
[]
[0, 1, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19]
<avalanche.benchmarks.scenarios.new_classes.nc_scenario.NCStream object at 0x000001DA2DEBFFD0>
<avalanche.benchmarks.scenarios.new_classes.nc_scenario.NCScenario object at 0x000001DA7ECD28C0>


In [7]:
# Peek at the first training experience only: the loop stops after one pass.
for exp in train_stream:
    current = exp.dataset
    # Classes of this experience, a blank separator line, then the sample count.
    print(exp.classes_in_this_experience, "", len(current), sep="\n")
    # NOTE(review): 765 is a hard-coded sample index; it must stay below
    # len(current), and printing the sample dumps its full padded tensors.
    print(current[765])
    break

[0, 9, 1, 15]

2260
[{'input_ids': tensor([[  101,  1045,  2069,  3236,  1996,  5725,  2203,  1997,  2023,  2028,
          2006, 10978,  1012,  2515,  3087,  2031,  1037,  3189,  1029,  2298,
          2012,  2035,  2008,  5572,  2140,   999,   999,   999,   999,  1038,
         19738,  2232,   999,   999,   999,   999,   999,   999,   999,   999,
           999,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  