In [None]:
import re
from collections import Counter

import torch
from avalanche.benchmarks import nc_benchmark
from datasets import load_dataset
from torch.utils.data import DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification


# Load the 20 Newsgroups dataset ("SetFit/20_newsgroups") via `datasets.load_dataset`.
# The resulting object is displayed in the next cell.
dataset = load_dataset("SetFit/20_newsgroups")



In [None]:
# Display the loaded object (splits, column names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 7532
    })
})

In [None]:
# Assuming dataset is a DatasetDict containing train and test datasets.
# Count how many examples carry each label in each split. Reading the
# 'label' column directly avoids building a full example dict (text included)
# for every row just to look at one field.
train_label_counts = Counter(dataset['train']['label'])
test_label_counts = Counter(dataset['test']['label'])

# Sort by label number so the two listings line up.
sorted_train_label_counts = sorted(train_label_counts.items(), key=lambda item: item[0])
sorted_test_label_counts = sorted(test_label_counts.items(), key=lambda item: item[0])

# Print both distributions with the same formatting.
for heading, sorted_counts in (
    ("Train Label Distribution:", sorted_train_label_counts),
    ("\nTest Label Distribution:", sorted_test_label_counts),
):
    print(heading)
    for label, count in sorted_counts:
        print(f"Label {label}: {count} samples")



Train Label Distribution:
Label 0: 480 samples
Label 1: 584 samples
Label 2: 591 samples
Label 3: 590 samples
Label 4: 578 samples
Label 5: 593 samples
Label 6: 585 samples
Label 7: 594 samples
Label 8: 598 samples
Label 9: 597 samples
Label 10: 600 samples
Label 11: 595 samples
Label 12: 591 samples
Label 13: 594 samples
Label 14: 593 samples
Label 15: 599 samples
Label 16: 546 samples
Label 17: 564 samples
Label 18: 465 samples
Label 19: 377 samples

Test Label Distribution:
Label 0: 319 samples
Label 1: 389 samples
Label 2: 394 samples
Label 3: 392 samples
Label 4: 385 samples
Label 5: 395 samples
Label 6: 390 samples
Label 7: 396 samples
Label 8: 398 samples
Label 9: 397 samples
Label 10: 399 samples
Label 11: 396 samples
Label 12: 393 samples
Label 13: 396 samples
Label 14: 394 samples
Label 15: 398 samples
Label 16: 364 samples
Label 17: 376 samples
Label 18: 310 samples
Label 19: 251 samples


In [None]:
# NOTE: despite the variable name, this is the train example at index 2, not index 0.
first_train_text = dataset['train']['text'][2]
print(first_train_text)

well folks, my mac plus finally gave up the ghost this weekend after
starting life as a 512k way back in 1985.  sooo, i'm in the market for a
new machine a bit sooner than i intended to be...

i'm looking into picking up a powerbook 160 or maybe 180 and have a bunch
of questions that (hopefully) somebody can answer:

* does anybody know any dirt on when the next round of powerbook
introductions are expected?  i'd heard the 185c was supposed to make an
appearence "this summer" but haven't heard anymore on it - and since i
don't have access to macleak, i was wondering if anybody out there had
more info...

* has anybody heard rumors about price drops to the powerbook line like the
ones the duo's just went through recently?

* what's the impression of the display on the 180?  i could probably swing
a 180 if i got the 80Mb disk rather than the 120, but i don't really have
a feel for how much "better" the display is (yea, it looks great in the
store, but is that all "wow" or is it really th

In [None]:
# Define function for text preprocessing
def preprocess_text(text):
    """Strip URLs, e-mail addresses and punctuation, then normalise whitespace.

    The substitutions are applied in order:
      1. anything starting with ``http`` up to the next whitespace is removed;
      2. any whitespace-delimited run containing ``@`` (plus one trailing
         whitespace character, if present) is removed;
      3. every character that is not an ASCII letter, digit or whitespace is
         removed;
      4. runs of whitespace are collapsed to a single space.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    substitutions = (
        (r'http\S+', ''),          # URLs
        (r'\S*@\S*\s?', ''),       # email IDs
        (r'[^a-zA-Z0-9\s]', ''),   # special characters
        (r'\s+', ' '),             # extra whitespace
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def _clean_example(example):
    """Build the replacement 'text' value for a single dataset example."""
    return {'text': preprocess_text(example['text'])}


# Apply preprocessing to every split of the dataset
for split in dataset.keys():
    cleaned_split = dataset[split].map(_clean_example)
    dataset[split] = cleaned_split

# Example of accessing preprocessed data
print(dataset['train']['text'][2])


well folks my mac plus finally gave up the ghost this weekend after starting life as a 512k way back in 1985 sooo im in the market for a new machine a bit sooner than i intended to be im looking into picking up a powerbook 160 or maybe 180 and have a bunch of questions that hopefully somebody can answer does anybody know any dirt on when the next round of powerbook introductions are expected id heard the 185c was supposed to make an appearence this summer but havent heard anymore on it and since i dont have access to macleak i was wondering if anybody out there had more info has anybody heard rumors about price drops to the powerbook line like the ones the duos just went through recently whats the impression of the display on the 180 i could probably swing a 180 if i got the 80Mb disk rather than the 120 but i dont really have a feel for how much better the display is yea it looks great in the store but is that all wow or is it really that good could i solicit some opinions of people w

In [None]:
from torch.utils.data.dataset import TensorDataset
from avalanche.benchmarks.utils import AvalancheDataset

# Step 1: pull the text and label columns out of each split as plain lists
train_split, test_split = dataset['train'], dataset['test']

texts_train, labels_train = list(train_split['text']), list(train_split['label'])
texts_test, labels_test = list(test_split['text']), list(test_split['label'])


In [None]:
def load_vocab(vocab_file):
    """Load a one-token-per-line vocabulary file into a ``token -> index`` dict.

    The index of a token is its zero-based line number in the file.

    Args:
        vocab_file: Path to a UTF-8 text file with one token per line.

    Returns:
        dict mapping each token string to its line index.

    Raises:
        OSError: If the file cannot be opened.
    """
    vocab = {}
    with open(vocab_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of reading every line into a list first.
        for index, line in enumerate(f):
            # Only drop the line terminator: a full strip() would also eat
            # tokens that are (or contain leading/trailing) whitespace
            # characters, making distinct vocabulary entries collide.
            vocab[line.rstrip('\n')] = index
    return vocab

# Relative path: the file is looked up in the notebook's working directory.
# presumably this is the bert-base-uncased vocabulary, since the tokenizer built
# from it is later compared against that model's tokenizer — TODO confirm
vocab_file = 'vocab.txt'
vocab = load_vocab(vocab_file)

In [None]:
class CustomTokenizer:
    """Minimal WordPiece-style tokenizer driven by a ``token -> id`` vocabulary.

    Text is lower-cased, split on whitespace, and each word is broken into
    sub-word pieces by greedy longest-match-first lookup in ``vocab``.
    Every piece after the first piece of a word is looked up (and emitted)
    with the ``##`` continuation prefix.

    Args:
        vocab: Mapping from token string to integer id.
        max_length: Maximum number of ids in an encoded sequence, counting
            the leading ``[CLS]`` and trailing ``[SEP]``.
    """

    def __init__(self, vocab, max_length):
        self.vocab = vocab
        self.max_length = max_length
        self.init_special_tokens()

    def init_special_tokens(self):
        """Resolve special-token ids from the vocab, with fixed fallbacks if a token is absent."""
        self.cls_token_id = self.vocab.get("[CLS]", 101)
        self.sep_token_id = self.vocab.get("[SEP]", 102)
        self.unk_token_id = self.vocab.get("[UNK]", 100)
        self.pad_token_id = self.vocab.get("[PAD]", 0)

    def tokenize(self, text):
        """Split ``text`` into vocabulary tokens.

        Args:
            text: Input string.

        Returns:
            List of token strings; a word that cannot be fully decomposed
            contributes a single ``'[UNK]'``.
        """
        # Convert text to lowercase for uncased model
        text = text.lower()
        tokens = []
        for word in text.split():
            word_tokens = self.tokenize_word(word)
            if not word_tokens:
                tokens.append('[UNK]')
            else:
                tokens.extend(word_tokens)
        return tokens

    def tokenize_word(self, word):
        """Break ``word`` into the longest possible sub-words known to the vocab.

        Args:
            word: A single whitespace-free, already lower-cased word.

        Returns:
            List of pieces (continuations carry the ``##`` prefix), or an
            empty list when some part of the word has no matching piece, in
            which case the caller substitutes ``[UNK]``.
        """
        if word in self.vocab:
            return [word]

        subwords = []
        start = 0
        while start < len(word):
            # Every piece after the first is a continuation and must be
            # matched against the '##'-prefixed vocabulary entries, even if
            # the bare suffix also exists as a stand-alone word.
            prefix = '##' if start > 0 else ''
            piece = self.find_longest_subword(word[start:], prefix)
            if piece is None:
                return []  # No piece fits here; the whole word becomes [UNK]
            subwords.append(prefix + piece)
            # Advance by the consumed characters only; the prefix is never
            # part of the remaining text, so the loop always makes progress.
            start += len(piece)
        return subwords

    def find_longest_subword(self, word, prefix=''):
        """Return the longest leading slice of ``word`` whose ``prefix + slice`` is in the vocab.

        Args:
            word: Text to match from its first character.
            prefix: String prepended to each candidate before the vocab
                lookup (``'##'`` for continuation pieces). Defaults to no
                prefix.

        Returns:
            The matched slice of ``word`` (without ``prefix``), or ``None``
            if no leading slice matches.
        """
        for end in range(len(word), 0, -1):
            candidate = word[:end]
            if prefix + candidate in self.vocab:
                return candidate
        return None  # No subword found

    def encode(self, text):
        """Convert ``text`` to a list of ids wrapped in ``[CLS]`` ... ``[SEP]``.

        The result is truncated to ``max_length`` but not padded.
        """
        tokens = self.tokenize(text)
        token_ids = [self.vocab.get(token, self.unk_token_id) for token in tokens]
        token_ids = [self.cls_token_id] + token_ids + [self.sep_token_id]  # Add special tokens
        return self.pad_or_truncate(token_ids)

    def pad_or_truncate(self, token_ids):
        """Truncate the sequence to ``max_length``; shorter sequences are returned unpadded."""
        # Truncate if the sequence is too long
        if len(token_ids) > self.max_length:
            token_ids = token_ids[:self.max_length]
            # Ensure that the last token is always [SEP] if truncated
            token_ids[-1] = self.sep_token_id
        # No padding is added if the sequence is shorter than max_length
        return token_ids

    def __call__(self, texts, return_tensors='pt'):
        """Encode a batch of texts.

        Args:
            texts: Iterable of strings. A single string is treated as a
                batch of one rather than being iterated character by
                character.
            return_tensors: ``'pt'`` to get a 2-D ``torch`` tensor, padded on
                the right with the pad id to the longest sequence in the
                batch; any other value returns the unpadded lists of ids.

        Returns:
            A tensor of shape ``(len(texts), longest_sequence)`` or a list of
            id lists, depending on ``return_tensors``.
        """
        if isinstance(texts, str):
            texts = [texts]
        all_encoded_ids = [self.encode(text) for text in texts]
        if return_tensors != 'pt':
            return all_encoded_ids

        import torch

        # Sequences have different lengths; torch.tensor needs a rectangular
        # list, so right-pad each one to the longest in this batch.
        longest = max((len(ids) for ids in all_encoded_ids), default=0)
        padded_ids = [
            ids + [self.pad_token_id] * (longest - len(ids))
            for ids in all_encoded_ids
        ]
        return torch.tensor(padded_ids)


In [None]:
max_length = 512  # Maximum sequence length (ids per sequence, special tokens included)
custom_tokenizer = CustomTokenizer(vocab, max_length)

# Reference tokenizer to compare the custom implementation against.
# BertTokenizer is already imported in the notebook's first cell.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_example = texts_train[4]

# Encode the same single example with both tokenizers.
custom_encoded_text = custom_tokenizer([text_example], return_tensors='pt')

# truncation=True is needed for max_length to be enforced; without it the
# reference output is not cut to the same limit as the custom tokenizer's.
bert_encoded_text = bert_tokenizer(
    [text_example],
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
)

In [None]:
# Show the raw (preprocessed) text that both tokenizers were given.
text_example



In [None]:
# Token ids produced by CustomTokenizer for text_example.
print("Custom Tokenizer Encoded Text:\n", custom_encoded_text)

Custom Tokenizer Encoded Text:
 tensor([[  101,  2013,  3720,  2011,  3419,  1037,  6243,  2026,  4824,  2003,
          2008,  1996,  3517, 10697,  2024, 10468,  2124, 12883,  1999,  1996,
          5432,  2291,  4007,  2477,  2024,  7039,  2008,  2123,  1056,  2031,
          1996,  2157,  5300,  1999,  2664,  2138,  2027,  4995,  1056,  2275,
          6229,  2044,  4888,  1998,  2107,  2066,  2738,  2084,  8081,  1996,
          3642,  1998,  4298,  8970,  2047, 12883,  2027,  2074,  2425,  1996,
          3626,  7929,  2065,  2017,  2156,  1037,  5432,  2053, 19883,  2077,
          6336,  2125,  8568,  2009,   102]])


In [None]:
# Token ids produced by the reference BertTokenizer for the same text.
print("\nBERT Tokenizer Encoded Text (input_ids):\n", bert_encoded_text['input_ids'])


BERT Tokenizer Encoded Text (input_ids):
 tensor([[  101,  2013,  3720,  2011,  3419,  1037,  6243,  2026,  4824,  2003,
          2008,  1996,  3517, 10697,  2024, 10468,  2124, 12883,  1999,  1996,
          5432,  2291,  4007,  2477,  2024,  7039,  2008,  2123,  2102,  2031,
          1996,  2157,  5300,  1999,  2664,  2138,  2027,  4995,  2102,  2275,
          6229,  2044,  4888,  1998,  2107, 10359,  2738,  2084,  8081,  1996,
          3642,  1998,  4298,  8970,  2047, 12883,  2027,  2074,  2425,  1996,
          3626,  7929,  2065,  2017,  2156,  1037,  5432,  2053, 19883,  2077,
          6336,  7245,  8568,  2009,   102]])


In [None]:
# Tokenize text data.
# Encoded texts have different lengths, so they cannot be stacked into one
# tensor directly: request plain id lists and right-pad them explicitly.
def _encode_padded(texts):
    """Encode ``texts`` with ``custom_tokenizer`` and pad to the longest sequence."""
    sequences = custom_tokenizer(texts, return_tensors=None)
    return torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(ids, dtype=torch.long) for ids in sequences],
        batch_first=True,  # result is (num_texts, longest_sequence)
        padding_value=custom_tokenizer.pad_token_id,
    )


encoded_texts_train = _encode_padded(texts_train)
encoded_texts_test = _encode_padded(texts_test)

# Convert labels to tensors
labels_train_tensor = torch.tensor(labels_train)
labels_test_tensor = torch.tensor(labels_test)

# Create TensorDataset: each item is an (input_ids, label) pair
train_dataset = TensorDataset(encoded_texts_train, labels_train_tensor)
test_dataset = TensorDataset(encoded_texts_test, labels_test_tensor)

KeyboardInterrupt: 

In [None]:
# from transformers import BertTokenizer


# # Step 2: Tokenize text data
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# encoded_texts_train = tokenizer(texts_train, padding=True, truncation=True, return_tensors='pt')
# encoded_texts_test = tokenizer(texts_test, padding=True, truncation=True, return_tensors='pt')

# # Step 3: Convert labels to tensors
# labels_train_tensor = torch.tensor(labels_train)
# labels_test_tensor = torch.tensor(labels_test)

# # Step 4: Create TensorDataset
# train_dataset = TensorDataset(encoded_texts_train['input_ids'], encoded_texts_train['attention_mask'], labels_train_tensor)
# test_dataset = TensorDataset(encoded_texts_test['input_ids'], encoded_texts_test['attention_mask'], labels_test_tensor)


In [None]:
# Wrap the tensor datasets in AvalancheDataset; these are the objects handed to
# nc_benchmark further down.
avl_train_data = AvalancheDataset(train_dataset)
avl_test_data = AvalancheDataset(test_dataset)

In [None]:
# Attach the per-example class labels as a `targets` attribute.
# presumably nc_benchmark reads `targets` to assign classes to experiences — TODO confirm
avl_train_data.targets = labels_train
avl_test_data.targets = labels_test

In [None]:
# Build a "new classes" benchmark: the classes are partitioned across the
# experiences, and the same partition is applied to train and test data.
benchmark = nc_benchmark(
    test_dataset=avl_test_data,  # Your Avalanche dataset
    train_dataset=avl_train_data,
    n_experiences=5,  # Number of experiences
    task_labels=True,  # Give each experience its own task label
    seed=1234,  # Fix the class-to-experience assignment so re-runs are reproducible
)


In [None]:
# Inspect the first experience of the training stream.
train_stream = benchmark.train_stream
experience = train_stream[0]

# task label and dataset are the main attributes
t_label = experience.task_label
# NOTE(review): this rebinds `dataset`, which earlier cells use for the Hugging Face
# DatasetDict; re-running those cells after this one will see the experience's
# dataset instead.
dataset = experience.dataset

# but you can recover additional info
print(experience.current_experience)
print(experience.classes_in_this_experience)
print(experience.classes_seen_so_far)
print(experience.previous_classes)
print(experience.future_classes)
print(experience.origin_stream)
print(experience.benchmark)