In [1]:
import argparse
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from small_text.integrations.transformers import TransformerModelArguments
from factories import TransformerBasedClassificationFactory
from preprocess import data_loader, preprocess_data_transformers, df_to_dict
# from learner_functions import run_multiple_experiments
from torch.utils.data import TensorDataset
import numpy as np
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader, Subset


  from .autonotebook import tqdm as notebook_tqdm


In [26]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
            return len(self.labels)
        
    def __getitem__(self, idx):
        
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
        
def _genenrate_val_indices(labels):
    indices_neg_label = np.where(labels == 0)[0]
    indices_pos_label = np.where(labels == 1)[0]
    all_indices = np.concatenate([indices_neg_label, indices_pos_label])
    np.random.shuffle(all_indices)
    x_indices_initial = all_indices.astype(int)
    y_initial = np.array([labels[i] for i in x_indices_initial])
    print(f'Starting imbalance: {np.round(np.mean(y_initial),2)}')
    print('Setting val indices')
    
    return np.concatenate([np.random.choice(indices_pos_label, 
                                                    int(0.1*len(indices_pos_label)),
                                                    replace=False),
                                    np.random.choice(indices_neg_label,
                                                    int(0.1*len(indices_neg_label)),
                                                    replace=False)
                                    ])

def parse_args():
    parser=argparse.ArgumentParser(description="Supervised Learning Experiment Runner with Transformers Integration")
    parser.add_argument('--method', type = str, metavar ="", default = 'SL', help="Supervised == SL or Active == AL")
    parser.add_argument('--framework', type = str, metavar ="", default = 'TF', help="Transformers == TF or SkLearn == SK")
    parser.add_argument('--datadir', type = str, metavar ="",default = './data/', help="Path to directory with data files")
    parser.add_argument('--dataset', type = str, metavar ="",default = 'wiki', help="Name of dataset")
    parser.add_argument('--outdir', type = str, metavar ="",default = './results/', help="Path to output directory for storing results")
    parser.add_argument('--transformer_model', type = str, metavar ="",default = 'distilbert-base-uncased', help="Name of HuggingFace transformer model")
    parser.add_argument( '--n_epochs', type = int, metavar ="",default =  3, help = "Number of epochs for model training")
    parser.add_argument('--class_imbalance', type = int, metavar ="", default = 50, help = 'Class imbalance desired in train dataset')
    parser.add_argument('--train_n', type = int, metavar ="", default = 20000, help = 'Total number of training examples')
    parser.add_argument('--test_n', type = int, metavar ="", default = 5000, help = 'Total number of testing examples')
    parser.add_argument('--run_n', type = int, metavar ="", default = 5, help = 'Number of times to run each model')
    args=parser.parse_args()
    print("the inputs are:")
    for arg in vars(args):
        print("{} is {}".format(arg, getattr(args, arg)))
    return args

In [27]:
args=parse_args()
# Load data
train_df, test_dfs = data_loader(args)

tokenizer = AutoTokenizer.from_pretrained(args.transformer_model, cache_dir='.cache/')    
tokenizer.add_special_tokens({'additional_special_tokens': ["[URL]", "[EMOJI]", "[USER]"]})
train_dict = df_to_dict('train', train_df)
# train_full = preprocess_data_transformers('train',
#                                     tokenizer,
#                                     train_dict['data'],
#                                     train_dict['target'],
#                                     train_dict['weak_target'])

the inputs are:
method is SL
framework is /Users/raymond/Library/Jupyter/runtime/kernel-v2-19591K7hAMpPiQzc4.json
datadir is ./data/
dataset is wiki
outdir is ./results/
transformer_model is distilbert-base-uncased
n_epochs is 3
class_imbalance is 50
train_n is 20000
test_n is 5000
run_n is 5




In [34]:
train_encodings = tokenizer(train_dict['data'], truncation=True, padding=True)

In [37]:
train_full = TextDataset(train_encodings, train_dict['target'])


In [38]:
val_indices = _genenrate_val_indices(train_dict['target'])
indices = np.arange(len(train_dict['target']))
val_mask = np.isin(indices, val_indices)
train_indices = indices[~val_mask]

train_dataset = Subset(train_full, train_indices)
val_dataset = Subset(train_full, val_indices)

Starting imbalance: 0.5
Setting val indices


In [4]:
indices_neg_label = np.where(train_full.y == 0)[0]
indices_pos_label = np.where(train_full.y == 1)[0]
all_indices = np.concatenate([indices_neg_label, indices_pos_label])
np.random.shuffle(all_indices)
x_indices_initial = all_indices.astype(int)
y_initial = np.array([train_full.y[i] for i in x_indices_initial])
print(f'Starting imbalance: {np.round(np.mean(y_initial),2)}')
print('Setting val indices')
val_indices = np.concatenate([np.random.choice(indices_pos_label, 
                                                int(0.1*len(indices_pos_label)),
                                                replace=False),
                                np.random.choice(indices_neg_label,
                                                int(0.1*len(indices_neg_label)),
                                                replace=False)
                                ])
indices = np.arange(x_indices_initial.shape[0])
print(indices.shape)
mask = np.isin(indices, val_indices)
print(mask.shape)

Starting imbalance: 0.5
Setting val indices
(20000,)
(20000,)


In [5]:
train = train_full[indices[~mask]]
valid = train_full[indices[mask]]
train_dataset = TensorDataset(torch.concat(train.x, dim=0), torch.Tensor(train.y))
valid_dataset = TensorDataset(torch.concat(valid.x, dim=0), torch.Tensor(valid.y))


In [39]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)  # Assuming binary classification
training_args = TrainingArguments(
    output_dir="./distilbert_classifier",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=200,
    use_mps_device=True
)

# Step 7: Create Trainer and Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.predictions.argmax(axis=1), p.label_ids)},
)

trainer.train()

# Step 8: Evaluate the model on the test set
results = trainer.evaluate()

print(f"Accuracy on the test set: {results['eval_accuracy']}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/1689 [10:04<?, ?it/s]
  0%|          | 3/1689 [00:36<5:39:35, 12.09s/it]

KeyboardInterrupt: 

In [9]:
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding



texts = ["Your text samples go here.", "Another text sample."]
labels = [0, 1]  # Replace with your actual labels

# Step 2: Tokenize the input texts
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_texts = tokenizer(texts, truncation=True, padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 3: Convert the tokenized texts to PyTorch tensors
input_ids = torch.tensor(tokenized_texts["input_ids"])
attention_mask = torch.tensor(tokenized_texts["attention_mask"])
labels = torch.tensor(labels)

# Step 4: Split the data into training and testing sets
train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)

# Step 5: Define the training and testing datasets
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels)

# Step 6: Define the model and training arguments
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)  # Assuming binary classification
training_args = TrainingArguments(
    output_dir="./distilbert_classifier",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=200,
)

# Step 7: Create Trainer and Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.predictions.argmax(axis=1), p.label_ids)},
)

trainer.train()

# Step 8: Evaluate the model on the test set
results = trainer.evaluate()

print(f"Accuracy on the test set: {results['eval_accuracy']}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


AttributeError: 'list' object has no attribute 'keys'

In [18]:
# import torch
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Step 1: Load your own data (replace this with your data loading code)
# For this example, I'm assuming you have a list of text samples (texts) and corresponding labels (labels).
# Adjust accordingly based on your dataset structure.
texts = ["Your text samples go here.", "Another text sample."]
labels = [0, 1]  # Replace with your actual labels (e.g., 0 for class 0, 1 for class 1, etc.)

# Step 2: Tokenize the input texts
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_texts = tokenizer(texts, truncation=True, padding=True)

# Step 3: Convert the tokenized texts to PyTorch tensors
input_ids = torch.tensor(tokenized_texts["input_ids"])
attention_mask = torch.tensor(tokenized_texts["attention_mask"])
labels = torch.tensor(labels)

# Step 4: Split the data into training and testing sets
train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)

# Step 5: Create DataLoader for training and testing sets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)


train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 6: Initialize DistilBERT for Sequence Classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Step 7: Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Step 8: Training loop
epochs = 3
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = 'mps'
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, inputs["labels"])
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

# Step 9: Evaluate on the test set
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch[2].cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f"Accuracy on the test set: {accuracy}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

[A
Epoch 1: 100%|██████████| 1/1 [00:00<00:00,  8.33it/s]


Epoch 1, Average Loss: 0.6496270298957825



[A
Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  8.89it/s]


Epoch 2, Average Loss: 0.5407878756523132



Epoch 3: 100%|██████████| 1/1 [00:00<00:00, 10.60it/s]


Epoch 3, Average Loss: 0.40346309542655945



Evaluating: 100%|██████████| 1/1 [00:00<00:00, 30.92it/s]

Accuracy on the test set: 0.0





In [17]:
batch

(tensor([[ 101, 2178, 3793, 7099, 1012,  102,    0,    0]], device='mps:0'),
 tensor([[1, 1, 1, 1, 1, 1, 0, 0]], device='mps:0'),
 tensor([1], device='mps:0'))