In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,  pipeline


In [3]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
print_gpu_utilization()

GPU memory occupied: 2308 MB.


In [4]:
bert_link = 'bert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(bert_link)
print_gpu_utilization()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

GPU memory occupied: 2733 MB.


In [7]:
import pandas as pd
from datasets import Dataset, DatasetDict


train_dataset = pd.read_csv('Fakeddit/train1.tsv', sep = '\t')
validate_dataset = pd.read_csv('Fakeddit/all_validate.tsv', sep = '\t')
training_featues = train_dataset[['title', '2_way_label']].dropna()[:10000]
valid_feautres = validate_dataset[['title', '2_way_label']].dropna()

training_featues_dataset = Dataset.from_pandas(training_featues)
training_featues_dataset = training_featues_dataset.rename_column('2_way_label', 'labels')
training_featues_dataset = training_featues_dataset.rename_column('title', 'text')


valid_feautres_dataset = Dataset.from_pandas(valid_feautres)
valid_feautres_dataset = valid_feautres_dataset.rename_column('2_way_label', 'labels')
valid_feautres_dataset = valid_feautres_dataset.rename_column('title', 'text')


dataset_dict = DatasetDict({'train': training_featues_dataset, 'validation': valid_feautres_dataset})



In [8]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset_dict.map(tokenize_function, batched=True)
tokenized_dataset =tokenized_dataset.remove_columns(['text'])
tokenized_dataset.with_format("torch")



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/84721 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 84721
    })
})

In [10]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
import torch


torch.ones((1, 1)).to("cuda")
print_gpu_utilization()


GPU memory occupied: 2717 MB.


In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer")

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
   )


In [12]:
result = trainer.train()



  0%|          | 0/3750 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.409, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 0.3596, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}
{'loss': 0.3048, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 0.2424, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.2463, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'loss': 0.1247, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.1154, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}
{'train_runtime': 419.5689, 'train_samples_per_second': 71.502, 'train_steps_per_second': 8.938, 'train_loss': 0.24640703684488932, 'epoch': 3.0}


In [14]:
from sklearn.metrics import accuracy_score
import numpy as np
valid_preds = trainer.predict(tokenized_dataset["validation"])
training_preds = trainer.predict(tokenized_dataset["train"])

training_preds = np.argmax(training_preds.predictions, axis=-1)
valid_preds = np.argmax(valid_preds.predictions, axis=-1)

training_acc = accuracy_score(training_preds, tokenized_dataset["train"]['labels'])
valid_acc = accuracy_score(valid_preds, tokenized_dataset["validation"]['labels'])

print(f"Training accuracy: {training_acc}")
print(f"Validation accuracy: {valid_acc}")


  0%|          | 0/1250 [00:00<?, ?it/s]

Training accuracy: 0.9881
Validation accuracy: 0.8802776171197224
