In [1]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [2]:
"""
 Helper function for reproducible behaviour for setting seed for:

 random, numpy, torch and/or tensorflow (if available)
"""

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)    # Safe to call this function even if, cuda not avail.

    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

In [3]:
# Checkpoint to fine-tune: bert-base-uncased.
# Other text-classification models: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"

# Passed as `max_length` to every tokenizer call in this notebook (all of which use truncation=True).
max_length = 286

- Loading the tokenizer and the dataset.

In [4]:
# Load the fast BERT tokenizer for the chosen checkpoint, with lower-casing enabled.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [5]:
def read_20newsgroups(test_size=0.2):
  """Fetch the 20 Newsgroups corpus and split it into train and validation parts.

  Headers, footers and quotes are stripped from every document.

  Args:
    test_size: forwarded to `train_test_split` as the held-out share.

  Returns:
    A pair `(splits, target_names)`: `splits` is the `train_test_split` result
    for (documents, labels), and `target_names` holds the label names.
  """
  # Download (if needed) and load the whole corpus from sklearn's repositories.
  newsgroups = fetch_20newsgroups(
      subset="all",
      shuffle=True,
      remove=("headers", "footers", "quotes"),
  )
  # Split into training and validation parts and return them with the label names.
  splits = train_test_split(newsgroups.data, newsgroups.target, test_size=test_size)
  return splits, newsgroups.target_names

# Load the data and unpack the four parts of the split.
splits, target_names = read_20newsgroups()
train_texts, valid_texts, train_labels, valid_labels = splits

In [9]:
# Fraction of the training split that is actually tokenized and trained on.
subset_percentage = 0.3
subset_size = int(len(train_texts) * subset_percentage)

# Draw `subset_size` distinct positions at random (driven by the `random` module's state).
subset_indices = random.sample(range(len(train_texts)), subset_size)

# Pull each sampled text together with its label, then split the pairs back
# into two parallel lists.
subset_pairs = [(train_texts[i], train_labels[i]) for i in subset_indices]
train_texts_subset = [doc for doc, _ in subset_pairs]
train_labels_subset = [label for _, label in subset_pairs]

# Tokenize the sampled training texts and the full validation split with identical settings.
tokenizer_options = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_texts_subset, **tokenizer_options)
valid_encodings = tokenizer(valid_texts, **tokenizer_options)

Tokenizing the **training** and **validation** text using our preloaded BERT tokenizer.

In [7]:
# Disabled full-dataset tokenization, kept only as a bare string literal (it executes nothing).
# The active tokenization cell encodes train_texts_subset instead of train_texts.
'''train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length) '''

'train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)\nvalid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length) '

In [10]:
class NewsGroupsDataset(torch.utils.data.Dataset):
  """Torch dataset that pairs tokenizer encodings with their class labels.

  `encodings` must offer `.items()` yielding (field name, indexable values);
  `labels` must support `len()` and integer indexing.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    # The number of samples is defined by the labels.
    return len(self.labels)

  def __getitem__(self, idx):
    # Build one sample: every encoding field at position `idx`, as a tensor.
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    sample["labels"] = torch.tensor(self.labels[idx])
    return sample

# Wrap the encodings with their labels: the sampled training subset and the full validation split.
train_dataset = NewsGroupsDataset(train_encodings, train_labels_subset)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

**Training the model**

In [11]:
# Load the pretrained checkpoint with a classification head sized to the number of target names.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  """Compute the accuracy of a set of predictions.

  Args:
    pred: object exposing `label_ids` (true labels) and `predictions`
      (per-class scores; the arg-max over the last axis is the predicted class).
      Presumably the evaluation output the Trainer passes in.

  Returns:
    A dict with the single key 'accuracy'.
  """
  predicted_classes = pred.predictions.argmax(-1)
  # Accuracy is delegated to sklearn's implementation.
  return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [13]:
pip install accelerate -U



In [14]:
training_args = TrainingArguments(
    # --- where artifacts are written ---
    output_dir='./results',            # checkpoints and results
    logging_dir='./logs',              # log files
    # --- optimisation ---
    num_train_epochs=1,                # passes over the training set
    per_device_train_batch_size=8,     # training batch size per device
    per_device_eval_batch_size=20,     # evaluation batch size per device
    # NOTE(review): the recorded run in this notebook ended at global_step=566,
    # so a 500-step warmup spans most of training — confirm this is intended.
    warmup_steps=500,                  # learning-rate scheduler warmup steps
    weight_decay=0.01,                 # weight decay strength
    # --- logging, checkpointing and evaluation cadence ---
    logging_steps=400,                 # log every 400 steps
    save_steps=400,                    # save a checkpoint every 400 steps
    evaluation_strategy="steps",       # evaluate on a step schedule during training
    # Reload the best checkpoint once training ends (the default metric is the loss;
    # pass `metric_for_best_model` to select on accuracy or another metric).
    load_best_model_at_end=True,
)

In [15]:
# Wire the model, training configuration, datasets and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [16]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
400,2.3688,1.332972,0.615119


TrainOutput(global_step=566, training_loss=2.0773971341945257, metrics={'train_runtime': 18900.7723, 'train_samples_per_second': 0.239, 'train_steps_per_second': 0.03, 'total_flos': 664715658550944.0, 'train_loss': 2.0773971341945257, 'epoch': 1.0})

In [17]:
# Evaluate on the Trainer's eval_dataset (valid_dataset); the metrics dict is displayed as the cell result.
trainer.evaluate()

{'eval_loss': 1.3329720497131348,
 'eval_accuracy': 0.6151193633952254,
 'eval_runtime': 3944.7963,
 'eval_samples_per_second': 0.956,
 'eval_steps_per_second': 0.048,
 'epoch': 1.0}

In [18]:
# Save the fine-tuned model and its tokenizer into the same directory.
model_path = "20newsgroups-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('20newsgroups-bert-base-uncased/tokenizer_config.json',
 '20newsgroups-bert-base-uncased/special_tokens_map.json',
 '20newsgroups-bert-base-uncased/vocab.txt',
 '20newsgroups-bert-base-uncased/added_tokens.json',
 '20newsgroups-bert-base-uncased/tokenizer.json')

# **Performing Inference**

In [19]:
def get_prediction(text):
  """Classify `text` with the fine-tuned model and return the newsgroup name.

  Args:
    text: a single string, or a list of strings classified as one batch.

  Returns:
    For a string input, the predicted entry of `target_names`. For any other
    input (e.g. a list of strings), a list with one predicted name per element.

  Side effects:
    Switches the global `model` to evaluation mode.
  """
  # Tokenize with the same truncation/max_length settings used for training.
  encoded = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
  # The Trainer may have moved the model to an accelerator; keep the inputs on the same device.
  encoded = encoded.to(model.device)

  # Inference only: disable dropout and skip building the autograd graph.
  model.eval()
  with torch.no_grad():
    output = model(**encoded)

  # output[0] holds the logits, one row per input text.
  probs = output[0].softmax(1)
  # Arg-max per row; a flat argmax() would only be correct for a single-row batch.
  predicted = probs.argmax(dim=1).tolist()

  if isinstance(text, str):
    return target_names[predicted[0]]
  return [target_names[i] for i in predicted]

In [20]:
text = """
The first thing is first.
If you purchase a Macbook, you should not encounter performance issues that will prevent you from learning to code efficiently.
However, in the off chance that you have to deal with a slow computer, you will need to make some adjustments.
Having too many background apps running in the background is one of the most common causes.
The same can be said about a lack of drive storage.
For that, it helps if you uninstall xcode and other unnecessary applications, as well as temporary system junk like caches and old backups.
"""

print(get_prediction(text))

comp.os.ms-windows.misc


In [21]:
text = """
A black hole is a place in space where gravity pulls so much that even light can not get out.
The gravity is so strong because matter has been squeezed into a tiny space. This can happen when a star is dying.
Because no light can get out, people can't see black holes.
They are invisible. Space telescopes with special tools can help find black holes.
The special tools can see how stars that are very close to black holes act differently than other stars.
"""
print(get_prediction(text))

sci.space


In [22]:
text = """
Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.
Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.
Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.
"""
print(get_prediction(text))

sci.med
