In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("ag_news")
raw_datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [2]:
raw_train_dataset = raw_datasets["train"]
print(raw_train_dataset[0])
print(raw_train_dataset.features)

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}


In [3]:
from transformers import AutoTokenizer 

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(batch):
    return tokenizer(
        batch["text"],truncation=True,padding=True,return_tensors = "pt"
    )

tokenize_function(raw_train_dataset[:2])



{'input_ids': tensor([[  101,  2813,  2358,  1012,  6468, 15020,  2067,  2046,  1996,  2304,
          1006, 26665,  1007, 26665,  1011,  2460,  1011, 19041,  1010,  2813,
          2395,  1005,  1055,  1040, 11101,  2989,  1032,  2316,  1997, 11087,
          1011, 22330,  8713,  2015,  1010,  2024,  3773,  2665,  2153,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 18431,  2571,  3504,  2646,  3293, 13395,  1006, 26665,  1007,
         26665,  1011,  2797,  5211,  3813, 18431,  2571,  2177,  1010,  1032,
          2029,  2038,  1037,  5891,  2005,  2437,  2092,  1011, 22313,  1998,
          5681,  1032,  6801,  3248,  1999,  1996,  3639,  3068,  1010,  2038,
          5168,  2872,  1032,  2049, 29475,  2006,  2178,  2112,  1997,  1996,
          3006,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 

In [4]:
tokenized_dataset = raw_datasets.map(tokenize_function,batched = True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [5]:

import evaluate

accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score.compute(
        references = labels, predictions = preds, average = "weighted"
    )["f1"]
    acc = accuracy.compute(references = labels, predictions = preds)[
        "accuracy"
    ]

    return {"accuracy": acc, "f1": f1}


In [6]:
import torch 
from transformers import AutoModelForSequenceClassification

device = "cuda" if torch.cuda.is_available() else "cpu"
num_labels = 4
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = num_labels).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import TrainingArguments

batch_size = 32
training_args = TrainingArguments(
    "newsclassifer",
    num_train_epochs = 4,
    eval_strategy = "epoch",
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy="epoch"
)

In [8]:
from transformers import Trainer
shuffled_dataset = tokenized_dataset["train"].shuffle(seed = 42)
small_split = shuffled_dataset.select(range(10000))

trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = small_split,
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer
)

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.260035,0.912237,0.91204
2,0.306100,0.24039,0.919868,0.919838
3,0.306100,0.290611,0.920526,0.920447
4,0.124300,0.301658,0.920658,0.920545


TrainOutput(global_step=1252, training_loss=0.18436610736785985, metrics={'train_runtime': 419.9712, 'train_samples_per_second': 95.245, 'train_steps_per_second': 2.981, 'total_flos': 3751337310706944.0, 'train_loss': 0.18436610736785985, 'epoch': 4.0})

In [4]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("newsclassifer/checkpoint-1252")
tokenizer = AutoTokenizer.from_pretrained("newsclassifer/checkpoint-1252")

In [10]:
from huggingface_hub import HfApi, HfFolder

# Set the repository name. It will be created under your username on Hugging Face Hub.
repo_name = "News-Categorizer"

# Upload the model and tokenizer
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/rahul004/News-Categorizer/commit/856d1cdbd65078ee0370ae4c6d92d63c0720bb99', commit_message='Upload tokenizer', commit_description='', oid='856d1cdbd65078ee0370ae4c6d92d63c0720bb99', pr_url=None, pr_revision=None, pr_num=None)