## TEXT CLASSIFICATION USING BERT - HUGGINGFACE 

In [1]:
# Importing the libraries
import torch
from datasets import load_dataset, load_metric
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import  DataCollatorWithPadding, AdamW, get_scheduler
from torch.utils.data import DataLoader

'''
Library functions
load_dataset:
    Loads the dataset and returns the train and test data
load_metric:
    Loads the metric and returns the metric
BerTokenizer:
    Loads BertTokenizer for a given model id or config
BertModel:
    Loads BertModel for a given model id or config
BertForSequenceClassification:
    Loads a SequentialClassifcation headed BertModel
DataCollatorWithPadding:
    DataCollator batches data for training and validatation here we pad the sequences in the batch
AdamW:
    Adam optimizer with weight decay
get_scheduler:
    Returns a learning rate scheduler
DataLoader:
    Loads the data in batches'''


'\nLibrary functions\nload_dataset:\n    Loads the dataset and returns the train and test data\nload_metric:\n    Loads the metric and returns the metric\nBerTokenizer:\n    Loads BertTokenizer for a given model id or config\nBertModel:\n    Loads BertModel for a given model id or config\nBertForSequenceClassification:\n    Loads a SequentialClassifcation headed BertModel\nDataCollatorWithPadding:\n    DataCollator batches data for training and validatation here we pad the sequences in the batch\nAdamW:\n    Adam optimizer with weight decay\nget_scheduler:\n    Returns a learning rate scheduler\nDataLoader:\n    Loads the data in batches'

In [3]:
# Loading the dataset

def load_req_dataset(dataset_id):
    '''
    Parameters:
        dataset_id: The id of the dataset to be loaded
    Returns:
        dataset: The Dataset Object of the required dataset'''
    dataset = load_dataset(dataset_id)
    return dataset
dataset = load_req_dataset("emotion")

Downloading builder script: 3.62kB [00:00, 3.61MB/s]                   
Downloading metadata: 3.28kB [00:00, 3.28MB/s]                   
Using custom data configuration default


Downloading and preparing dataset emotion/default (download: 1.97 MiB, generated: 2.07 MiB, post-processed: Unknown size, total: 4.05 MiB) to C:\Users\Vasanth\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705...


Downloading data: 100%|██████████| 1.66M/1.66M [00:00<00:00, 5.06MB/s]
Downloading data: 100%|██████████| 204k/204k [00:00<00:00, 1.62MB/s]
Downloading data: 100%|██████████| 207k/207k [00:00<00:00, 1.52MB/s]
                                                                                       

Dataset emotion downloaded and prepared to C:\Users\Vasanth\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 125.03it/s]


In [7]:
# Tokenizing the dataset

def tokenize_dataset(sentence):
    '''
    Parameters:
        dataset: The Dataset Object of the required dataset
        tokenizer_checkpoint: The path to the tokenizer checkpoint
    Returns:
        tokenizer: The tokenizer object'''
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenized_sentence = tokenizer(sentence["text"], padding=True, truncation=True)
    return tokenized_sentence
tokenized_dataset = dataset.map(tokenize_dataset)
print(tokenized_dataset)

  0%|          | 4/16000 [00:36<40:46:25,  9.18s/ex]


KeyboardInterrupt: 