In [None]:
! pip install transformers datasets accelerate evaluate



# Configurations
Configuration parameters for fine-tuning a sentiment-analysis model with the Hugging Face Transformers library and PyTorch.

In [None]:
import torch

class Configuration:
  """Single place for every setting the notebook reads.

  The attributes are plain class-level constants; the other cells access them
  through the module-level ``config`` instance created below.
  """

  # --- data -----------------------------------------------------------------
  DATASET_ID = "emad12/stock_tweets_sentiment"   # dataset id passed to load_dataset
  SRC_COLUMN = "tweet"                           # column holding the input text
  TGT_COLUMN = "sentiment"                       # column holding the label
  TEST_SIZE = 0.2                                # fraction held out for evaluation
  SEED = 0
  MAX_LEN = 32                                   # tokenizer max_length (truncation)

  # --- model ----------------------------------------------------------------
  MODEL_CKPT = "distilbert-base-uncased"
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  # The dataset cell remaps the raw label -1 to 2, which is why NEGATIVE is 2.
  LABEL2ID  = {"NEUTRAL": 0, "NEGATIVE": 2, "POSITIVE": 1}
  ID2LABEL  = {0: "NEUTRAL", 2: "NEGATIVE", 1: "POSITIVE"}

  # --- training -------------------------------------------------------------
  EVAL_METRIC = "accuracy"
  # Used both as the Trainer output directory and as the model id for the
  # inference pipeline, so it must be a plain "<namespace>/<name>" string
  # without spaces, ':' or '|' (the previous value was a pasted title).
  MODEL_OUT_DIR = "samiradh/SentiModBERT"
  # NOTE(review): the training log saved in this notebook shows 5 epochs —
  # confirm which value is intended.
  NUM_EPOCHS = 3
  LR = 2E-5
  BATCH_SIZE = 16
  WEIGHT_DECAY = 0.01
  EVAL_STRATEGY = "epoch"
  SAVE_STRATEGY = "epoch"
  LOGGING_STRATEGY = "epoch"
  PUSH_TO_HUB = True

# instantiation
config = Configuration()

# DataSet
Creating and preprocessing a text-classification dataset using the Hugging Face `transformers` and `datasets` libraries.

In [None]:
from transformers import AutoTokenizer , AutoModelForAudioClassification , Trainer ,TrainingArguments
from datasets import Dataset , load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

class TextClassificationDataSet :
  """Builds tokenized train/test splits for text classification.

  The dataset id, checkpoint name, column names, split size, seed and maximum
  sequence length are read from the module-level ``config`` object when the
  class is instantiated.
  """

  def __init__(self) :
    # Copy the settings off ``config`` and load the tokenizer for the checkpoint.
    self.dataset_id = config.DATASET_ID
    self.model_ckpt = config.MODEL_CKPT
    self.src_column = config.SRC_COLUMN
    self.tgt_column = config.TGT_COLUMN
    self.test_size = config.TEST_SIZE
    self.seed = config.SEED
    self.max_len = config.MAX_LEN
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)

  def _prepare_frame(self, df, sample_size):
    """Select the text/label columns, normalize them and optionally subsample.

    Args:
      df: pandas DataFrame containing at least the source and target columns.
      sample_size: maximum number of rows to keep, or ``None`` to keep all.

    Returns:
      A new DataFrame; the input frame is not modified.
    """
    columns = [self.src_column, self.tgt_column]
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the caller's data. Rows with a missing text
    # or label cannot be tokenized / used as a class id, so they are dropped.
    df = df[columns].dropna(subset=columns).copy()
    # Remap the raw label -1 to 2 so that all labels are valid class ids.
    df[self.tgt_column] = df[self.tgt_column].mask(df[self.tgt_column] == -1, 2)
    df[self.src_column] = df[self.src_column].str.lower()
    # Seeded so reruns pick the same rows; skipped when the frame is already
    # small enough (DataFrame.sample raises if n exceeds the row count).
    if sample_size is not None and sample_size < len(df):
      df = df.sample(n=sample_size, random_state=self.seed)
    return df

  def create_data(self, sample_size=20000) :
    """Load the dataset, clean it and split it into train/test ``Dataset``s.

    Args:
      sample_size: maximum number of rows to keep before splitting
        (default 20000); ``None`` keeps every row.

    Returns:
      ``(train_data, test_data)``. The intermediate objects are also stored on
      the instance (``data``, ``df``, ``train_df``, ``test_df``, ``train_data``,
      ``test_data``).
    """
    self.data = load_dataset(self.dataset_id, split="train")
    self.df = self._prepare_frame(self.data.to_pandas(), sample_size)
    # Stratify on the label so both splits keep the same class proportions.
    self.train_df, self.test_df = train_test_split(
        self.df,
        test_size=self.test_size,
        shuffle=True,
        random_state=self.seed,
        stratify=self.df[self.tgt_column],
    )
    self.train_data = Dataset.from_pandas(self.train_df)
    self.test_data = Dataset.from_pandas(self.test_df)
    return self.train_data, self.test_data

  def tokenizer_function (self, exp):
    """Tokenize a batch of examples and attach their labels.

    ``preprocesser_function`` maps this with ``batched=True``, so ``exp`` holds
    whole columns (a list of texts and a list of labels), not a single row.
    """
    model_inp = self.tokenizer(
        exp[self.src_column],
        truncation=True,
        padding=True,
        max_length=self.max_len,
    )
    # int64 is the dtype PyTorch classification losses expect for class ids.
    model_inp["labels"] = torch.tensor(exp[self.tgt_column], dtype=torch.long)
    return model_inp

  def preprocesser_function(self,data):
    """Apply ``tokenizer_function`` to ``data``, dropping its original columns."""
    return data.map(
        self.tokenizer_function,
        batched=True,
        remove_columns=data.column_names,
    )

  def gen_classification_dataset(self):
    """Return the tokenized ``(train, test)`` datasets ready for training."""
    train_data, test_data = self.create_data()
    train_toknized_data = self.preprocesser_function(train_data)
    test_toknized_data = self.preprocesser_function(test_data)
    return train_toknized_data,test_toknized_data

# Model Trainer

In [None]:
import torch
from transformers import AutoTokenizer , AutoModelForSequenceClassification , Trainer ,TrainingArguments , DataCollatorWithPadding
import evaluate
import numpy as np


class TextClassificationModelTrainer :
  """Fine-tunes a sequence-classification checkpoint with the Hugging Face ``Trainer``.

  Hyper-parameters, label maps, output directory and the push-to-hub flag are
  read from the module-level ``config`` object when the class is instantiated.
  """

  def __init__(self, train_data, test_data):
    """Store the datasets, copy the settings and load model, tokenizer and metric.

    Args:
      train_data: tokenized training dataset handed to the ``Trainer``.
      test_data: tokenized evaluation dataset handed to the ``Trainer``.
    """
    self.train_data = train_data
    self.test_data = test_data
    self.model_ckpt = config.MODEL_CKPT
    self.id2label = config.ID2LABEL
    self.label2id = config.LABEL2ID
    self.num_labels = len(self.id2label)
    self.device = config.DEVICE
    self.eval_metric = config.EVAL_METRIC
    self.model_out_dir = config.MODEL_OUT_DIR
    self.num_epochs = config.NUM_EPOCHS
    self.lr = config.LR
    self.batch_size = config.BATCH_SIZE
    self.weight_decay = config.WEIGHT_DECAY
    self.eval_strategy = config.EVAL_STRATEGY
    self.save_strategy = config.SAVE_STRATEGY
    self.logging_strategy = config.LOGGING_STRATEGY
    self.push_to_hub = config.PUSH_TO_HUB
    self.model = AutoModelForSequenceClassification.from_pretrained(
        self.model_ckpt,
        id2label=self.id2label,
        label2id=self.label2id,
        num_labels=self.num_labels,
    ).to(self.device)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
    self.eval_metric_computer = evaluate.load(self.eval_metric)
    self.data_collator = DataCollatorWithPadding(self.tokenizer)

  @staticmethod
  def _pick_kwarg(target, preferred, legacy):
    """Return ``preferred`` if ``target`` accepts that keyword, else ``legacy``.

    Some Transformers keyword arguments were renamed between releases; looking
    at the signature keeps the notebook working with either spelling.
    """
    import inspect  # local import: only needed by this helper

    accepted = inspect.signature(target).parameters
    return preferred if preferred in accepted else legacy

  def compute_metrics(self, eval_pred):
    """Turn ``(logits, labels)`` into the configured evaluation metric.

    The predicted class is the arg-max over axis 1 of the logits.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return self.eval_metric_computer.compute(predictions=predictions, references=labels)

  def set_training_args(self):
    """Build the ``TrainingArguments`` from the stored settings."""
    # ``evaluation_strategy`` was renamed to ``eval_strategy``; use whichever
    # name the installed TrainingArguments accepts.
    strategy_kwarg = self._pick_kwarg(
        TrainingArguments, "eval_strategy", "evaluation_strategy"
    )
    return TrainingArguments(
        output_dir=self.model_out_dir,
        num_train_epochs=self.num_epochs,
        learning_rate=self.lr,
        per_device_train_batch_size=self.batch_size,
        per_device_eval_batch_size=self.batch_size,
        weight_decay=self.weight_decay,
        save_strategy=self.save_strategy,
        logging_strategy=self.logging_strategy,
        push_to_hub=self.push_to_hub,
        **{strategy_kwarg: self.eval_strategy},
    )

  def model_trainer(self):
    """Create the ``Trainer`` wired to the model, data, collator and metric."""
    # Hand the tokenizer to the Trainer so it is saved (and pushed) together
    # with the model. The keyword was renamed from ``tokenizer`` to
    # ``processing_class``; use whichever the installed Trainer accepts.
    tokenizer_kwarg = self._pick_kwarg(Trainer, "processing_class", "tokenizer")
    return Trainer(
        model=self.model,
        args=self.set_training_args(),
        data_collator=self.data_collator,
        train_dataset=self.train_data,
        eval_dataset=self.test_data,
        compute_metrics=self.compute_metrics,
        **{tokenizer_kwarg: self.tokenizer},
    )

  def train_save_push_to_hub(self):
    """Train the model, then push it to the Hub or save it locally.

    The upload only happens when ``config.PUSH_TO_HUB`` is true; otherwise the
    final model is written to the output directory.
    """
    trainer = self.model_trainer()
    trainer.train()
    if self.push_to_hub:
      trainer.push_to_hub()
    else:
      trainer.save_model()

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; this renders the
# login widget shown in the cell output. Run it before the training cell,
# which pushes to the Hub when config.PUSH_TO_HUB is True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Main

In [None]:
# End-to-end run: build the tokenized splits, then train and publish the model.
if __name__ == "__main__" :
   # Load, clean, split and tokenize the dataset.
   textclassificationdataset = TextClassificationDataSet()
   train_data, test_data = textclassificationdataset.gen_classification_dataset()
   # Fine-tune on the train split, evaluate on the test split, then push to the Hub.
   textclassificationtrainer = TextClassificationModelTrainer(train_data,test_data)
   textclassificationtrainer.train_save_push_to_hub()


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6981,0.57781,0.76575
2,0.4894,0.543145,0.78975
3,0.3605,0.591938,0.7795
4,0.2549,0.72753,0.7865
5,0.1927,0.791687,0.7815


In [None]:
# Quick smoke test of the fine-tuned model through the high-level pipeline API.
from transformers import pipeline
# The model is loaded from config.MODEL_OUT_DIR, while the tokenizer is loaded
# from the base checkpoint name rather than from the same location.
# NOTE(review): presumably because no tokenizer files are stored next to the
# fine-tuned model — confirm.
classifier = pipeline("sentiment-analysis", model=config.MODEL_OUT_DIR, tokenizer="distilbert-base-uncased")
classifier("People often have a need for social interaction, but they also value moments of solitude for various reasons. ")

[{'label': 'POSITIVE', 'score': 0.9823178052902222}]