In [1]:
!pip install transformers datasets scikit-learn nltk
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.52.3-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.2
    Uninstalling transformers-4.52.2:
      Successfully uninstalled transformers-4.52.2
Successfully installed transformers-4.52.3


Import Libraries and Setup

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import nltk
# Download the NLTK data packages this notebook relies on. They back the
# stopwords corpus, WordNetLemmatizer and word_tokenize imported just below
# (package-to-feature mapping per NLTK; confirm if the NLTK version changes).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Uploading Data

In [3]:
import io

from google.colab import files

# files.upload() opens the browser file picker and returns a dict that maps
# each uploaded file name to its raw bytes.
uploaded = files.upload()
if not uploaded:
  raise ValueError("No file was uploaded; expected a CSV with 'Category' and 'Text' columns.")

# Parse the bytes that were just uploaded instead of re-opening a hard-coded
# 'train.csv' path. Colab may store a repeated upload under a different name
# (e.g. 'train (1).csv'), in which case reading 'train.csv' would silently
# load the stale copy.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

# Keep only the label and text columns and drop rows where either is missing.
df = df[['Category', 'Text']].dropna()

Saving train.csv to train.csv


Text Preprocessing

In [4]:
# English stop words held in a set; preprocess() tests token membership against it.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
  """Normalize one document into a space-separated string of lemmas.

  Steps, in order: lower-case the text, delete every character that is
  neither a word character nor whitespace, tokenize, discard tokens found in
  the module-level `stop_words` set, and lemmatize what remains with the
  module-level `lemmatizer`.

  Args:
    text: the raw document as a string.

  Returns:
    The surviving lemmas joined by single spaces.
  """
  lowered = text.lower()
  without_punctuation = re.sub(r'[^\w\s]', '', lowered)

  lemmas = []
  for word in word_tokenize(without_punctuation):
    if word in stop_words:
      continue
    lemmas.append(lemmatizer.lemmatize(word))
  return ' '.join(lemmas)

# Run preprocess() on every row; the result goes into a new 'Cleaned_Text'
# column, so the raw 'Text' column is left as it was.
df['Cleaned_Text'] = df['Text'].apply(preprocess)

Encoding labels and splitting the data

In [6]:
# Turn category names into integer ids. The fitted encoder is kept because
# its classes_ attribute is needed later to map ids back to names.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Category'])

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    df['Cleaned_Text'].tolist(),
    df['Label'].tolist(),
    test_size=0.2,
    random_state=42,
)

Tokenizing and Creating Dataset

In [7]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name the classification model is loaded from later on.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
  """Dataset that tokenizes all texts once, at construction time.

  Args:
    texts: the documents, passed to `tokenizer` in a single call.
    labels: one label per document, indexable by position.
    tokenizer: callable invoked as
      `tokenizer(texts, truncation=True, padding=True, max_length=max_len)`;
      its result must support `.items()` and yield per-example sequences.
    max_len: value forwarded to the tokenizer as `max_length`.
  """

  def __init__(self, texts, labels, tokenizer, max_len=256):
    self.labels = labels
    # One batched tokenizer call for the whole corpus; __getitem__ only
    # slices the stored result.
    self.encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
    )

  def __len__(self):
    """Return the number of examples, taken from the label list."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return example `idx` as a dict of tensors, including a 'labels' entry."""
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

# Wrap each split in a TextDataset; both use the same tokenizer and the
# default max_len.
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Load Pretrained BERT and set up trainer

In [8]:
# Derive the label mapping from the fitted encoder instead of hard-coding the
# class count, so the classifier head always matches the data that was loaded.
# Storing id2label/label2id in the config also means the saved model carries
# the category names rather than generic LABEL_<n> placeholders.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(pred):
  """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

  Args:
    pred: object exposing `label_ids` (the true class ids) and `predictions`
      (per-class scores; the argmax over the last axis is the predicted id).

  Returns:
    Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # zero_division=0 makes the handling of classes that are never predicted
  # explicit. The scores are the same as with the default setting, but the
  # UndefinedMetricWarning emitted on every evaluation is no longer raised.
  precision, recall, f1, _ = precision_recall_fscore_support(
      labels, preds, average='weighted', zero_division=0)
  acc = accuracy_score(labels, preds)
  return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

import os  # no longer used in this cell; kept in case later cells rely on it

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,
    # After training, reload the checkpoint with the highest validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Supported replacement for the deprecated WANDB_DISABLED environment
    # variable. 'none' turns off every reporting integration; switch to
    # 'tensorboard' if event files under logging_dir are wanted.
    report_to='none',
    )

# Bundle the model, the training configuration, both dataset splits and the
# metric function into a single Trainer object used by the cells below.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
    )

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Train and evaluate

In [9]:
import gc
# Run a garbage-collection pass and ask PyTorch to release its cached CUDA
# memory before training starts.
gc.collect()
torch.cuda.empty_cache()

# Fine-tune, then evaluate. evaluate() is the last expression in the cell, so
# the metrics dict it returns is what the notebook displays.
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9048,0.850999,0.81404,0.807071,0.824599,0.81404
2,0.6164,0.650815,0.858476,0.857167,0.863508,0.858476
3,0.5705,0.618003,0.870426,0.870719,0.874189,0.870426


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.618003249168396,
 'eval_accuracy': 0.8704256908140403,
 'eval_f1': 0.8707193825545179,
 'eval_precision': 0.874189033124992,
 'eval_recall': 0.8704256908140403,
 'eval_runtime': 34.898,
 'eval_samples_per_second': 76.738,
 'eval_steps_per_second': 4.814,
 'epoch': 3.0}

In [11]:
# Write the model and the tokenizer into the same directory so both can be
# loaded back from one path.
model.save_pretrained('./bert-finetuned-text-classification')
tokenizer.save_pretrained('./bert-finetuned-text-classification')

('./bert-finetuned-text-classification/tokenizer_config.json',
 './bert-finetuned-text-classification/special_tokens_map.json',
 './bert-finetuned-text-classification/vocab.txt',
 './bert-finetuned-text-classification/added_tokens.json')

In [12]:
from sklearn.metrics import classification_report

# Predict on the validation split and take the highest-scoring class per row.
preds_output = trainer.predict(val_dataset)
pred_labels = preds_output.predictions.argmax(-1)

# Pass every encoded label id explicitly. Without `labels`, the report infers
# the classes from val_labels/pred_labels, and if any category is absent from
# both it no longer matches the length of target_names and the call fails.
# zero_division=0 keeps classes with no predictions at 0 without a warning.
all_label_ids = list(range(len(label_encoder.classes_)))
print(classification_report(
    val_labels,
    pred_labels,
    labels=all_label_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))


                           precision    recall  f1-score   support

               Accountant       0.94      0.94      0.94        79
                 Advocate       0.96      0.92      0.94        53
              Agriculture       0.86      0.73      0.79        67
                  Apparel       0.85      0.78      0.82        68
             Architecture       0.91      0.74      0.82        84
                     Arts       0.94      0.93      0.93        69
               Automobile       0.59      0.75      0.66        63
                 Aviation       0.95      0.98      0.97        58
                      BPO       0.77      0.77      0.77        52
                  Banking       0.87      0.98      0.92        55
               Blockchain       0.75      1.00      0.86         3
Building and Construction       0.86      0.92      0.89        60
         Business Analyst       0.90      0.95      0.93        65
           Civil Engineer       0.96      0.99      0.97     