<a href="https://colab.research.google.com/github/Talha1818/Bert-Text-Classification/blob/master/Pipeline_Train_BERT_Model_Documents_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; every dataset, tokenizer and model
# path used below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import pyarrow as pa
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from transformers import TrainingArguments, Trainer
import torch
import torch.nn.functional as F
import psutil
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords corpus; ModelInference.clean_text reads its
# English list via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
class LoadDataset:
  """Load the annotated CSV and keep only labels with enough samples.

  The CSV must provide a ``token`` column (the class label) and a
  ``clean_text`` column (the pre-cleaned document text).

  Args:
    file_path: Path of the CSV file; it is read immediately on construction.
    token_frquency: Minimum number of rows a label needs to be kept.
    start_token: Marker prepended (plus a space) to every text.
    end_token: Marker appended (after a space) to every text.
  """

  def __init__(self, file_path, token_frquency=100, start_token='<start>', end_token='<end>'):
    self.file_path = file_path
    self.token_frquency = token_frquency
    self.start_token = start_token
    self.end_token = end_token
    self.df = pd.read_csv(self.file_path)

  def load_data(self):
    """Clean the frame, wrap the texts and drop rare labels.

    Returns:
      The filtered DataFrame, which is also stored on ``self.df``.
    """
    # Drop incomplete rows *before* casting: astype(int) raises on NaN, so the
    # cast has to come after the missing-value filter, not before it.
    df = self.df.dropna(subset=['clean_text', 'token']).copy()
    df['token'] = df['token'].astype(int)
    df['clean_text'] = f"{self.start_token} " + df['clean_text'] + f" {self.end_token}"

    # Keep labels that occur at least `token_frquency` times. Filtering the
    # value_counts Series directly avoids depending on the column names that
    # reset_index() produces, which differ between pandas 1.x and 2.x.
    counts = df['token'].value_counts()
    frequent_tokens = counts[counts >= self.token_frquency].index

    # .copy() so later column assignments do not operate on a slice.
    self.df = df[df['token'].isin(frequent_tokens)].copy()
    return self.df

  def __len__(self):
    """Number of distinct labels currently in ``self.df``."""
    return len(self.df['token'].unique())

  def __str__(self):
    total_length = self.__len__()
    return f"\nLoadDataset: Total Token in Dataset: {total_length}\nTotal Shape: {self.df.shape}"

In [None]:
class LabelEncoding:
  """Encode the ``token`` label column as consecutive integer class ids.

  Args:
    df: DataFrame with a ``token`` column. It is copied before encoding, so
      the caller's frame (for example ``LoadDataset.df``) keeps its original
      labels instead of being silently overwritten.
  """

  def __init__(self, df):
    self.encoder = LabelEncoder()
    # Copy first: assigning into the passed frame would mutate the caller's
    # data and can trigger pandas' SettingWithCopyWarning on filtered slices.
    self.df = df.copy()
    self.df['token'] = self.encoder.fit_transform(self.df['token'])

  def get_encoding_mapping(self):
    """Return ``(mapping, encoded_df)``.

    ``mapping`` maps each encoded class id to the original label value.
    """
    # Get mapping of label encoding values to original classes
    self.label_encoding_mapping = dict(zip(self.encoder.transform(self.encoder.classes_), self.encoder.classes_))
    return self.label_encoding_mapping, self.df

  def __str__(self):
    encoding, _ = self.get_encoding_mapping()
    return f"\n{encoding}"

In [None]:
# Dataset location and preprocessing settings handed to LoadDataset.
file_path = "/content/drive/MyDrive/Annotation Folder/Final_Text_Dataset_22_May_2023.csv"
token_frquency = 100
start_token='<start>'
end_token='<end>'
data_object = LoadDataset(file_path, token_frquency, start_token, end_token)

# Single-step progress bar: it only marks the start and end of load_data().
with tqdm(total=1, desc='Loading Data') as pbar:
    df = data_object.load_data()
    pbar.update(1)
print(data_object)

# Encode the labels; `get_encoding` (encoded id -> original label) is reused
# at inference time and `df_encoder` is the frame passed to the tokenizer.
encoder = LabelEncoding(df)
get_encoding, df_encoder = encoder.get_encoding_mapping()
print(encoder)

Loading Data:   0%|          | 0/1 [00:00<?, ?it/s]


LoadDataset: Total Token in Dataset: 130
Total Shape: (88551, 5)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 11, 5: 12, 6: 13, 7: 14, 8: 15, 9: 17, 10: 20, 11: 21, 12: 22, 13: 23, 14: 24, 15: 25, 16: 27, 17: 28, 18: 29, 19: 30, 20: 31, 21: 32, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 49, 37: 50, 38: 51, 39: 52, 40: 53, 41: 54, 42: 55, 43: 62, 44: 63, 45: 64, 46: 65, 47: 66, 48: 67, 49: 68, 50: 69, 51: 70, 52: 71, 53: 72, 54: 73, 55: 74, 56: 75, 57: 76, 58: 77, 59: 78, 60: 79, 61: 80, 62: 81, 63: 84, 64: 87, 65: 89, 66: 90, 67: 93, 68: 94, 69: 95, 70: 96, 71: 97, 72: 98, 73: 99, 74: 100, 75: 101, 76: 103, 77: 110, 78: 111, 79: 112, 80: 113, 81: 117, 82: 118, 83: 123, 84: 125, 85: 126, 86: 130, 87: 133, 88: 138, 89: 141, 90: 151, 91: 153, 92: 156, 93: 165, 94: 166, 95: 168, 96: 170, 97: 179, 98: 185, 99: 191, 100: 203, 101: 214, 102: 219, 103: 222, 104: 223, 105: 224, 106: 225, 107: 239, 108: 242, 109: 257, 110: 264, 111: 288, 

In [None]:
# Preview the first rows of the frame returned by LabelEncoding.
df_encoder.head()

Unnamed: 0,page#,package_name,text,token,clean_text
0,1.0,611438_NJ_BURLINGTON,"HADDONFIELD - CITY ABSTRACT, L 1LBS 10F1\nZaSS...",8,<start> haddonfield city abstract lbs zassokae...
1,2.0,611438_NJ_BURLINGTON,"oO ANOrIRE: REDSEiY. SRARIT U Bsc, Hh Mea, ie ...",8,<start> anorire redseiy srarit bsc mea laa wit...
2,3.0,611438_NJ_BURLINGTON,"City Abstract, LLC ALTA Buyer's Settlement Sta...",4,<start> city abstract llc alta buyers settleme...
3,4.0,611438_NJ_BURLINGTON,po Description Bayer\nTitle - Lender's Title P...,4,<start> description bayer title lenders title ...
4,5.0,611438_NJ_BURLINGTON,"City Abstract, LLC ALTA Seller's Settlement St...",6,<start> city abstract llc alta sellers settlem...


In [None]:
class MyTokenizer:
  """Tokenize every ``clean_text`` row of a DataFrame with a pretrained tokenizer.

  Args:
    tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
    df: DataFrame providing the ``clean_text`` and ``token`` columns.
    max_length: Length every sample is padded / truncated to.
  """

  def __init__(self, tokenizer_path, df, max_length = 512):
    self.df = df
    self.max_length = max_length
    self.tokenizer_path = tokenizer_path
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

  def data_tokenization(self):
    """Encode the rows one by one.

    Returns:
      ``(processed_data, tokenizer)`` where ``processed_data`` is a list with
      one tokenizer output per row, extended with ``label`` and ``text``.
    """
    texts = self.df['clean_text']
    labels = self.df['token']
    processed_data = []

    for row in tqdm(range(len(self.df))):
      sample_text = texts.iloc[row]
      sample_label = labels.iloc[row]

      encoded = self.tokenizer(
          sample_text,
          add_special_tokens=True,  # Add [CLS] and [SEP] tokens
          max_length=self.max_length,
          truncation=True,
          padding="max_length",
      )
      encoded['label'] = sample_label
      encoded['text'] = sample_text
      processed_data.append(encoded)

    return processed_data, self.tokenizer

In [None]:
# Load the saved tokenizer from Drive and tokenize every row of the encoded
# frame; `tokenizer` is reused by the training cell further down.
tokenizer_path = '/content/drive/MyDrive/Annotation Folder/BERT/tokenizer'
tokenizer_obj = MyTokenizer(tokenizer_path, df_encoder)
processed_data, tokenizer = tokenizer_obj.data_tokenization()

  0%|          | 0/88551 [00:00<?, ?it/s]

In [None]:
class DataSplition:
  """Split tokenized samples into train and validation ``Dataset`` objects.

  Args:
    processed_data: List of per-sample records; it is turned into a DataFrame
      and kept on ``self.new_df``.
  """

  def __init__(self, processed_data):
    self.processed_data = processed_data
    self.new_df = pd.DataFrame(processed_data)

  def split_data(self, test_size=0.2, random_state=42):
    """Return ``(train_dataset, valid_dataset)`` from a random split.

    Args:
      test_size: Share of the rows assigned to the validation set.
      random_state: Seed forwarded to ``train_test_split``.
    """
    parts = train_test_split(
        self.new_df,
        test_size=test_size,
        random_state=random_state,
    )
    # The two resulting frames are unpacked through tqdm, which renders the
    # 'data_splition' bar over them.
    train_df, valid_df = tqdm(parts, desc='data_splition')

    train_hg = Dataset(pa.Table.from_pandas(train_df))
    valid_hg = Dataset(pa.Table.from_pandas(valid_df))
    return train_hg, valid_hg

In [None]:
# 80/20 train/validation split with a fixed seed (random_state=42).
datasplition_obj = DataSplition(processed_data)
train_hg, valid_hg = datasplition_obj.split_data(test_size=0.2, random_state=42)

data_splition:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
class TrainModel:
  """Fine-tune a sequence-classification checkpoint with the Hugging Face Trainer.

  Args:
    model_path: Checkpoint passed to
      ``AutoModelForSequenceClassification.from_pretrained``.
    tokenizer: Tokenizer handed to the Trainer.
    train_dataset: Dataset used for training.
    eval_dataset: Dataset used for evaluation.
    num_train_epochs: Number of training epochs.
    output_dir: Directory where the Trainer writes its checkpoints.
  """

  def __init__(self, model_path, tokenizer, train_dataset, eval_dataset, num_train_epochs=2,
               output_dir="/content/drive/MyDrive/Annotation Folder/result_full_data"):
    self.model_path = model_path
    self.tokenizer = tokenizer
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.num_train_epochs = num_train_epochs
    self.output_dir = output_dir

    self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)

  def compute_metrics(self, eval_pred):
    """Compute accuracy and weighted precision / recall / F1.

    Args:
      eval_pred: ``(logits, labels)`` pair supplied by the Trainer.

    Returns:
      Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same values as the default ("warn") but
    # without emitting an UndefinedMetricWarning for never-predicted classes.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

  def train_model(self,):
    """Train ``self.model`` in place and return the final evaluation metrics."""
    # To continue from a saved checkpoint, pass
    # resume_from_checkpoint="<output_dir>/checkpoint-N" to trainer.train();
    # setting it on TrainingArguments alone is not acted on by the Trainer.
    training_args = TrainingArguments(output_dir=self.output_dir,
                                  evaluation_strategy="epoch",  # evaluate at the end of every epoch
                                  num_train_epochs=self.num_train_epochs,  # set the number of epochs here
                                  save_strategy="steps",  # checkpoint by step count, not per epoch
                                  save_steps=1000,  # write a checkpoint every 1000 update steps
                                  )

    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=self.train_dataset,
        eval_dataset=self.eval_dataset,
        tokenizer=self.tokenizer,
        compute_metrics=self.compute_metrics,)

    trainer.train()
    # Hand the metrics back to the caller instead of discarding them.
    return trainer.evaluate()

  def save_model(self, save_model_path):
    """Write the (fine-tuned) model to ``save_model_path``."""
    self.model.save_pretrained(save_model_path)


In [None]:
# Continue training from the previously saved checkpoint for one more epoch,
# then save the result under a new directory (..._train_4/).
model_path = '/content/drive/MyDrive/Annotation Folder/BERT/BERT_MODEL_FULL_SAMPLE_GREATER_100_24_MAY_2023_train_1_more_epoch/'
# NOTE(review): self-assignment is a no-op; `tokenizer` comes from the tokenization cell.
tokenizer = tokenizer
train_dataset = train_hg
eval_dataset = valid_hg
num_train_epochs=1

model_obj = TrainModel(model_path, tokenizer, train_dataset,
                       eval_dataset, num_train_epochs)
model_obj.train_model()
model_obj.save_model("/content/drive/MyDrive/Annotation Folder/BERT/BERT_MODEL_FULL_SAMPLE_GREATER_100_24_MAY_2023_train_4/")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.1681,0.182558,0.961041,0.963147,0.961041


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save the tokenizer files to Drive; the inference cell below reloads them
# from this same directory.
tokenizer.save_pretrained('/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch')

('/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch/tokenizer_config.json',
 '/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch/special_tokens_map.json',
 '/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch/vocab.txt',
 '/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch/added_tokens.json',
 '/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch/tokenizer.json')

In [None]:
class ModelInference:
  """Clean one text, classify it with a saved model and report resource usage.

  Args:
    model_path: Checkpoint passed to
      ``AutoModelForSequenceClassification.from_pretrained``.
    tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
    encoding_labels: Mapping from encoded class id to original label.
    max_length: Length the input is padded / truncated to when tokenizing.
    text: Raw text to classify.
    start_token: Prefix added to the cleaned text (includes its own space).
    end_token: Suffix added to the cleaned text (includes its own space).
  """

  def __init__(self, model_path, tokenizer_path, encoding_labels,
               max_length=512, text=None, start_token='<start> ', end_token=' <end>'):
    self.model_path = model_path
    self.tokenizer_path = tokenizer_path
    self.encoding_labels = encoding_labels
    # Keep the caller's value so get_prediction() can honour it.
    self.max_length = max_length
    self.text = text
    self.start_token = start_token
    self.end_token = end_token

    self.model1 = AutoModelForSequenceClassification.from_pretrained(self.model_path)
    self.tokenizer1 = AutoTokenizer.from_pretrained(self.tokenizer_path)
    self.actual_label = {k: str(v) for k, v in self.encoding_labels.items()}

  def clean_text(self):
    """Normalise ``self.text`` and wrap it in the start / end markers.

    ``self.text`` is replaced by the cleaned text (without markers).

    Returns:
      ``start_token + cleaned_text + end_token``.
    """
    # Define regex patterns
    punct_pattern = r'[^\w\s]'
    num_pattern = r'\d+'
    special_pattern = r'[^A-Za-z0-9\s]'

    # Replace real and escaped newlines with spaces
    cleaned = re.sub(r'\\n|\n', ' ', self.text)
    # remove links
    cleaned = re.sub(r"http\S+", "", cleaned)
    # Remove real and escaped NUL characters
    cleaned = re.sub(r'\x00|\\x00', '', cleaned)
    # Convert to lowercase
    cleaned = cleaned.lower()
    # Remove punctuation
    cleaned = re.sub(punct_pattern, '', cleaned)
    # Remove integers
    cleaned = re.sub(num_pattern, '', cleaned)
    # Remove special characters
    cleaned = re.sub(special_pattern, '', cleaned)
    # Collapse all whitespace (newlines included) into single spaces
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Drop words of one or two characters as well as English stop words
    stop_words = set(stopwords.words('english'))
    cleaned = ' '.join(
        word for word in cleaned.split()
        if len(word) > 2 and word not in stop_words
    )

    self.text = cleaned
    return self.start_token + cleaned + self.end_token

  def get_prediction(self):
    """Classify the cleaned text.

    Returns:
      Dict with the original ``label`` (as int), the winning ``probability``
      and the CPU time / resident-memory deltas measured around the forward
      pass.
    """
    prompt = self.clean_text()
    encoding = self.tokenizer1(prompt, return_tensors="pt", padding="max_length", truncation=True,
                               max_length=self.max_length,
                               add_special_tokens=True, # Add [CLS] and [SEP] tokens
                               )
    encoding = {k: v for k, v in encoding.items()}

    # Get initial CPU and RAM usage
    process = psutil.Process()
    initial_cpu = process.cpu_times()
    initial_ram = process.memory_info().rss / 1e6

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
      outputs = self.model1(**encoding)

    logits = outputs.logits
    probs = F.softmax(logits, dim=1).squeeze().cpu().detach().numpy()
    label = np.argmax(probs, axis=-1)

    # Get CPU and RAM usage after running the model
    cpu_usage = sum(process.cpu_times()) - sum(initial_cpu)
    ram_usage = (process.memory_info().rss / 1e6) - initial_ram

    return {
        'label': int(self.actual_label[label]),
        'probability': max(probs),
        'cpu_usage (seconds)': cpu_usage,
        'ram_usage (MB)': ram_usage
    }

In [None]:
# Inference settings: the model and tokenizer directories written by the
# cells above, the id -> original-label mapping from LabelEncoding, and a
# sample text to classify.
model_path = '/content/drive/MyDrive/Annotation Folder/BERT/BERT_MODEL_FULL_SAMPLE_GREATER_100_24_MAY_2023_train_4/'
tokenizer_path = '/content/drive/MyDrive/Annotation Folder/BERT/tokenizer_4_epoch'
encoding_labels = get_encoding
max_length = 512
# These markers carry their own separating space, unlike the bare
# '<start>' / '<end>' values passed to LoadDataset.
start_token='<start> '
end_token=' <end>'
text = '''
haddonfield city abstract lbs zassokaeib name dwr hie republic first bank dba republic route north suite marlton yeie ron billing
'''

In [None]:
# Display the raw text wrapped in the start / end markers (no cleaning applied).
start_token='<start> '
end_token=' <end>'

start_token + text + end_token

In [None]:
# Build the inference helper from the settings above and classify `text`;
# the resulting dict is displayed as the cell output.
inference_obj = ModelInference(model_path, tokenizer_path, 
                               encoding_labels,max_length, text,
                               start_token, end_token)
result = inference_obj.get_prediction()
result

{'label': 15,
 'probability': 0.9987948,
 'cpu_usage (seconds)': 1.2499999999990905,
 'ram_usage (MB)': -93.76563200000055}