In [1]:
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import pickle
import os
from sklearn.metrics import classification_report, accuracy_score, f1_score,ConfusionMatrixDisplay, confusion_matrix
import pickle
os.environ["WANDB_DISABLED"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [4]:
model_name = "bert-base-uncased"
max_length=128

In [5]:
le = pickle.load(open("Data/label_encoder.pkl", "rb"))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [7]:
target_names=le.transform(le.classes_)

In [8]:
def preprocess_function(examples):
    #print(examples['TEXT'])
    return tokenizer(examples["text"], padding=True, truncation=True)
    #return toknizer(examples["text"],padding='max_length',truncation=True, max_length=max_length)

In [9]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to("cuda")
    
    # perform inference to our model
    outputs = model(**inputs)
    #print(outputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    #print(outputs[0].softmax(1).argmax())
    # executing argmax function to get the candidate label
    return probs.argmax().item()

In [10]:
best_params = {"batch_size":8, 
              "lr": 5e-5,
              "epochs": 40}

In [11]:
train_dataset = load_dataset('csv', data_files={'train': 'Data/BERT_data_for_training'+'.csv'})
#print(train_dataset["train"][0])

tokenized_dataset_train =train_dataset.map(preprocess_function)

                                                
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#dataset has been loaded correctly. 


In [12]:
model=BertForSequenceClassification.from_pretrained(model_name, num_labels=13)
#device="cpu"
device="cuda"
model.to(device)

#initializing training arg ith bet params
training_args = TrainingArguments(
num_train_epochs=best_params["epochs"],
learning_rate=best_params["lr"],
per_device_train_batch_size=best_params["batch_size"],  # batch size per device during training
weight_decay=0.01,               # strength of weight decay
#oad_best_model_at_end=True,
logging_steps=100,
#valuation_strategy="steps",
output_dir="Models/BERT_13classes_final_output"
)


trainer = Trainer( model=model,
              args=training_args,
              train_dataset=tokenized_dataset_train['train'],
              tokenizer=tokenizer,
              data_collator=data_collator
             )


trainer.train()

trainer.save_model("Final_Model/BERT_13classes_final")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  2%|▏         | 103/4160 [00:09<04:27, 15.18it/s]

{'loss': 1.5295, 'grad_norm': 12.741456985473633, 'learning_rate': 4.8798076923076926e-05, 'epoch': 0.96}


  5%|▍         | 203/4160 [00:15<04:09, 15.86it/s]

{'loss': 0.6369, 'grad_norm': 0.6792770624160767, 'learning_rate': 4.7596153846153844e-05, 'epoch': 1.92}


  7%|▋         | 303/4160 [00:22<04:10, 15.40it/s]

{'loss': 0.3542, 'grad_norm': 17.416851043701172, 'learning_rate': 4.6394230769230775e-05, 'epoch': 2.88}


 10%|▉         | 403/4160 [00:28<04:03, 15.45it/s]

{'loss': 0.2005, 'grad_norm': 11.018802642822266, 'learning_rate': 4.519230769230769e-05, 'epoch': 3.85}


 12%|█▏        | 500/4160 [00:34<03:57, 15.41it/s]

{'loss': 0.1471, 'grad_norm': 17.167264938354492, 'learning_rate': 4.3990384615384616e-05, 'epoch': 4.81}


 14%|█▍        | 603/4160 [01:21<03:45, 15.78it/s]  

{'loss': 0.1184, 'grad_norm': 0.17899960279464722, 'learning_rate': 4.278846153846154e-05, 'epoch': 5.77}


 17%|█▋        | 703/4160 [01:27<03:40, 15.65it/s]

{'loss': 0.0818, 'grad_norm': 0.5388522148132324, 'learning_rate': 4.1586538461538464e-05, 'epoch': 6.73}


 19%|█▉        | 801/4160 [01:34<03:54, 14.35it/s]

{'loss': 0.0429, 'grad_norm': 0.05378410965204239, 'learning_rate': 4.038461538461539e-05, 'epoch': 7.69}


 22%|██▏       | 903/4160 [01:41<03:21, 16.13it/s]

{'loss': 0.0359, 'grad_norm': 0.029273726046085358, 'learning_rate': 3.918269230769231e-05, 'epoch': 8.65}


 24%|██▍       | 1000/4160 [01:47<03:23, 15.54it/s]

{'loss': 0.061, 'grad_norm': 0.07728753983974457, 'learning_rate': 3.798076923076923e-05, 'epoch': 9.62}


 26%|██▋       | 1101/4160 [02:36<03:18, 15.39it/s]  

{'loss': 0.0237, 'grad_norm': 0.013731100596487522, 'learning_rate': 3.677884615384616e-05, 'epoch': 10.58}


 29%|██▉       | 1203/4160 [02:42<03:07, 15.75it/s]

{'loss': 0.032, 'grad_norm': 0.011457313783466816, 'learning_rate': 3.557692307692308e-05, 'epoch': 11.54}


 31%|███▏      | 1301/4160 [02:49<03:06, 15.35it/s]

{'loss': 0.0482, 'grad_norm': 0.14199525117874146, 'learning_rate': 3.4375e-05, 'epoch': 12.5}


 34%|███▎      | 1403/4160 [02:55<02:50, 16.21it/s]

{'loss': 0.0223, 'grad_norm': 0.01675434038043022, 'learning_rate': 3.3173076923076926e-05, 'epoch': 13.46}


 36%|███▌      | 1500/4160 [03:01<02:47, 15.93it/s]

{'loss': 0.0226, 'grad_norm': 0.07723343372344971, 'learning_rate': 3.1971153846153843e-05, 'epoch': 14.42}


 39%|███▊      | 1603/4160 [03:51<02:42, 15.74it/s]  

{'loss': 0.027, 'grad_norm': 0.011911777779459953, 'learning_rate': 3.0769230769230774e-05, 'epoch': 15.38}


 41%|████      | 1703/4160 [03:58<02:42, 15.10it/s]

{'loss': 0.0362, 'grad_norm': 0.015077090822160244, 'learning_rate': 2.9567307692307695e-05, 'epoch': 16.35}


 43%|████▎     | 1803/4160 [04:05<02:37, 14.95it/s]

{'loss': 0.0132, 'grad_norm': 0.012277784757316113, 'learning_rate': 2.8365384615384616e-05, 'epoch': 17.31}


 46%|████▌     | 1901/4160 [04:12<02:38, 14.25it/s]

{'loss': 0.0379, 'grad_norm': 0.0909639298915863, 'learning_rate': 2.7163461538461536e-05, 'epoch': 18.27}


 48%|████▊     | 2000/4160 [04:18<02:16, 15.77it/s]

{'loss': 0.015, 'grad_norm': 0.006825322285294533, 'learning_rate': 2.5961538461538464e-05, 'epoch': 19.23}


 51%|█████     | 2101/4160 [05:11<03:02, 11.29it/s]  

{'loss': 0.0252, 'grad_norm': 0.0051389350555837154, 'learning_rate': 2.4759615384615388e-05, 'epoch': 20.19}


 53%|█████▎    | 2201/4160 [05:19<02:29, 13.09it/s]

{'loss': 0.0274, 'grad_norm': 0.009600915014743805, 'learning_rate': 2.355769230769231e-05, 'epoch': 21.15}


 55%|█████▌    | 2303/4160 [05:26<02:14, 13.86it/s]

{'loss': 0.0289, 'grad_norm': 0.01519529614597559, 'learning_rate': 2.2355769230769233e-05, 'epoch': 22.12}


 58%|█████▊    | 2403/4160 [05:33<01:57, 14.91it/s]

{'loss': 0.0146, 'grad_norm': 0.791877806186676, 'learning_rate': 2.1153846153846154e-05, 'epoch': 23.08}


 60%|██████    | 2500/4160 [05:40<02:02, 13.59it/s]

{'loss': 0.0223, 'grad_norm': 0.024539632722735405, 'learning_rate': 1.9951923076923078e-05, 'epoch': 24.04}


 63%|██████▎   | 2603/4160 [06:30<01:41, 15.33it/s]  

{'loss': 0.0223, 'grad_norm': 0.0031727375462651253, 'learning_rate': 1.8750000000000002e-05, 'epoch': 25.0}


 65%|██████▍   | 2703/4160 [06:37<01:34, 15.45it/s]

{'loss': 0.0223, 'grad_norm': 0.11473429203033447, 'learning_rate': 1.7548076923076922e-05, 'epoch': 25.96}


 67%|██████▋   | 2801/4160 [06:43<01:29, 15.23it/s]

{'loss': 0.021, 'grad_norm': 0.08217308670282364, 'learning_rate': 1.6346153846153847e-05, 'epoch': 26.92}


 70%|██████▉   | 2903/4160 [06:50<01:22, 15.28it/s]

{'loss': 0.0207, 'grad_norm': 0.00906344223767519, 'learning_rate': 1.5144230769230769e-05, 'epoch': 27.88}


 72%|███████▏  | 3000/4160 [06:56<01:15, 15.36it/s]

{'loss': 0.0175, 'grad_norm': 4.073756217956543, 'learning_rate': 1.3942307692307693e-05, 'epoch': 28.85}


 75%|███████▍  | 3101/4160 [07:50<01:13, 14.49it/s]  

{'loss': 0.0212, 'grad_norm': 0.006131583359092474, 'learning_rate': 1.2740384615384615e-05, 'epoch': 29.81}


 77%|███████▋  | 3201/4160 [07:56<01:07, 14.23it/s]

{'loss': 0.0272, 'grad_norm': 0.0033065765164792538, 'learning_rate': 1.153846153846154e-05, 'epoch': 30.77}


 79%|███████▉  | 3301/4160 [08:03<00:55, 15.40it/s]

{'loss': 0.012, 'grad_norm': 0.01749722845852375, 'learning_rate': 1.0336538461538462e-05, 'epoch': 31.73}


 82%|████████▏ | 3401/4160 [08:10<00:50, 15.14it/s]

{'loss': 0.022, 'grad_norm': 0.0039331065490841866, 'learning_rate': 9.134615384615384e-06, 'epoch': 32.69}


 84%|████████▍ | 3500/4160 [08:16<00:40, 16.27it/s]

{'loss': 0.019, 'grad_norm': 0.009146413765847683, 'learning_rate': 7.932692307692308e-06, 'epoch': 33.65}


 87%|████████▋ | 3601/4160 [09:06<00:37, 15.09it/s]  

{'loss': 0.0224, 'grad_norm': 0.1516030728816986, 'learning_rate': 6.730769230769231e-06, 'epoch': 34.62}


 89%|████████▉ | 3701/4160 [09:12<00:32, 14.28it/s]

{'loss': 0.0274, 'grad_norm': 0.17705138027668, 'learning_rate': 5.528846153846154e-06, 'epoch': 35.58}


 91%|█████████▏| 3801/4160 [09:19<00:25, 14.05it/s]

{'loss': 0.0227, 'grad_norm': 0.0032107133883982897, 'learning_rate': 4.326923076923077e-06, 'epoch': 36.54}


 94%|█████████▍| 3901/4160 [09:26<00:18, 14.30it/s]

{'loss': 0.0078, 'grad_norm': 1.9612716436386108, 'learning_rate': 3.125e-06, 'epoch': 37.5}


 96%|█████████▌| 4000/4160 [09:33<00:11, 13.58it/s]

{'loss': 0.0235, 'grad_norm': 0.012210567481815815, 'learning_rate': 1.9230769230769234e-06, 'epoch': 38.46}


 99%|█████████▊| 4101/4160 [10:22<00:03, 15.37it/s]

{'loss': 0.0151, 'grad_norm': 0.009663000702857971, 'learning_rate': 7.211538461538462e-07, 'epoch': 39.42}


100%|██████████| 4160/4160 [11:08<00:00,  6.23it/s]


{'train_runtime': 668.2208, 'train_samples_per_second': 49.505, 'train_steps_per_second': 6.225, 'train_loss': 0.09467245889111207, 'epoch': 40.0}


## Eval

In [23]:
import torch

In [8]:
model_path = 'Models\Final_model\BERT_13classes_final'

In [10]:
!git lfs pull

Not in a Git repository.


In [11]:
model=BertForSequenceClassification.from_pretrained(model_path, num_labels=13)
tokenizer = BertTokenizerFast.from_pretrained(model_path, do_lower_case=True)

OSError: You seem to have cloned a repository without having git-lfs installed. Please install git-lfs and run `git lfs install` followed by `git lfs pull` in the folder you cloned.

In [34]:
def get_new_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to("cuda")
   # print(inputs)
    # perform inference to our model
    outputs = model(**inputs)
    #print(outputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1).argmax()
    #print(probs.item())
    #print(outputs[0].softmax(1).argmax())
    # executing argmax function to get the candidate label
    return probs.item()
    #return probs.argmax().item()



labels_dict = {'y': 'yes-answers',
               'y-d': 'yes-answers-explanations',
               'n': 'no-answers',
               'n-d': 'no-answers-explanations',
               'sno': 'statements-non-opinion',
               'so': 'statements-opinion',
               'ack': 'acknowledgments',
               'dno': "other answers",
               'query': 'interviewee-initiated questions',
               'ft': 'thanking',
               'fa': 'apologies', 
               'fe': 'explanations',
               'fp': 'conventional'
              }

In [35]:
text = "i think this works well"


In [36]:
print("The label is: ", labels_dict[le.inverse_transform([get_new_prediction(text)])[0]])

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [16]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element