In [1]:
import pandas as pd
import transformers


In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer,TextClassificationPipeline

# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

Data Preprocessing

In [4]:
from datasets import load_dataset
raw_datasets=load_dataset('csv',data_files={'train': 'multilabeltrain2.csv',
                                              'test':'multilabeltest2.csv'})

Using custom data configuration default-9554785825ce50c6
Reusing dataset csv (C:\Users\RmmLeo10\.cache\huggingface\datasets\csv\default-9554785825ce50c6\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 2/2 [00:00<00:00, 1003.30it/s]


In [5]:
cols = raw_datasets["train"].column_names
raw_datasets=raw_datasets.map(lambda x : {"labels": [x[c] for c in cols if c != "text"]})

In [8]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, problem_type="multi_label_classification")
def tokenize_and_encode(examples):
  return tokenizer(examples["text"], truncation=True)
cols = raw_datasets["train"].column_names
cols.remove("labels")
ds_enc = raw_datasets.map(tokenize_and_encode, batched=True, remove_columns=cols)
ds_enc

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\RmmLeo10/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "multi_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\User

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1155
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 289
    })
})

In [9]:
ds_enc.set_format("torch")
ds_enc = (ds_enc
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels"])
          .rename_column("float_labels", "labels"))

Loading cached processed dataset at C:\Users\RmmLeo10\.cache\huggingface\datasets\csv\default-9554785825ce50c6\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-f744d25ab4487878.arrow
Loading cached processed dataset at C:\Users\RmmLeo10\.cache\huggingface\datasets\csv\default-9554785825ce50c6\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-78fe83cc313b34db.arrow


Load Pretrained Model

In [10]:
num_labels=5
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\RmmLeo10/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_

In [18]:
from transformers import TrainingArguments


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate = 1e-5,
    adam_beta1=0.85,
    adam_beta2=0.99,
    per_device_train_batch_size=4,
    warmup_steps=0 ,
    num_train_epochs = 15.0 ,
    weight_decay=0.01 
)
trainer = Trainer(model=model, args=training_args, train_dataset=ds_enc["train"], eval_dataset=ds_enc["test"], tokenizer=tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
74it [01:22,  1.12s/it]


In [19]:
trainer.train()

***** Running training *****
  Num examples = 1155
  Num Epochs = 15
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 4335
 12%|█▏        | 500/4335 [00:22<02:51, 22.34it/s]Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json


{'loss': 0.0076, 'learning_rate': 8.846597462514419e-06, 'epoch': 1.73}


Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
 23%|██▎       | 1000/4335 [00:47<02:38, 21.04it/s]Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json


{'loss': 0.0087, 'learning_rate': 7.693194925028837e-06, 'epoch': 3.46}


Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
 35%|███▍      | 1500/4335 [01:12<02:23, 19.72it/s]Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json


{'loss': 0.0077, 'learning_rate': 6.539792387543253e-06, 'epoch': 5.19}


Model weights saved in ./results\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1500\special_tokens_map.json
 46%|████▌     | 2000/4335 [01:37<01:51, 20.93it/s]Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json


{'loss': 0.0069, 'learning_rate': 5.38638985005767e-06, 'epoch': 6.92}


Model weights saved in ./results\checkpoint-2000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-2000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-2000\special_tokens_map.json
 58%|█████▊    | 2500/4335 [02:02<01:25, 21.47it/s]Saving model checkpoint to ./results\checkpoint-2500
Configuration saved in ./results\checkpoint-2500\config.json


{'loss': 0.0104, 'learning_rate': 4.232987312572088e-06, 'epoch': 8.65}


Model weights saved in ./results\checkpoint-2500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-2500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-2500\special_tokens_map.json
 69%|██████▉   | 3000/4335 [02:26<01:03, 21.10it/s]Saving model checkpoint to ./results\checkpoint-3000
Configuration saved in ./results\checkpoint-3000\config.json


{'loss': 0.0068, 'learning_rate': 3.0795847750865054e-06, 'epoch': 10.38}


Model weights saved in ./results\checkpoint-3000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-3000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-3000\special_tokens_map.json
 81%|████████  | 3500/4335 [02:51<00:37, 22.06it/s]Saving model checkpoint to ./results\checkpoint-3500
Configuration saved in ./results\checkpoint-3500\config.json


{'loss': 0.0067, 'learning_rate': 1.926182237600923e-06, 'epoch': 12.11}


Model weights saved in ./results\checkpoint-3500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-3500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-3500\special_tokens_map.json
 92%|█████████▏| 4000/4335 [03:16<00:15, 21.29it/s]Saving model checkpoint to ./results\checkpoint-4000
Configuration saved in ./results\checkpoint-4000\config.json


{'loss': 0.0062, 'learning_rate': 7.727797001153404e-07, 'epoch': 13.84}


Model weights saved in ./results\checkpoint-4000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-4000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-4000\special_tokens_map.json
100%|█████████▉| 4333/4335 [03:33<00:00, 20.93it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 4335/4335 [03:33<00:00, 20.34it/s]

{'train_runtime': 213.171, 'train_samples_per_second': 81.273, 'train_steps_per_second': 20.336, 'train_loss': 0.007538547207877435, 'epoch': 15.0}





TrainOutput(global_step=4335, training_loss=0.007538547207877435, metrics={'train_runtime': 213.171, 'train_samples_per_second': 81.273, 'train_steps_per_second': 20.336, 'train_loss': 0.007538547207877435, 'epoch': 15.0})

In [20]:
from sklearn.metrics import accuracy_score
predictions=trainer.predict(ds_enc['test'])
pred=np.argmax(predictions.predictions,axis=1)
labels=ds_enc['test']['labels'].cpu().detach().numpy()
label=np.argmax(labels,axis=1)
accuracy=accuracy_score(pred,label)
print('The accuracy score is',accuracy)

***** Running Prediction *****
  Num examples = 289
  Batch size = 8
 95%|█████████▍| 35/37 [00:00<00:00, 38.63it/s]

The accuracy score is 0.629757785467128


100%|██████████| 37/37 [00:15<00:00, 38.63it/s]

In [39]:
trainer.save_model('C:\RMM\Medical4\multilabelmodel')

Saving model checkpoint to C:\RMM\Medical4\multilabelmodel
Configuration saved in C:\RMM\Medical4\multilabelmodel\config.json
Model weights saved in C:\RMM\Medical4\multilabelmodel\pytorch_model.bin
tokenizer config file saved in C:\RMM\Medical4\multilabelmodel\tokenizer_config.json
Special tokens file saved in C:\RMM\Medical4\multilabelmodel\special_tokens_map.json
