In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import os
from google.colab import drive

In [13]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
from datasets import load_dataset


dataset = load_dataset('csv', data_files={
    'train': '/content/train.csv',
    'validation': '/content/validation.csv',
    'test': '/content/test.csv'
})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Text_label'],
        num_rows: 599
    })
    validation: Dataset({
        features: ['Sentence', 'Text_label'],
        num_rows: 112
    })
    test: Dataset({
        features: ['Sentence', 'Text_label'],
        num_rows: 38
    })
})

In [22]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    return tokenizer(example["Sentence"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [24]:
label_mapping = {'Achievements': 0, 'Contact ': 1, 'Soft Skills': 2, 'Experience': 3, 'Education': 4, 'Projects': 5, 'Areas of Interest': 6, 'Skills': 7, 'Certifications': 8}
tokenized_datasets = tokenized_datasets.map(lambda x: {"labels": label_mapping[x["Text_label"]]})
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [27]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=9)  # Adjust `num_labels` as needed

training_args = TrainingArguments(
    output_dir="/content/bert_results",
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.204378
2,No log,0.522315
3,No log,0.22102
4,No log,0.132244
5,No log,0.078002
6,No log,0.080534
7,No log,0.073738
8,No log,0.073381
9,No log,0.074513
10,No log,0.070112


TrainOutput(global_step=380, training_loss=0.33372118096602593, metrics={'train_runtime': 675.6271, 'train_samples_per_second': 8.866, 'train_steps_per_second': 0.562, 'total_flos': 1576134275696640.0, 'train_loss': 0.33372118096602593, 'epoch': 10.0})

In [29]:
model_save_path = "/content/drive/MyDrive/BERT_FineTuned_Model2"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('/content/drive/MyDrive/BERT_FineTuned_Model2/tokenizer_config.json',
 '/content/drive/MyDrive/BERT_FineTuned_Model2/special_tokens_map.json',
 '/content/drive/MyDrive/BERT_FineTuned_Model2/vocab.txt',
 '/content/drive/MyDrive/BERT_FineTuned_Model2/added_tokens.json')