In [2]:
!pip install transformers datasets accelerate scikit-learn -q

import os
import re
import numpy as np
import pandas as pd
import torch
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

In [4]:
from google.colab import files
uploaded = files.upload()


Saving Resume.csv to Resume.csv


In [5]:
!pip install transformers datasets accelerate scikit-learn -q


In [6]:
df = pd.read_csv("Resume.csv")

df = df[["Resume_str","Category"]]
df.dropna(inplace=True)
df.drop_duplicates(subset="Resume_str", inplace=True)

print(df.shape)


(2482, 2)


In [7]:
import torch
print("GPU:", torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))


GPU: True
Tesla T4


In [8]:
def clean_resume(text):
    text = str(text)

    text = re.sub(r"http\S+|\S+@\S+", " ", text)
    text = re.sub(r"\+?\d[\d\s\-]{8,}", " ", text)

    text = text.replace("\n"," ")
    text = re.sub(r"\s+", " ", text)

    first_line = text[:150]
    text = "TITLE: " + first_line + " BODY: " + text

    return text[:6000]

df["text"] = df["Resume_str"].apply(clean_resume)


In [9]:
le = LabelEncoder()
df["label"] = le.fit_transform(df["Category"])
num_labels = len(le.classes_)

print(num_labels)


24


In [10]:
train_df, test_df = train_test_split(
    df[["text","label"]],
    test_size=0.1,
    stratify=df["label"],
    random_state=42
)


In [11]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df["label"]),
    y=train_df["label"]
)

class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights


tensor([0.8778, 0.8778, 1.6323, 1.0694, 1.0004, 2.9076, 0.8946, 0.8946, 4.6521,
        0.8615, 0.8778, 0.9212, 0.8946, 0.9692, 1.0819, 0.8778, 0.8861, 0.8861,
        0.8946, 0.9398, 0.8615, 0.9304, 0.8946, 1.0113])

In [12]:
train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)


In [13]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

train_ds = train_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds.set_format("torch", columns=["input_ids","attention_mask","label"])
test_ds.set_format("torch", columns=["input_ids","attention_mask","label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2233 [00:00<?, ? examples/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }


In [16]:
training_args = TrainingArguments(
    output_dir="./bert_resume_out",

    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=150,

    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    logging_steps=50,
    report_to="none"
)


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


In [18]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,3.112845,2.818645,0.321285,0.240253,0.266842
2,1.727512,1.204033,0.783133,0.674774,0.742477
3,0.892827,0.859272,0.815261,0.744714,0.795658
4,0.678878,0.692692,0.835341,0.763786,0.820797
5,0.489077,0.668673,0.84739,0.795881,0.841971
6,0.408269,0.624272,0.875502,0.831029,0.869978
7,0.325357,0.628604,0.86747,0.825612,0.863764
8,0.323655,0.630983,0.86747,0.847979,0.865095


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=1120, training_loss=1.0268794809068953, metrics={'train_runtime': 2061.1894, 'train_samples_per_second': 8.667, 'train_steps_per_second': 0.543, 'total_flos': 4701144322473984.0, 'train_loss': 1.0268794809068953, 'epoch': 8.0})

In [19]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.6309834122657776, 'eval_accuracy': 0.8674698795180723, 'eval_f1_macro': 0.8479791705878662, 'eval_f1_weighted': 0.8650950423187196, 'eval_runtime': 8.2606, 'eval_samples_per_second': 30.143, 'eval_steps_per_second': 1.937, 'epoch': 8.0}


In [20]:
print(trainer.state.best_model_checkpoint)
print(trainer.state.best_metric)


./bert_resume_out/checkpoint-1120
0.8479791705878662


In [21]:
print(os.listdir())


['.config', 'Resume.csv', 'bert_resume_out', 'sample_data']


In [23]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [24]:
import os
print(os.listdir("/content/drive/MyDrive"))


['Rahul-Interview.gdoc', 'Lookbook.gslides', 'cast.gslides', 'UNIVERSE ACCORDING TO THE HOLY SCRIPTURES.gslides', 'My Time Table 2020-21-22(1)(1).pdf', 'WhatsApp Image 2023-08-21 at 10.37.16 PM.jpeg', 'Resume', 'Untitled Diagram', 'Resumes', 'Resume 074.pdf', 'RESUME076.pdf', 'Resume80.pdf', 'Untitled spreadsheet (2).gsheet', 'Untitled spreadsheet (1).gsheet', 'api.csv', 'api (1).gsheet', 'api.gsheet', 'can you create one work breakdown structure \nthis....gsheet', 'WBS.csv', 'Untitled spreadsheet.gsheet', 'RITIK-RESUME (1).pdf', 'Colab Notebooks', 'RITIK-RESUME.pdf', 'signature.pdf', 'DL_project', '.ipynb_checkpoints']


In [25]:
import os, pickle

save_path = "/content/drive/MyDrive/DL_project/BERT_resume_model"

os.makedirs(save_path, exist_ok=True)

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

with open(f"{save_path}/label_encoder.pkl","wb") as f:
    pickle.dump(le,f)

print("✅ Model saved permanently to Drive")
print("Saved files:", os.listdir(save_path))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Model saved permanently to Drive
Saved files: ['config.json', 'model.safetensors', 'training_args.bin', 'tokenizer_config.json', 'tokenizer.json', 'label_encoder.pkl']
