In [None]:
!pip install transformers datasets accelerate scikit-learn -q

import os
import re
import numpy as np
import pandas as pd
import torch
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

In [None]:
# Colab-only step: `files.upload()` prompts for files from the local machine.
# Presumably used to bring in "Resume.csv", which a later cell reads with
# pd.read_csv -- TODO confirm the expected file name with the notebook author.
from google.colab import files
uploaded = files.upload()


In [None]:
# Dependencies (transformers, datasets, accelerate, scikit-learn) are already
# installed by the first cell of this notebook, before the imports run;
# the identical install command that used to sit here was redundant.


In [None]:
# Load the raw resumes and keep only the text and its category.
# Rows with a missing value in either column are dropped, then resumes whose
# text is an exact duplicate are removed (the first occurrence is kept).
df = (
    pd.read_csv("Resume.csv")[["Resume_str", "Category"]]
    .dropna()
    .drop_duplicates(subset="Resume_str")
)

print(df.shape)


In [None]:
import torch

# Report whether CUDA is usable and, if so, which device index 0 is.
cuda_ok = torch.cuda.is_available()
print("GPU:", cuda_ok)
if cuda_ok:
    print(torch.cuda.get_device_name(0))


In [None]:
def clean_resume(text):
    """Normalise one resume into a single-line ``TITLE: ... BODY: ...`` string.

    Steps, in order:
      1. Coerce the input to ``str``.
      2. Replace URLs / e-mail-like tokens, then long digit runs
         (phone-number-like sequences), with a single space.
      3. Flatten newlines and collapse every whitespace run to one space.
      4. Prefix the result with its own first 150 characters as a "title".

    Args:
        text: Raw resume content; any object accepted by ``str()``.

    Returns:
        The cleaned string, truncated to at most 6000 characters
        (the truncation includes the ``TITLE:`` / ``BODY:`` prefix).
    """
    cleaned = str(text)

    # Order matters: links and e-mails go first, then digit sequences.
    for pattern in (r"http\S+|\S+@\S+", r"\+?\d[\d\s\-]{8,}"):
        cleaned = re.sub(pattern, " ", cleaned)

    cleaned = re.sub(r"\s+", " ", cleaned.replace("\n", " "))

    # The "title" is simply the leading 150 characters of the cleaned text,
    # taken after newlines were flattened (so it is not a true first line).
    head = cleaned[:150]
    combined = "TITLE: " + head + " BODY: " + cleaned

    return combined[:6000]

# Build the model input column by cleaning every raw resume string.
df["text"] = df["Resume_str"].map(clean_resume)


In [None]:
# Encode the category names as integer class ids; `le` is kept so the
# mapping can be reused later (it is pickled alongside the model).
le = LabelEncoder()
encoded_categories = le.fit_transform(df["Category"])
df["label"] = encoded_categories

# One output unit per distinct category seen by the encoder.
num_labels = le.classes_.shape[0]

print(num_labels)


In [None]:
# Hold out 10% of the rows, stratified on the label column,
# with a fixed random_state so the split is repeatable.
labelled = df[["text", "label"]]
train_df, test_df = train_test_split(
    labelled,
    stratify=labelled["label"],
    test_size=0.1,
    random_state=42,
)


In [None]:
# "balanced" class weights computed from the training labels only,
# then converted to a float tensor.
train_labels = train_df["label"]
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)

class_weights = torch.tensor(balanced_weights, dtype=torch.float)
class_weights


In [None]:
# Wrap both pandas splits as `datasets.Dataset` objects.
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train_df, test_df))


In [None]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        batch: Mapping with a ``"text"`` key, as passed by ``Dataset.map``
            when ``batched=True``.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Expose only the tensors needed downstream, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
for split in (train_ds, test_ds):
    split.set_format("torch", columns=tensor_columns)


In [None]:
# Store the class names in the model config so the saved checkpoint is
# self-describing (otherwise predictions are only labelled "LABEL_<i>" and
# the pickled LabelEncoder is required to decode them).
# `le.classes_` is indexed by the integer ids produced by the encoder.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict with keys ``"accuracy"``, ``"f1_macro"`` and ``"f1_weighted"``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along axis 1.
    predicted = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average="macro")
    weighted_f1 = f1_score(labels, predicted, average="weighted")

    return {
        "accuracy": accuracy,
        "f1_macro": macro_f1,
        "f1_weighted": weighted_f1,
    }


In [None]:
# Hyper-parameters grouped by concern, then merged into one TrainingArguments.
schedule_cfg = dict(
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

optimizer_cfg = dict(
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=150,
)

# Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
checkpoint_cfg = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

# After training, reload the checkpoint with the highest macro-F1.
model_selection_cfg = dict(
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

training_args = TrainingArguments(
    output_dir="./bert_resume_out",
    **schedule_cfg,
    **optimizer_cfg,
    **checkpoint_cfg,
    **model_selection_cfg,
    logging_steps=50,
    report_to="none",
)


In [None]:
class WeightedTrainer(Trainer):
    """`Trainer` whose loss is class-weighted cross-entropy.

    The notebook computes balanced `class_weights` from the training labels,
    but the stock `Trainer` never sees them and optimises the model's default
    unweighted loss. This subclass applies the weights.

    Args:
        class_weights: 1-D float tensor with one weight per class, or ``None``
            to fall back to unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the weighted cross-entropy loss (and model outputs if asked).

        ``**kwargs`` absorbs extra arguments (e.g. ``num_items_in_batch``)
        that newer transformers releases pass to ``compute_loss``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            # The weights are created on CPU; move them next to the logits.
            weight = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(
            logits.float().view(-1, logits.size(-1)),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss


# NOTE(review): `test_ds` is used both for early stopping / best-checkpoint
# selection and for the final reported metrics, so those metrics are
# optimistic. Consider carving a separate validation split out of the
# training data and keeping `test_ds` for the final evaluation only.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric fails to improve for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    class_weights=class_weights,
)


In [None]:
# Run fine-tuning; the call's return value is displayed as the cell output.
trainer.train()


In [None]:
# Evaluate on the Trainer's eval_dataset. Because the training arguments set
# load_best_model_at_end=True, this is expected to score the best checkpoint
# rather than the last one -- TODO confirm against the training log.
metrics = trainer.evaluate()
print(metrics)


In [None]:
# Show which checkpoint was selected as best and the metric value it achieved
# (the metric is "f1_macro", per metric_for_best_model in the training args).
print(trainer.state.best_model_checkpoint)
print(trainer.state.best_metric)


In [None]:
# List the current working directory (sanity check of what is on disk).
print(os.listdir())


In [None]:
# Colab-only: mount Google Drive at /content/drive so files written there
# are kept outside the runtime's local disk.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Confirm the Drive mount worked by listing the top-level Drive folder.
import os
print(os.listdir("/content/drive/MyDrive"))


In [None]:
import os
import pickle

# Persist everything needed for inference into one Drive folder:
# model weights + config, tokenizer files, and the fitted LabelEncoder.
save_path = "/content/drive/MyDrive/DL_project/BERT_resume_model"

os.makedirs(save_path, exist_ok=True)

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open(os.path.join(save_path, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)

# The original message contained a mis-decoded check-mark emoji (mojibake).
print("✅ Model saved permanently to Drive")
print("Saved files:", os.listdir(save_path))
