In [1]:
# ===============================================================
# Resume Ranking using BERT (Bangla/English Mixed Resume possible)
# ===============================================================

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

2025-10-25 20:05:04.002547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761422704.238016      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761422704.297197      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# ======================
# Load CSV
# ======================
csv_path ="/kaggle/input/cv-resume/dataset.csv"
df = pd.read_csv(csv_path)

# Check columns
print("Columns in CSV:", df.columns.tolist())
print("Sample data:\n", df.head())


Columns in CSV: ['Role', 'Resume', 'Decision', 'Reason_for_decision', 'Job_Description']
Sample data:
                          Role  \
0       E-commerce Specialist   
1              Game Developer   
2  Human Resources Specialist   
3       E-commerce Specialist   
4       E-commerce Specialist   

                                              Resume Decision  \
0  Here's a professional resume for Jason Jones:\...   reject   
1  Here's a professional resume for Ann Marshall:...   select   
2  Here's a professional resume for Patrick Mccla...   reject   
3  Here's a professional resume for Patricia Gray...   select   
4  Here's a professional resume for Amanda Gross:...   reject   

                                 Reason_for_decision  \
0    Lacked leadership skills for a senior position.   
1              Strong technical skills in AI and ML.   
2  Insufficient system design expertise for senio...   
3  Impressive leadership and communication abilit...   
4    Lacked leadership skil

In [3]:
# ====================================================
# 🧩 Step 2: Prepare Data
# ====================================================
texts = (
    "Role: " + df["Role"].astype(str) + ". " +
    "Job Description: " + df["Job_Description"].astype(str) + ". " +
    "Resume: " + df["Resume"].astype(str)
).tolist()

labels_text = df["Decision"].astype(str).tolist()
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_text)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

In [4]:
from transformers import BertTokenizer

# Kaggle input folder path ব্যবহার করো
tokenizer_path ="/kaggle/input/ksdfoieeknf/bert_local"
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, use_fast=False)

# Test
text = "This is a test."
encoding = tokenizer(text)
print("✅ Tokenizer loaded successfully!")

✅ Tokenizer loaded successfully!


In [5]:
# আগে যেটা ছিল
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# replace করে local path use করো
tokenizer_path = "/kaggle/input/ksdfoieeknf/bert_local"
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, use_fast=False)

In [6]:


print("✅ Tokenizer loaded successfully!")
print(tokenizer.tokenize("This is a working BERT tokenizer test."))


✅ Tokenizer loaded successfully!
['this', 'is', 'a', 'working', 'bert', 'token', '##izer', 'test', '.']


In [7]:

# ====================================================
# 🧺 Step 4: Dataset Class
# ====================================================
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "label": torch.tensor(label, dtype=torch.long)
        }


In [8]:
# ====================================================
# 🚀 Step 5: Dataloader Setup
# ====================================================
batch_size = 32
train_dataset = ResumeDataset(train_texts, train_labels, tokenizer)
val_dataset = ResumeDataset(val_texts, val_labels, tokenizer)
test_dataset = ResumeDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [9]:
# ====================================================
# 🧠 Step 6: BERT-based Model
# ====================================================
class ResumeClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        cls_output = self.dropout(cls_output)
        return self.fc(cls_output)

In [10]:
# ====================================================
# ⚙️ Step 7: Training Utilities
# ====================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ResumeClassifier(num_classes=len(set(labels))).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

def evaluate(model, data_loader):
    model.eval()
    correct, total = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = 100 * correct / total
    return acc, all_preds, all_labels


def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=2):
    for epoch in range(num_epochs):
        model.train()
        total_loss, total_correct = 0, 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            total_correct += (preds == labels).sum().item()

        train_acc = 100 * total_correct / len(train_loader.dataset)
        val_acc, _, _ = evaluate(model, val_loader)
        print(f"\nEpoch [{epoch+1}/{num_epochs}] | Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}% | Loss: {total_loss/len(train_loader):.3f}")


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# ====================================================
# 🏋️ Step 8: Train Model
# ====================================================
train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=15)

Epoch 1/15: 100%|██████████| 229/229 [06:50<00:00,  1.79s/it]



Epoch [1/15] | Train Acc: 54.40% | Val Acc: 56.76% | Loss: 0.662


Epoch 2/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [2/15] | Train Acc: 56.71% | Val Acc: 53.44% | Loss: 0.625


Epoch 3/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [3/15] | Train Acc: 57.31% | Val Acc: 54.05% | Loss: 0.617


Epoch 4/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [4/15] | Train Acc: 60.48% | Val Acc: 52.58% | Loss: 0.604


Epoch 5/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [5/15] | Train Acc: 63.92% | Val Acc: 54.91% | Loss: 0.582


Epoch 6/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [6/15] | Train Acc: 69.97% | Val Acc: 54.18% | Loss: 0.536


Epoch 7/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [7/15] | Train Acc: 77.05% | Val Acc: 56.02% | Loss: 0.457


Epoch 8/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [8/15] | Train Acc: 83.70% | Val Acc: 54.05% | Loss: 0.354


Epoch 9/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [9/15] | Train Acc: 88.63% | Val Acc: 54.42% | Loss: 0.258


Epoch 10/15: 100%|██████████| 229/229 [06:58<00:00,  1.83s/it]



Epoch [10/15] | Train Acc: 92.31% | Val Acc: 54.30% | Loss: 0.180


Epoch 11/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [11/15] | Train Acc: 94.35% | Val Acc: 54.18% | Loss: 0.136


Epoch 12/15: 100%|██████████| 229/229 [06:58<00:00,  1.83s/it]



Epoch [12/15] | Train Acc: 95.90% | Val Acc: 56.27% | Loss: 0.105


Epoch 13/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [13/15] | Train Acc: 96.70% | Val Acc: 55.90% | Loss: 0.083


Epoch 14/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [14/15] | Train Acc: 97.53% | Val Acc: 55.28% | Loss: 0.063


Epoch 15/15: 100%|██████████| 229/229 [06:57<00:00,  1.82s/it]



Epoch [15/15] | Train Acc: 97.94% | Val Acc: 53.44% | Loss: 0.051


In [12]:
# ====================================================
# 🧾 Step 9: Evaluate on Test Data + Ranking
# ====================================================
model.eval()
all_probs, all_labels, all_texts = [], [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask)
        probs = torch.softmax(outputs, dim=1)
        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(batch["label"].cpu().numpy())
# Prediction probabilities
all_probs = np.array(all_probs)
pred_classes = np.argmax(all_probs, axis=1)
confidence_scores = np.max(all_probs, axis=1)

# Add predictions to DataFrame
test_df = df.iloc[test_dataset.indices if hasattr(test_dataset, 'indices') else range(len(test_dataset))].copy()
test_df["Predicted_Decision"] = label_encoder.inverse_transform(pred_classes)
test_df["Confidence_Score"] = confidence_scores

In [13]:
# Sort by confidence
ranked_df = test_df.sort_values(by="Confidence_Score", ascending=False).reset_index(drop=True)

print("\n🏆 Top 3 Best Resume Predictions:")
print(ranked_df[['Role', 'Predicted_Decision', 'Confidence_Score', 'Reason_for_decision', 'Job_Description']].head(3))

best_candidate = ranked_df.iloc[0]
print("\n🎖️ Best Candidate Summary:")
print(f"Role: {best_candidate['Role']}")
print(f"Predicted Decision: {best_candidate['Predicted_Decision']}")
print(f"Confidence: {best_candidate['Confidence_Score']*100:.2f}%")
print(f"Reason: {best_candidate['Reason_for_decision']}")
print(f"Job Description: {best_candidate['Job_Description']}")



🏆 Top 3 Best Resume Predictions:
                   Role Predicted_Decision  Confidence_Score  \
0    Software Developer             select          0.999994   
1        Data Scientist             select          0.999994   
2  Mobile App Developer             select          0.999994   

                                 Reason_for_decision  \
0  Inability to communicate clearly, Low problem-...   
1  Unsuitable for the job role, Lack of relevant ...   
2       Insufficient experience in required domains.   

                                     Job_Description  
0  Expected_experience : 0-2 years, Domains: Web ...  
1  Expected_experience : 9+ years, Domains: Data ...  
2  **Job Title:** Mobile App Developer\n\n**Job S...  

🎖️ Best Candidate Summary:
Role: Software Developer
Predicted Decision: select
Confidence: 100.00%
Reason: Inability to communicate clearly, Low problem-solving abilities
Job Description: Expected_experience : 0-2 years, Domains: Web Development, Mobile Apps


In [14]:
# Save output CSV
output_path = "ranked_resume_predictions.csv"
ranked_df.to_csv(output_path, index=False)
print(f"\n✅ Ranked results saved to: {output_path}")


✅ Ranked results saved to: ranked_resume_predictions.csv
