In [1]:
# STEP 1: Install Required Packages
!pip install -q pandas numpy transformers torch scikit-learn nltk joblib gradio PyPDF2


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# STEP 2: Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
import joblib
import gradio as gr
import PyPDF2


In [3]:
# Download stopwords
# Fetches the NLTK "stopwords" corpus; the text-cleaning step reads it through
# stopwords.words('english'), so this must run before any cleaning.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# STEP 3: Load Dataset with Encoding Fix
file_path = "/content/UpdatedResumeDataSet.csv"

# Decode as UTF-8 and substitute U+FFFD for undecodable bytes instead of failing.
# Note: read_csv spells this option `encoding_errors` (pandas >= 1.3); it has no
# `errors` keyword, and passing one raises TypeError.
try:
    data = pd.read_csv(file_path, encoding="utf-8", encoding_errors="replace")
except TypeError:
    # Older pandas without `encoding_errors`: decode leniently ourselves and
    # hand the already-open text stream to the parser.
    with open(file_path, encoding="utf-8", errors="replace", newline="") as csv_file:
        data = pd.read_csv(csv_file)

print("Columns in dataset:", data.columns)
# Column names used by every later step.
resume_column = "Resume"
category_column = "Category"

Columns in dataset: Index(['Category', 'Resume'], dtype='object')


In [5]:
# STEP 4: Clean and Preprocess Text
def clean_text(text):
    """Normalize raw resume text for tokenization.

    Steps: coerce to ``str`` and lowercase, replace every non-word character
    with a space, drop English stop words, and re-join the remaining tokens
    with single spaces.

    Args:
        text: Any value; it is converted with ``str()`` first, so ``None`` or
            NaN become the strings ``"none"`` / ``"nan"``.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    # Build the stop-word set once per call. Evaluating stopwords.words(...)
    # inside the comprehension re-read the corpus for every single word and
    # did a linear list scan on top of that.
    stop_words = set(stopwords.words('english'))

    text = re.sub(r'\W', ' ', str(text).lower())  # Remove special characters
    # str.split() with no argument already collapses runs of whitespace.
    kept_words = [word for word in text.split() if word not in stop_words]
    cleaned = ' '.join(kept_words)
    # Round-trip through UTF-8 to drop code points that cannot be encoded
    # (e.g. lone surrogates).
    return cleaned.encode("utf-8", errors="ignore").decode("utf-8")

# Add a 'cleaned_resume' column holding clean_text applied to each value of the resume column.
data['cleaned_resume'] = data[resume_column].apply(clean_text)


In [6]:
# STEP 5: Tokenization & Dataset Prep
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
# It is kept as a notebook-level global and reused by the dataset class and the
# prediction function.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class ResumeDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, attention_mask, label)`` per resume.

    Texts are tokenized lazily in ``__getitem__`` using the notebook-level
    ``tokenizer`` and are padded/truncated to ``max_length`` tokens.

    Args:
        resumes: Indexable sequence of resume strings.
        labels: Indexable sequence of integer class ids aligned with ``resumes``.
        max_length: Token length every example is padded/truncated to
            (defaults to 128, the value previously hard-coded).

    Raises:
        ValueError: If ``resumes`` and ``labels`` differ in length.
    """

    def __init__(self, resumes, labels, max_length=128):
        if len(resumes) != len(labels):
            raise ValueError(
                f"resumes and labels must have the same length "
                f"(got {len(resumes)} and {len(labels)})"
            )
        self.resumes = resumes
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.resumes)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label tensor."""
        text = self.resumes[idx]
        label = self.labels[idx]
        inputs = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the leading batch axis so the DataLoader can stack examples.
        # A bare squeeze() removes *every* size-1 axis and would also collapse
        # the sequence axis when max_length == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        # CrossEntropyLoss expects integer class targets; make the dtype explicit.
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
# STEP 6: Convert Categories to Labels
# Class ids follow the order in which each category first appears in the data;
# `categories[i]` is therefore the name for class id i.
categories = list(data[category_column].unique())
category_to_label = {name: position for position, name in enumerate(categories)}
data['category_label'] = data[category_column].map(category_to_label)


In [8]:
# STEP 7: Split Data
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is neither stratified nor de-duplicated. If the CSV
# contains repeated resumes, copies can land on both sides of the split and
# inflate test accuracy — confirm before trusting the reported score.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_resume'],
    data['category_label'],
    test_size=0.2,
    random_state=42,
)

# Plain lists give the dataset simple 0-based positional indexing.
train_dataset = ResumeDataset(list(X_train), list(y_train))
test_dataset = ResumeDataset(list(X_test), list(y_test))

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=2)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=2)


In [9]:
# STEP 8: Model Definition
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output logit per resume category.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(categories),
).to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# STEP 9: Train Model
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0  # running SUM of per-batch losses for this epoch (not a mean)
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the model's device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 1027.3183
Epoch 2, Loss: 334.1841
Epoch 3, Loss: 80.4651


In [11]:
# STEP 10: Evaluate Model
model.eval()
predictions, actuals = [], []
with torch.no_grad():  # no gradients are needed for scoring
    for batch in test_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The highest-scoring logit per row is the predicted class id.
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        actuals.extend(labels.cpu().numpy())

accuracy = accuracy_score(actuals, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 100.00%


In [12]:
# STEP 11: Save Model
# Write the trained model and the tokenizer to local directories.
model.save_pretrained("distilbert_resume_model")
tokenizer.save_pretrained("distilbert_tokenizer")
# Persist the label order as well: position i in `categories` is the class id i
# the model was trained with, so predictions cannot be decoded without it.
joblib.dump(categories, "category_mapping.pkl")



['category_mapping.pkl']

In [13]:

# STEP 12: Gradio GUI with PDF Upload (Improved Text Extraction)
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF joined into one string.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        Page texts joined by single spaces with surrounding whitespace
        stripped; pages yielding no text are skipped, so the result is ``""``
        when nothing could be extracted.
    """
    page_texts = (page.extract_text() for page in PyPDF2.PdfReader(file).pages)
    # extract_text() may give None or "" for a page; keep only real content.
    return " ".join(chunk for chunk in page_texts if chunk).strip()

def classify_resume_from_pdf(pdf_file):
    """Predict the job category of an uploaded PDF resume.

    Args:
        pdf_file: Value supplied by the Gradio file input, passed on to
            ``extract_text_from_pdf``; ``None`` when nothing was uploaded.

    Returns:
        A Markdown string: the predicted category, or a warning when no file
        was provided, the PDF could not be parsed, or it contained no text.
    """
    # Submitting the form without a file hands us None; don't pass that to the PDF reader.
    if pdf_file is None:
        return "⚠️ Please upload a PDF resume first."

    try:
        raw_text = extract_text_from_pdf(pdf_file)
    except Exception as exc:  # UI boundary: report unreadable PDFs instead of crashing the handler
        return f"⚠️ Could not read the uploaded PDF: {exc}"
    if not raw_text:
        return "⚠️ Could not extract any text from the uploaded PDF."

    # Apply the same cleaning and tokenization settings that were used for training.
    cleaned_text = clean_text(raw_text)
    inputs = tokenizer(cleaned_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Predictions must not depend on which cell ran last: force eval mode so
    # dropout is off even if the training cell was re-run after evaluation.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()

    return f"📄 Predicted Category: **{categories[pred]}**"

# Build the input/output components first, then wire them to the classifier.
resume_upload = gr.File(label="Upload Resume (PDF Only)", file_types=[".pdf"])
prediction_view = gr.Markdown(label="Prediction")

interface = gr.Interface(
    fn=classify_resume_from_pdf,
    inputs=resume_upload,
    outputs=prediction_view,
    title="Resume Category Classifier (PDF)",
    description="Upload a PDF resume to get the predicted job category using a fine-tuned DistilBERT model.",
)

# Start the web app.
interface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5930bdf82f7fc71da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


