In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

#**DataSet Loading**

In [None]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
print("Total Memory (GB):", torch.cuda.get_device_properties(0).total_memory / 1e9 if torch.cuda.is_available() else "No GPU")

CUDA Available: True
CUDA Version: 12.1
GPU Name: Tesla T4
Total Memory (GB): 15.828320256


In [None]:
!pip install --upgrade kagglehub

import kagglehub
path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
print("Path to dataset files:", path)

import os

print("Dataset files:", os.listdir(path))

import pandas as pd

csv_file = os.path.join(path, "Resume/Resume.csv")
df = pd.read_csv(csv_file)

print(df.head())

Path to dataset files: /kaggle/input/resume-dataset
Dataset files: ['Resume', 'data']
         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


#**DataSet Cleaning**

In [None]:

import re
import string
import spacy
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm")

print(df.isnull().sum())
df.dropna(subset=["Resume_str", "Category"], inplace=True)

def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

df["Resume_str"] = df["Resume_str"].apply(remove_html)

def clean_text(text):
    text = text.lower()  #lowercase conversion
    text = re.sub(r"\d+", "", text) #numbers
    text = re.sub(r"[^\w\s]", "", text)  #special characters
    text = re.sub(r"\s+", " ", text).strip()  #extra spaces
    return text

df["Resume_str"] = df["Resume_str"].apply(clean_text)

stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

df["Resume_str"] = df["Resume_str"].apply(remove_stopwords)

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

df["Resume_str"] = df["Resume_str"].apply(lemmatize_text)

df.drop_duplicates(subset=["Resume_str"], inplace=True)

df.to_csv("cleaned_resume_dataset.csv", index=False)

print(df.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ID             0
Resume_str     0
Resume_html    0
Category       0
dtype: int64
         ID                                         Resume_str  \
0  16852973  hr administratormarkete associate hr administr...   
1  22323967  hr specialist we hr operation summary versatil...   
2  33176873  hr director summary year experience recruiting...   
3  27018550  hr specialist summary dedicate drive dynamic y...   
4  17812897  hr manager skill highlight hr skill hr departm...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [None]:
print(df['Category'].unique())

['HR' 'DESIGNER' 'INFORMATION-TECHNOLOGY' 'TEACHER' 'ADVOCATE'
 'BUSINESS-DEVELOPMENT' 'HEALTHCARE' 'FITNESS' 'AGRICULTURE' 'BPO' 'SALES'
 'CONSULTANT' 'DIGITAL-MEDIA' 'AUTOMOBILE' 'CHEF' 'FINANCE' 'APPAREL'
 'ENGINEERING' 'ACCOUNTANT' 'CONSTRUCTION' 'PUBLIC-RELATIONS' 'BANKING'
 'ARTS' 'AVIATION']


In [None]:
!pip uninstall gensim -y
!pip install gensim

#**Data Augmentation**

In [None]:
import random
import nltk
import pandas as pd
from nltk.corpus import wordnet
from gensim.models import Word2Vec
from transformers import pipeline

nltk.download('wordnet')
nltk.download('omw-1.4')
paraphrase = pipeline("text2text-generation", model="humarin/chatgpt_paraphraser_on_T5_base")
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(n):
        word = random.choice(words)
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words[new_words.index(word)] = synonym
    return ' '.join(new_words)
df["synonym_replacement"] = df["Resume_str"].apply(lambda x: synonym_replacement(x))
df.to_csv("augmented_resume_dataset.csv", index=False)
print(df.head())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


         ID                                         Resume_str  \
0  16852973  hr administratormarkete associate hr administr...   
1  22323967  hr specialist we hr operation summary versatil...   
2  33176873  hr director summary year experience recruiting...   
3  27018550  hr specialist summary dedicate drive dynamic y...   
4  17812897  hr manager skill highlight hr skill hr departm...   

                                         Resume_html Category  \
0  <div class="fontsize fontface vmargins hmargin...       HR   
1  <div class="fontsize fontface vmargins hmargin...       HR   
2  <div class="fontsize fontface vmargins hmargin...       HR   
3  <div class="fontsize fontface vmargins hmargin...       HR   
4  <div class="fontsize fontface vmargins hmargin...       HR   

                                 synonym_replacement  
0  hr administratormarkete associate hr administr...  
1  hr specialist we hr operation summary versatil...  
2  hr director summary year experience recruiti

#**Vectorization**

In [None]:

from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=df["synonym_replacement"], vector_size=100, window=5, min_count=1, workers=4)

def get_word2vec_vector(tokens):
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * 100

df["word2vec_vector"] = df["synonym_replacement"].apply(get_word2vec_vector)
print(df.head())
from sklearn.feature_extraction.text import TfidfVectorizer

df["processed_text_tf-idf"] = df["synonym_replacement"].apply(lambda x: " ".join(x))

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectors = tfidf_vectorizer.fit_transform(df["synonym_replacement"])

tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

df = pd.concat([df, tfidf_df], axis=1)
print(df.head())
df.to_csv("vectorized_resume_dataset.csv", index=False)



         ID                                         Resume_str  \
0  16852973  hr administratormarkete associate hr administr...   
1  22323967  hr specialist we hr operation summary versatil...   
2  33176873  hr director summary year experience recruiting...   
3  27018550  hr specialist summary dedicate drive dynamic y...   
4  17812897  hr manager skill highlight hr skill hr departm...   

                                         Resume_html Category  \
0  <div class="fontsize fontface vmargins hmargin...       HR   
1  <div class="fontsize fontface vmargins hmargin...       HR   
2  <div class="fontsize fontface vmargins hmargin...       HR   
3  <div class="fontsize fontface vmargins hmargin...       HR   
4  <div class="fontsize fontface vmargins hmargin...       HR   

                                 synonym_replacement  \
0  hr administratormarkete associate hr administr...   
1  hr specialist we hr operation summary versatil...   
2  hr director summary year experience recru

#**Pretesting with KNN**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

X = df.drop(columns=['Category', 'word2vec_vector'])
y = df['Category']
scaler = StandardScaler()
encoder = LabelEncoder()
y = encoder.fit_transform(y)
for column in X.select_dtypes(include=['object']).columns:
    X[column] = encoder.fit_transform(X[column])

imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

KNN Accuracy: 0.4567
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.79      0.57        28
           1       0.26      0.37      0.30        30
           2       0.23      0.30      0.26        10
           3       0.26      0.43      0.32        14
           4       0.23      0.28      0.25        18
           5       0.17      0.20      0.18         5
           6       0.52      0.61      0.56        23
           7       0.35      0.44      0.39        16
           8       1.00      0.33      0.50         3
           9       0.25      0.26      0.26        23
          10       0.83      0.61      0.70        31
          11       0.88      0.66      0.75        32
          12       0.15      0.13      0.14        23
          13       0.55      0.63      0.59        19
          14       0.61      0.64      0.62        22
          15       0.41      0.50      0.45        14
          16       0.42      0.20   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#**Testing with Traditional ML Models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {accuracy_logreg:.4f}")
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.5634
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.71      0.73        28
           1       0.54      0.43      0.48        30
           2       0.50      0.40      0.44        10
           3       0.38      0.43      0.40        14
           4       0.38      0.33      0.35        18
           5       1.00      0.40      0.57         5
           6       0.32      0.87      0.47        23
           7       0.69      0.56      0.62        16
           8       1.00      0.67      0.80         3
           9       0.47      0.39      0.43        23
          10       0.84      0.68      0.75        31
          11       0.88      0.69      0.77        32
          12       0.50      0.22      0.30        23
          13       0.94      0.84      0.89        19
          14       0.56      0.64      0.60        22
          15       0.38      0.57      0.46        14


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.5694
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.71      0.73        28
           1       0.59      0.67      0.62        30
           2       0.31      0.40      0.35        10
           3       0.39      0.50      0.44        14
           4       0.19      0.17      0.18        18
           5       0.17      0.20      0.18         5
           6       0.62      0.65      0.64        23
           7       0.35      0.38      0.36        16
           8       0.00      0.00      0.00         3
           9       0.36      0.39      0.38        23
          10       0.92      0.71      0.80        31
          11       0.90      0.81      0.85        32
          12       0.75      0.39      0.51        23
          13       0.82      0.74      0.78        19
          14       0.36      0.41      0.38        22
          15       0.53      0.71      0.61        14
          16

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.7324
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.93      0.85        28
           1       0.83      0.63      0.72        30
           2       1.00      0.40      0.57        10
           3       0.59      0.71      0.65        14
           4       1.00      0.17      0.29        18
           5       1.00      0.20      0.33         5
           6       0.83      0.87      0.85        23
           7       0.57      0.75      0.65        16
           8       0.00      0.00      0.00         3
           9       0.76      0.57      0.65        23
          10       0.86      0.77      0.81        31
          11       0.86      0.78      0.82        32
          12       0.71      0.43      0.54        23
          13       0.89      0.89      0.89        19
          14       0.90      0.86      0.88        22
          15       0.57      0.93      0.70        14
          16

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
pip install transformers datasets torch scikit-learn pandas numpy

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

#**BERT Processing (Final Classification)**

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from transformers import get_scheduler
import torch.nn.functional as F
from tqdm import tqdm

df = df[['Resume_str', 'Category']]

label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Resume_str'].astype(str).tolist(),
    df['Category'].tolist(),
    test_size=0.2,
    random_state=42
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Category'] = label_encoder.fit_transform(df['Category'])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class ResumeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
train_dataset = ResumeDataset(train_encodings, train_labels)
val_dataset = ResumeDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(df['Category'].unique()))
model.to(device)
for param in model.bert.parameters():
    param.requires_grad = False

for param in model.bert.encoder.layer[-3:].parameters():
    param.requires_grad = True

optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
epochs = 20

for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 249/249 [01:28<00:00,  2.81it/s, loss=2.03]
Epoch 1: 100%|██████████| 249/249 [01:26<00:00,  2.87it/s, loss=0.74]
Epoch 2: 100%|██████████| 249/249 [01:26<00:00,  2.86it/s, loss=2.19]
Epoch 3: 100%|██████████| 249/249 [01:27<00:00,  2.85it/s, loss=0.899]
Epoch 4: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=0.577]
Epoch 5: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=1.69]
Epoch 6: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=0.777]
Epoch 7: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=0.81]
Epoch 8: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=1.48]
Epoch 9: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=0.681]
Epoch 10: 100%|██████████| 249/249 [01:26<00:00,  2.86it/s, loss=0.491]
Epoch 11: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=0.354]
Epoch 12: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=0.445]
Epoch 13: 100%|██████████| 249/249 [01:27<00:00,  2.86it/s, loss=1.74]
Epoch 14:

In [None]:
model.eval()
total, correct = 0, 0

for batch in val_loader:
    batch = {key: val.to(device) for key, val in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    correct += (predictions == batch["labels"]).sum().item()
    total += batch["labels"].size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 83.70%


In [None]:
sample_text = "Highly skilled software engineer with 5+ years of experience in full-stack development"

inputs = tokenizer(sample_text, padding=True, truncation=True, return_tensors="pt", max_length=512)

inputs = {key: val.to(device) for key, val in inputs.items()}

In [None]:
with torch.no_grad():
    outputs = model(**inputs)
predicted_class = torch.argmax(outputs.logits, dim=1).item()

print(f"Predicted Category: {predicted_class}")

Predicted Category: 12


In [None]:
original_category = label_encoder.inverse_transform([predicted_class])[0]

print(f"Predicted Category (decoded): {original_category}")

Predicted Category (decoded): CONSULTANT


In [None]:
model.save_pretrained("resume_classification_model")

In [None]:
!ls


augmented_resume_dataset.csv  sample_data
cleaned_resume_dataset.csv    vectorized_resume_dataset.csv
resume_classification_model


In [None]:
import shutil
from google.colab import files

shutil.make_archive('resume_model', 'zip', 'resume_classification_model')

files.download('resume_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>