In [1]:
import pandas as pd
import numpy as np

In [2]:
resume = pd.read_csv('Resume.csv')
resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
df = resume[['Resume_str', 'Category']]
df.columns = ['text', 'label']  # Rename columns for clarity

data = df.to_dict(orient='records')


import json

with open('resume.json', 'w') as f:
    json.dump(data, f, indent=2)


In [5]:
pip install transformers datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='resume.json')


Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
from sklearn.model_selection import train_test_split

train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42, stratify=train_val_df['label'])

train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)


In [10]:
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

datasets = DatasetDict({
    'train': load_dataset('csv', data_files='train_data.csv')['train'],
    'validation': load_dataset('csv', data_files='val_data.csv')['train'],
    'test': load_dataset('csv', data_files='test_data.csv')['train']
})

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1788 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

In [15]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification

# Define the path to save the model checkpoints and logs
output_dir = './results'

training_args = TrainingArguments(
    output_dir=output_dir,                  
    per_device_train_batch_size=8,          
    per_device_eval_batch_size=8,           
    num_train_epochs=3,                     
    weight_decay=0.01,                      
    logging_dir='./logs',                   
    logging_steps=10,                       
    evaluation_strategy='epoch',            
    save_strategy='epoch',                  
    load_best_model_at_end=True,            
)


In [19]:
unique_categories = resume['Category'].unique()
num_categories = len(unique_categories)
print(f"Number of categories: {num_categories}")

Number of categories: 24


In [25]:
import pandas as pd
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict, ClassLabel


df = pd.read_csv('resume.csv')
df.rename(columns={'Resume_str': 'text', 'Category': 'label'}, inplace=True)

df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(str)


dataset = Dataset.from_pandas(df)

features = dataset.features.copy()
features['label'] = ClassLabel(names=df['label'].unique().tolist())

dataset = dataset.cast(features)


dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='label')

dataset['train'], dataset['validation'] = dataset['train'].train_test_split(test_size=0.1, stratify_by_column='label').values()

Casting the dataset:   0%|          | 0/2484 [00:00<?, ? examples/s]

In [26]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

output_dir = './results'
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)


Map:   0%|          | 0/1788 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]



In [27]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['label'].unique()))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)


trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,3.162,3.147442
2,3.1918,3.14008
3,2.7909,2.755109


TrainOutput(global_step=672, training_loss=3.0928398938406083, metrics={'train_runtime': 31294.9877, 'train_samples_per_second': 0.171, 'train_steps_per_second': 0.021, 'total_flos': 1411606479273984.0, 'train_loss': 3.0928398938406083, 'epoch': 3.0})

In [28]:
eval_results = trainer.evaluate(tokenized_datasets['test'])
print(eval_results)


{'eval_loss': 2.7686209678649902, 'eval_runtime': 19695.7715, 'eval_samples_per_second': 0.025, 'eval_steps_per_second': 0.003, 'epoch': 3.0}


In [29]:
model.save_pretrained("fine-tuned-resume-model")
tokenizer.save_pretrained("fine-tuned-resume-tokenizer")

('fine-tuned-resume-tokenizer\\tokenizer_config.json',
 'fine-tuned-resume-tokenizer\\special_tokens_map.json',
 'fine-tuned-resume-tokenizer\\vocab.txt',
 'fine-tuned-resume-tokenizer\\added_tokens.json',
 'fine-tuned-resume-tokenizer\\tokenizer.json')

In [35]:
from transformers import pipeline

classifier = pipeline('text-classification', model="fine-tuned-resume-model", tokenizer="fine-tuned-resume-tokenizer")
result = classifier("We are looking for a HR SPECIALIST skilled in Microsoft office suite Dynamic with over 20 years of customer service expertise to join our dynamic team")
print(result)


[{'label': 'LABEL_15', 'score': 0.059139102697372437}]


In [34]:
result = classifier("Pick up the top 10 profiles for the following job description, We are looking for a skilled UI Developer to join our dynamic team. The ideal candidate will have a strong background in front-end development, with proficiency in HTML, CSS, JavaScript, and modern frameworks like React or Angular. Your primary responsibility will be to create visually appealing and user-friendly web interfaces that enhance user experience and align with our brand guidelines")
result

[{'label': 'LABEL_14', 'score': 0.07018663734197617}]

In [40]:
resumes.head()

NameError: name 'resumes' is not defined

In [39]:
from elasticsearch import Elasticsearch, helpers

In [43]:
def search_elasticsearch(query):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "match": {
                    "content": query
                }
            }
        }
    )
    return response['hits']['hits']


In [49]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "bert-base-uncased"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
def generate_response(query):
    # Retrieve relevant documents
    hits = search_elasticsearch(query)
    documents = [hit['_source']['content'] for hit in hits]
    
    context = ' '.join(documents)
    
    inputs = tokenizer.encode(f"Context: {context} Question: {query}", return_tensors='pt')
    outputs = model.generate(inputs, max_length=500, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [52]:
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

input_text = "Translate English to French: 'Hello, how are you?'"
inputs = tokenizer.encode(input_text, return_tensors="pt")

outputs = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Translate English to French: 'Hello, how are you?'


In [53]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def index_resumes(resumes):
    actions = []
    for resume in resumes:
        action = {
            "_index": "resumes",  
            "_id": resume['candidate_id'],
            "_source": resume
        }
        actions.append(action)

    success, failures = bulk(es, actions, chunk_size=1000)
    print(f"Indexed {success} documents, failed {len(failures)} documents")

In [58]:
es = Elasticsearch(hosts=["http://localhost:9200"])
def search_elasticsearch(query):
    response = es.search(
        index=index_name,
        body={
  "query": {
    "multi_match": {
      "query": "UI developer",
      "fields": ["skills", "experience.title"]
    }
  }
        }
            )

