In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, Word2Vec
import nltk
import pandas as pd
import numpy as np
import pdfplumber
import re
import faiss

from io import StringIO
from html.parser import HTMLParser

Download the additional NLTK resources (the Punkt tokenizer data) that `word_tokenize` needs.

In [None]:
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases read it from
# the 'punkt' package, newer ones from 'punkt_tab', so fetch both to work with either.
# nltk.download() reports an unavailable package and returns False instead of raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

The Rabota.ru jobs dataset was chosen for training: it is the largest and probably the most diverse one (the jobs are distributed all across Russia, and IT is probably less prevalent here than on hh.ru).

In [None]:
# Vacancy dump used for training; the fields in this file are pipe-separated.
vacancies_csv = 'data/vacancy.csv'
df = pd.read_csv(vacancies_csv, delimiter='|')
df.head()

In [None]:
# Kaggle LinkedIn job postings, used as the English part of the training data.
english_postings_csv = 'data/en_job_postings.csv'
edf = pd.read_csv(english_postings_csv)
edf

Preprocessing: remove HTML tags and lowercase everything (punctuation removal is currently disabled in the code).

In [None]:
# HTML stripping (https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python)
class MLStripper(HTMLParser):
    """HTML parser that throws away the markup and accumulates only the text.

    Character references such as ``&amp;`` are decoded by the parser because
    ``convert_charrefs`` is enabled.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: The markup (or plain text) to clean, as a string.

    Returns:
        The text content of ``html`` as one string.
    """
    s = MLStripper()
    s.feed(html)
    # feed() treats its input as a possibly incomplete stream: trailing text that ends
    # with an unterminated '&' (e.g. "r&d") is held back in case more data follows.
    # close() signals end of input and flushes that buffered text to handle_data().
    s.close()
    return s.get_data()

In [None]:
# Copy the text columns under short working names. Missing values are replaced with empty
# strings *before* the string cast: casting NaN yields the literal 'nan', and removing that
# substring afterwards also mangles real words containing it (e.g. 'finance' -> 'fice').
source_cols = ['vacancy_name', 'professionalSphereName', 'additional_requirements', 'position_requirements', 'position_responsibilities']
short_cols = ['vac', 'sph', 'a_req', 'p_req', 'p_res']
df[short_cols] = df[source_cols].fillna('').astype(str)

# One training document per vacancy: all text fields joined with single spaces.
df['total_req'] = df['vac'] + ' ' + df['sph'] + ' ' + df['a_req'] + ' ' + df['p_req'] + ' ' + df['p_res']
df['total_req'] = df['total_req'].str.strip(' ')
df['total_req'] = df['total_req'].apply(lambda x: strip_tags(x.lower()))  # remove HTML tags, lower case
# df['total_req'] = df['total_req'].apply(lambda x: re.sub(r'[^\w\s]', '', strip_tags(x.lower())))  # remove HTML tags, lower case, remove punctuation
df.head()

In [None]:
# Replace missing values with empty strings *before* the string cast. Casting NaN gives the
# literal 'nan', and deleting that substring afterwards would also corrupt ordinary English
# words that contain it ('finance', 'maintenance', 'tenant', ...).
edf[['title', 'description']] = edf[['title', 'description']].fillna('').astype(str)

# One training document per posting: title followed by description.
edf['total_req'] = (edf['title'] + ' ' + edf['description']).str.strip(' ')
edf['total_req'] = edf['total_req'].apply(lambda x: strip_tags(x.lower()))  # remove HTML tags, lower case
edf.head()

In [None]:
# Training corpus: the documents from `df` followed by those from `edf`. Each document is
# tokenized with NLTK and tagged with its position in the combined list (as a string).
data = df['total_req'].tolist() + edf['total_req'].tolist()

tagged_data = []
for doc_id, doc_text in enumerate(data):
    tokens = nltk.tokenize.word_tokenize(doc_text)
    tagged_data.append(TaggedDocument(words=tokens, tags=[str(doc_id)]))

init model

In [None]:
# Doc2Vec hyper-parameters (see the gensim documentation for their exact meaning):
#   vector_size - dimensionality of the document vectors; the FAISS index built later in
#                 this notebook must use the same dimension
#   min_count   - minimum word frequency for a word to be kept in the vocabulary
#   epochs      - number of training passes over the corpus
doc2vec_params = {
    'vector_size': 80,
    'min_count': 10,
    'epochs': 70,
}
model = Doc2Vec(**doc2vec_params)

build vocabulary

In [None]:
# Scan the tagged corpus to build the model vocabulary, then show how many words it holds.
model.build_vocab(tagged_data)
vocab_size = len(model.wv.key_to_index)
vocab_size  # vocab size

train model

In [None]:
# Train on the tagged corpus for the number of epochs configured on the model,
# then persist the trained model to disk.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save('model/doc2vec_v4en.model')

# Get Embeddings for dataset

In [None]:
# Vacancies that will be embedded and indexed for similarity search.
jobs_csv = 'data/hhparser_vacancy_short.csv'
jobs = pd.read_csv(jobs_csv)
jobs.head()

process data

In [None]:
# Fill missing fields *before* concatenating: NaN + str is NaN, so a vacancy lacking either
# its name or its description would otherwise collapse to the literal text 'nan'.
job_name = jobs['name'].fillna('').astype(str)
job_description = jobs['description'].fillna('').astype(str)
jobs['content'] = (job_name + ' ' + job_description).str.strip(' ')
jobs['content'] = jobs['content'].apply(lambda x: strip_tags(x.lower()))  # Remove HTML, lower case
# jobs['content'] = jobs['content'].apply(lambda x: re.sub(r'[^\w\s]', '', strip_tags(x.lower())))  # Remove HTML, lower case
jobs.head()

get vectors

In [None]:
# Optional: uncomment to load the model saved above instead of retraining it.
# model = Doc2Vec.load('model/doc2vec_v4en.model')

In [None]:
# Tokenize exactly as the training corpus was tokenized (NLTK word_tokenize). A plain
# str.split() leaves punctuation glued to words ("python," instead of "python", ","),
# producing tokens that are not in the model vocabulary and are ignored at inference.
jobs['embd'] = jobs['content'].apply(lambda x: model.infer_vector(nltk.tokenize.word_tokenize(x)))
jobs.head()

Write the vectors to a FAISS index and store it on disk. The ids in the index should match the row positions in the jobs CSV.

In [None]:
# Flat L2 index. The dimension is read from the model instead of being hard-coded, so the
# index cannot silently disagree with the model that produced the embeddings.
index = faiss.IndexFlatL2(model.vector_size)
print(index.is_trained)

# FAISS takes a 2-D float32 matrix with one row per vector; rows are added in the order of
# `jobs`, so the id of each vector is the position of its row.
embeddings = np.vstack(jobs['embd'].tolist()).astype('float32')
index.add(embeddings)
print(index.ntotal)

In [None]:
# Persist the index so it can be reloaded without re-embedding the vacancies.
index_path = "model/hh_v4en.index"
faiss.write_index(index, index_path)

# Testing

In [None]:
font_sizes = []
words = []  # word dicts from *all* pages (each has at least 'text' and 'size')
weighted_text = []

# Pass 1: read every page. The median font size is a document-wide statistic, so the
# weighting below can only start once all pages have been collected.
with pdfplumber.open('data/sample2.pdf') as pdf:
    for page in pdf.pages:
        page_words = page.extract_words(x_tolerance=2, keep_blank_chars=True, use_text_flow=True, extra_attrs=["fontname", "size"])
        words.extend(page_words)
        font_sizes.extend([float(word['size']) for word in page_words])

# Calculate median font size (None when nothing was extracted; the loop below is then a no-op)
median_size = np.median(font_sizes) if font_sizes else None

# Pass 2: weight the words of the whole document, not just those of the last page.
for word in words:
    # a bit of processing - some non-letter chars can be considered as spaces
    word_text = word['text'].replace('/', ' ').replace('-', ' ').strip('.,').lower()
    word_size = float(word['size'])

    # Words set larger than the median font size get weight 1.2 * size / median;
    # every other word gets weight 1.
    if word_size > median_size:
        weight = (word_size * 1.2 / median_size)
    else:
        weight = 1

    # Replicate word based on weight (rounded to the nearest whole number of copies)
    weighted_text.extend([word_text] * int(round(weight)))

# Drop entries without any letter, then re-split so entries that contain spaces
# (introduced by the '/' and '-' replacement above) become separate tokens.
weighted_text = " ".join([e for e in weighted_text if any(c.isalpha() for c in e)]).split()

weighted_text

In [None]:
# Plain-text version of the resume: per-page text with whitespace collapsed and
# punctuation removed, joined with spaces and lower-cased.
page_texts = []

with pdfplumber.open('data/sample2.pdf') as pdf:
    for page in pdf.pages:
        raw_text = page.extract_text(x_tolerance=2)
        if not raw_text:
            continue  # nothing extractable on this page
        collapsed = re.sub(r'\s+', ' ', raw_text)  # Normalize whitespace
        page_texts.append(re.sub(r'[^\w\s]', '', collapsed))  # Remove punctuation

resume = ' '.join(chunk.replace('\n', ' ').lower() for chunk in page_texts)

resume

In [None]:
# Embed the font-weighted resume tokens and wrap the vector as a one-row query matrix.
query_vector = model.infer_vector(weighted_text)
v1 = np.array([query_vector])

# find the closest embeddings in index
top_k = 5
search_result = index.search(v1, top_k)
distances, indices = search_result
distances, indices

In [None]:
# FAISS ids are insertion positions (the row order of `jobs` when the index was built), so
# rows are looked up by position rather than by label. An id of -1 is padding returned when
# fewer neighbours than requested exist and must be dropped before the lookup.
hit_positions = [int(i) for i in indices[0] if i >= 0]
jobs.iloc[hit_positions]