In [228]:
# PyPDF2
from PyPDF2 import PdfReader
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re


from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### Function to Clean Text

In [229]:
def cleanText(cleanText):
    # Step 1
    cleanText = re.sub('http\S+\s*', ' ', cleanText)  # remove URLs
    cleanText = re.sub('RT|cc', ' ', cleanText)  # remove RT and cc
    cleanText = re.sub('#\S+', '', cleanText)  # remove hashtags
    cleanText = re.sub('@\S+', '  ', cleanText)  # remove mentions
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)  # remove punctuations
    cleanText = re.sub(r'[^\x00-\x7f]',r' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)  # remove extra whitespace

    # Step 2
    cleanText = re.sub('[^A-Za-z]', ' ', cleanText)
    cleanText = " ".join(cleanText.split()).lower() #remove extra and trailing space + map to lower case

    # Step 3 - Clean Stopwords - Using NLTK Lib
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(cleanText)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    #with no lower case conversion
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

    final_cleanText = ' '.join(filtered_sentence)
    
    return final_cleanText


### Read Resume in PDF file

In [230]:
reader = PdfReader("./resume_pdf/cv_sophal.pdf")
number_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()
text

'PAGE 1  Thear Sophal          Address  : Str.289, Sangkat Boeung Kak 2, Khan Toul Kouk, Phnom Penh       Contact  : +855 81 591 594 / LinkedIn       Email  : sophal.thear@cadt.edu.kh / thearsophaltsp@gmail.com    Objective Apply for Data Science Researcher Personal  Information Gender             :  Male Nationality      :  Cambodian Date of Birth   :  24th July 1999 Place of Birth  :  Phnom Penh, Cambodia   Internship, Job, and Other Experiences  August/2023 – Present Data Science Researcher at Cambodia Academy of Digital Technology, Phnom Penh, Cambodia Responsibilities: • Digital Essential Skill Data Analysis Project • Resume Ranking Base on Job Description Project • Teaching Assistant of Information Visualization Course  February/2023 – July/2023 (Research Project) Predicting Proteins Involved in Secretion Systems: A Large, Imbalanced Supervised Learning Problem at TIMC, Grenoble, France Responsibilities: • Study statistic information about protein data (~84 millions proteins) • C

In [231]:
reader = PdfReader("./resume_pdf/cv_sophal.pdf")
combined_text = ''
for i in range(number_of_pages):
    page = reader.pages[i]
    text = page.extract_text()
    combined_text = combined_text + text
    

In [232]:
resume = cleanText(combined_text)
resume

'page thear sophal address str sangkat boeung kak khan toul kouk phnom penh contact linkedin email sophal thear thearsophaltsp objective apply data science researcher personal information gender male nationality cambodian date birth th july place birth phnom penh cambodia internship job experiences august present data science researcher cambodia academy digital technology phnom penh cambodia responsibilities digital essential skill data analysis project resume ranking base job description project teaching assistant information visualization course february july research project predicting proteins involved secretion systems large imbalanced supervised learning problem timc grenoble france responsibilities study statistic information protein data millions proteins cluster protein reduce similar protein find suitable techniques sample positive negative class create machine learning baseline predict protein function secretion system ml modelling analysis february july research project dat

In [233]:
count_word = resume.split()
print("Word Count:", len(count_word))

Word Count: 336


In [234]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [235]:
def calculate_similarity(resume, jd):
    embedding_1 = model.encode([resume])
    embedding_2 = model.encode([jd])

    similarity_rate = cosine_similarity(embedding_1, embedding_2)
    similarity_rate_percentage = format(similarity_rate[0][0]*100, '.2f')
    # print("Similarity Rate  :", format(similarity_rate[0][0]*100, '.2f')+'%')
    
    return [float(similarity_rate), float(similarity_rate_percentage)]
    

In [236]:
jd = '2 positions for cambodia boa registered architect or senior architect. Hq in singapore with offices in malaysia thailand the philippines vietnam china hong kong macau and the UK able to perform conceptual design up to design development. Tender documentation is a plus. Able to present independently to clients and lead team members undertaking projects and coordinating with other consultants and clients. Design of site plans CAD layout elevation and detailed drawings. Well versed in design and technical knowledge confident to work with overseas project and overseas offices good managerial and team player skills ensure projects are completed in accordance to internal guidelines and compliance with the relevant government regulatory requirement.please send your CV to smid.career@gmail.com.'
jd = cleanText(jd)
jd

'positions cambodia boa registered architect senior architect hq singapore offices malaysia thailand philippines vietnam china hong kong macau uk able perform conceptual design design development tender documentation plus able present independently clients lead team members undertaking projects coordinating consultants clients design site plans cad layout elevation detailed drawings well versed design technical knowledge confident work overseas project overseas offices good managerial team player skills ensure projects completed ordance internal guidelines compliance relevant government regulatory requirement please send cv smid career'

In [237]:
calculate_similarity(resume, jd)

[0.31990906596183777, 31.99]

### Read Job Description Dataset

In [238]:
df_jd = pd.read_excel('./dataset/job_description.xlsx')
df_jd_it_job_only = df_jd[df_jd['Job_Category'] == 'IT'] # dataframe job related to IT

df_jd = df_jd[['Position', 'Job_Description']]
df_jd = df_jd.rename(columns={'Position': 'position', 'Job_Description': 'job_description'})
df_jd.head(3)

Unnamed: 0,position,job_description
0,Poster Designer,Japnese language understanding. Video editing ...
1,BOA-Registered Architect/ Senior Architect,2 positions for cambodia boa registered archit...
2,Interior Architect/ Intermediate Interior Desi...,The company is seeking an experienced INTERIOR...


In [239]:
df_jd_it_job_only = df_jd_it_job_only[['Position', 'Job_Description']]
df_jd_it_job_only = df_jd_it_job_only.rename(columns={'Position': 'position', 'Job_Description': 'job_description'})
df_jd_it_job_only.head(3)

df_jd_it_job_only['final_cleaned_jd'] = df_jd_it_job_only.job_description.apply(lambda x: cleanText(x))

In [240]:
df_jd_it_job_only

Unnamed: 0,position,job_description,final_cleaned_jd
95,Accelerated Skills & Knowledge,Introduction. Amret has developed a dedicated ...,introduction amret developed dedicated program...
675,Accounting & System Coordinator,Daily support on system issue error from users...,daily support system issue error users control...
991,Acquiring & Ecommerce Officer,Key responsibilities. Onsite offsite training ...,key responsibilities onsite offsite training s...
1501,Analyst Cybersecurity,Review all activity security alert and analyst...,review activity security alert analyst endpoin...
1513,Analyst Programmer,,none
...,...,...,...
26236,Senior IT Internal Audit Officer,Prepare detail plan of IT related for the year...,prepare detail plan related year aligning over...
26237,Senior Quality Assurance Specialist,Work with project team development and busines...,work project team development business team cr...
26238,Senior Branch Network Officer,Develop tool and follow up the office ATM rent...,develop tool follow office atm rental contract...
26260,Web And Apps Development Expert,Job description under the supervision of the l...,job description supervision lead trainer parti...


In [241]:
# clean job description
# df_jd['final_cleaned_jd'] = df_jd.job_description.apply(lambda x: cleanText(x))

In [242]:
# df_jd.shape

In [243]:
# df_jd.head(15)

In [244]:
score = []
i = 1
for jd in df_jd_it_job_only['final_cleaned_jd']:
    result = calculate_similarity(resume, jd)
    print(i, result)
    print()
    i += 1
    score.append(result[1])

1 [0.19725847244262695, 19.73]

2 [0.24017667770385742, 24.02]

3 [0.21468698978424072, 21.47]

4 [0.24306733906269073, 24.31]

5 [0.0934692919254303, 9.35]

6 [0.14646324515342712, 14.65]

7 [0.10422974079847336, 10.42]

8 [0.21794086694717407, 21.79]

9 [0.31328922510147095, 31.33]

10 [0.31328922510147095, 31.33]

11 [0.05590354651212692, 5.59]

12 [0.30731749534606934, 30.73]

13 [0.16599448025226593, 16.6]

14 [0.16599448025226593, 16.6]

15 [0.16599448025226593, 16.6]

16 [0.16599448025226593, 16.6]

17 [0.16599448025226593, 16.6]

18 [0.1299818754196167, 13.0]

19 [0.21016404032707214, 21.02]

20 [0.25630730390548706, 25.63]

21 [0.3081475794315338, 30.81]

22 [0.3081475794315338, 30.81]

23 [0.3081475794315338, 30.81]

24 [0.2618781328201294, 26.19]

25 [0.2648784816265106, 26.49]

26 [0.275544136762619, 27.55]

27 [0.17819856107234955, 17.82]

28 [0.275544136762619, 27.55]

29 [0.1950526088476181, 19.51]

30 [0.2083686888217926, 20.84]

31 [0.2703837454319, 27.04]

32 [0.31897

In [245]:
sorted(score, reverse=True)

[56.18,
 56.18,
 56.18,
 55.06,
 53.6,
 53.6,
 51.24,
 50.08,
 49.73,
 49.73,
 49.73,
 49.73,
 48.27,
 48.26,
 47.6,
 47.6,
 47.6,
 47.6,
 47.42,
 47.22,
 46.99,
 46.94,
 46.94,
 46.32,
 46.03,
 45.93,
 45.69,
 45.6,
 45.59,
 45.47,
 45.45,
 45.45,
 45.45,
 45.45,
 45.45,
 45.45,
 45.27,
 45.05,
 44.51,
 44.31,
 44.22,
 43.82,
 43.73,
 43.54,
 43.54,
 43.41,
 43.36,
 43.36,
 43.06,
 43.03,
 42.71,
 42.45,
 42.33,
 42.26,
 42.06,
 41.93,
 41.9,
 41.87,
 41.87,
 41.87,
 41.67,
 41.47,
 41.39,
 41.38,
 41.31,
 41.31,
 41.31,
 41.11,
 41.05,
 41.05,
 41.05,
 40.91,
 40.91,
 40.91,
 40.65,
 40.52,
 40.52,
 40.43,
 40.43,
 40.23,
 40.11,
 39.99,
 39.83,
 39.76,
 39.69,
 39.63,
 39.57,
 39.52,
 39.32,
 39.3,
 39.25,
 39.19,
 39.18,
 39.07,
 39.0,
 38.66,
 38.58,
 38.39,
 38.21,
 38.17,
 38.12,
 37.88,
 37.88,
 37.86,
 37.86,
 37.86,
 37.86,
 37.84,
 37.83,
 37.83,
 37.82,
 37.82,
 37.82,
 37.74,
 37.74,
 37.74,
 37.62,
 37.58,
 37.58,
 37.58,
 37.58,
 37.33,
 37.26,
 37.26,
 37.26,
 37.23,
 3