# Applicant Ranking

In [37]:
import pandas as pd
import numpy as np

In [38]:
# Load the job postings and the applicant resumes from CSV files
# (both paths are relative to the current working directory).
df_jobs = pd.read_csv('fake_jobs.csv')
df_resume = pd.read_csv('fake_resume.csv')

In [39]:
df_jobs

Unnamed: 0,title,description,skills,education,experience
0,Senior Backend Developer,We are seeking a highly skilled Senior Backend...,"Python, Django, RESTful APIs, SQL",Bachelor in Computer Engineering,3 years of software development experience in ...
1,Full Stack Developer,We are hiring a versatile Full Stack Developer...,"HTML, CSS, JavaScript, Node.js, Express, MongoDB",Bachelor Degree in Computer Science,2 years of data analysis experience using Exce...
2,Python Backend Engineer,We have an exciting opportunity for a Python B...,"Python, Django, Flask, SQL, AWS",Masters in IT,4 years of marketing experience with a focus o...
3,Backend Software Engineer,We are looking for a talented Backend Software...,"Java, Spring, Hibernate, RESTful APIs, MySQL",PhD in Computer Science,2 years of web development experience using Dj...
4,Junior Backend Developer,We are seeking a motivated Junior Backend Deve...,"Python, Django, Git, Linux, SQL",Bachelor in Computer Engineering,1 year of data analytics experience with Table...


In [40]:
df_resume

Unnamed: 0,Category,Resume,education,skills,projects,experience
0,Java Developer,"Operating Systems Windows XP, 7, 10. Tools/Pac...",Bachelor in Computer Engineering,"C, Python, JavaScript","[{'title': 'Project X', 'description': 'Create...",5 years of experience in software development ...
1,Java Developer,"Operating Systems Windows XP, 7, 10. Tools/Pac...",Bachelor Degree in Computer Science,"Python, Java, HTML, CSS","[{'title': 'Project A', 'description': 'Develo...",3 years of experience in data science and mach...
2,Python Developer,â¢ Operating Systems: Windows â¢ Others: MS ...,Masters in IT,"C, Python, Java, SQL, Go","[{'title': 'Project Z', 'description': 'Design...",1 years of experience in project management an...
3,Python Developer,â¢ Operating Systems: Windows â¢ Others: MS ...,PhD in Computer Science,"C, Python, Django, Pandas","[{'title': 'Project K', 'description': 'Create...",2 years of experience in mobile app developmen...
4,Python Developer,â¢ Operating Systems: Windows â¢ Others: MS ...,Bachelor in Computer Engineering,"Python, JavaScript, ReactJS, Express, Node","[{'title': 'Project K', 'description': 'Create...",4 years of experience in financial analysis an...


## Preprocessing

In [41]:
import re

def clean_description(text):
    """Normalise free text before it is vectorised.

    Every character that is not an ASCII letter or a ``+`` sign (digits,
    punctuation, whitespace, non-ASCII symbols, ...) is replaced by one space,
    then the result is lower-cased. Consecutive spaces are not collapsed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-case string.
    """
    letters_and_plus_only = re.sub(r'[^a-zA-Z\+]', ' ', text)
    return letters_and_plus_only.lower()

clean_description("I have skills in developing and maintaining software applications using Python, Java, and JavaScript.")

'i have skills in developing and maintaining software applications using python  java  and javascript '

In [42]:
# Store a cleaned version of every raw resume text in a new column and
# preview the first two rows to check the result.
df_resume['clean_resume'] = df_resume['Resume'].apply(clean_description)
df_resume.head(2)

Unnamed: 0,Category,Resume,education,skills,projects,experience,clean_resume
0,Java Developer,"Operating Systems Windows XP, 7, 10. Tools/Pac...",Bachelor in Computer Engineering,"C, Python, JavaScript","[{'title': 'Project X', 'description': 'Create...",5 years of experience in software development ...,operating systems windows xp tools pac...
1,Java Developer,"Operating Systems Windows XP, 7, 10. Tools/Pac...",Bachelor Degree in Computer Science,"Python, Java, HTML, CSS","[{'title': 'Project A', 'description': 'Develo...",3 years of experience in data science and mach...,operating systems windows xp tools pac...


In [43]:
# Work on a copy of the resumes; the score columns computed below are added to this frame.
df_resume_rankings = df_resume.copy()

## Tfidf
For comparing job-description and resume-description

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Initialize the TfidfVectorizer; English stop words are excluded from the vocabulary
description_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the cleaned resumes and build their TF-IDF matrix.
# The same fitted vectorizer is reused later to transform the job description.
description_matrix = description_vectorizer.fit_transform(df_resume['clean_resume'])

Preprocess job description  
This prepends the title and the skills to the description.  
Then cleans it.

In [46]:
def preprocess_job(target_job):
    """Build the cleaned text that represents a job posting.

    The skills (with commas removed) and then the title are prepended to the
    description whenever they are truthy. The combined text is run through
    ``clean_description``, printed, and returned.

    Args:
        target_job: Object exposing ``title``, ``description`` and ``skills``
            attributes (here: one row of ``df_jobs``).

    Returns:
        The cleaned job text as a single string.
    """
    combined = target_job.description
    if target_job.skills:
        skills_without_commas = target_job.skills.replace(',', '')
        combined = skills_without_commas + ' ' + combined
    if target_job.title:
        combined = f'{target_job.title} {combined}'

    cleaned = clean_description(combined)
    print('Cleaned job description:', cleaned)
    return cleaned

In [47]:

def get_description_score(target_job):
    """Score every resume against the target job using TF-IDF cosine similarity.

    The job text produced by ``preprocess_job`` is transformed with the
    already-fitted ``description_vectorizer`` and compared with every row of
    ``description_matrix``. The resulting similarities are written to the
    ``description_score`` column of ``df_resume_rankings``.

    Args:
        target_job: The job posting to rank the resumes against.

    Returns:
        None. The scores are stored on ``df_resume_rankings``.
    """
    # Imported locally under an alias: a later cell defines its own
    # `cosine_similarity` for plain Python vectors, which shadows the
    # scikit-learn function at notebook level. Without the alias, re-running
    # this function after that cell would hand sparse matrices to the
    # hand-written version.
    from sklearn.metrics.pairwise import cosine_similarity as tfidf_cosine_similarity

    job_description = preprocess_job(target_job)

    # Project the job text into the vocabulary learned from the resumes
    query_description_vec = description_vectorizer.transform([job_description])

    # One similarity value per resume
    cosine_sim_description = tfidf_cosine_similarity(query_description_vec, description_matrix).flatten()

    df_resume_rankings['description_score'] = cosine_sim_description

# Choose target job

In [48]:
# Take an explicit copy of the chosen row: helper entries such as
# 'skills_vector' and 'education_vector' are added to target_job later, and
# those writes should stay independent of df_jobs.
target_job = df_jobs.loc[2].copy()

In [49]:
get_description_score(target_job)

Cleaned job description: python backend engineer python django flask sql aws we have an exciting opportunity for a python backend engineer to join our growing team  in this role  you will be responsible for building robust and scalable web applications using python and frameworks like django and flask  your expertise in server side logic  api integrations  and database management will be crucial to the success of our projects  proficiency in sql databases and experience with cloud platforms such as aws are highly desirable  you will collaborate closely with frontend developers to ensure seamless integration of user interfaces and backend functionality  if you are a detail oriented and passionate backend engineer who enjoys solving complex problems  we look forward to reviewing your application 


## 1. Skills

#### How does it work?
For each applicant, a binary vector is created where each element represents the presence (1) or absence (0) of a specific skill from the target job in the applicant's skills.

In [50]:
# Calculate cosine similarity between job description vector and each applicant vector
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vector1: First vector (any sequence accepted by NumPy).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or 0 when either vector has norm 0 (avoids a division by zero).
    """
    dot = np.dot(vector1, vector2)
    norm_a = np.linalg.norm(vector1)
    norm_b = np.linalg.norm(vector2)
    if norm_a != 0 and norm_b != 0:
        return dot / (norm_a * norm_b)
    return 0

**Convert job description skills and applicant skills into binary feature vectors**

In [51]:
target_job['skills'].split(', ')

['Python', 'Django', 'Flask', 'SQL', 'AWS']

In [52]:
def vectorize_skills(applicant_skills, job_skills=None):
    """Encode an applicant's skills as a binary vector over the job's skills.

    Skill names are compared case-insensitively and with surrounding
    whitespace ignored, so ``"python"``, ``"Python"`` and ``" Python "`` all
    count as the same skill regardless of how the comma-separated list was
    typed.

    Args:
        applicant_skills: Comma-separated skills of the applicant. Anything
            that is not a string (e.g. a missing value) is treated as "no
            skills".
        job_skills: Comma-separated skills required by the job. Defaults to
            the ``skills`` entry of the notebook-level ``target_job``.

    Returns:
        A list with one element per job skill, in the job's order: 1 if the
        applicant lists that skill, otherwise 0.
    """
    if job_skills is None:
        job_skills = target_job['skills']

    def _split_skills(skills):
        # Missing values (None / NaN) have no skills to split
        if not isinstance(skills, str):
            return []
        return [part.strip().lower() for part in skills.split(',') if part.strip()]

    applicant_set = set(_split_skills(applicant_skills))
    return [1 if skill in applicant_set else 0 for skill in _split_skills(job_skills)]

In [53]:
# Vectorize the job's skills against themselves; this is the reference
# vector every applicant vector is compared with.
target_job['skills_vector'] = vectorize_skills(target_job.skills)
target_job.skills, target_job.skills_vector

('Python, Django, Flask, SQL, AWS', [1, 1, 1, 1, 1])

In [54]:
# One binary vector per applicant, with one position per skill of the target job.
df_resume_rankings['skills_vector'] = df_resume['skills'].apply(vectorize_skills)
df_resume_rankings[['skills', 'skills_vector']]

Unnamed: 0,skills,skills_vector
0,"C, Python, JavaScript","[1, 0, 0, 0, 0]"
1,"Python, Java, HTML, CSS","[1, 0, 0, 0, 0]"
2,"C, Python, Java, SQL, Go","[1, 0, 0, 1, 0]"
3,"C, Python, Django, Pandas","[1, 1, 0, 0, 0]"
4,"Python, JavaScript, ReactJS, Express, Node","[1, 0, 0, 0, 0]"


In [55]:
def get_skills_score():
    """Compute the skills score of every applicant.

    Each applicant's ``skills_vector`` is compared with the target job's
    ``skills_vector`` using ``cosine_similarity``; the results are stored in
    the ``skills_score`` column of ``df_resume_rankings``.
    """
    df_resume_rankings['skills_score'] = [
        cosine_similarity(target_job['skills_vector'], candidate_vector)
        for candidate_vector in df_resume_rankings['skills_vector']
    ]

In [56]:
get_skills_score()

In [57]:
df_resume_rankings[['skills', 'skills_vector', 'skills_score']]

Unnamed: 0,skills,skills_vector,skills_score
0,"C, Python, JavaScript","[1, 0, 0, 0, 0]",0.447214
1,"Python, Java, HTML, CSS","[1, 0, 0, 0, 0]",0.447214
2,"C, Python, Java, SQL, Go","[1, 0, 0, 1, 0]",0.632456
3,"C, Python, Django, Pandas","[1, 1, 0, 0, 0]",0.632456
4,"Python, JavaScript, ReactJS, Express, Node","[1, 0, 0, 0, 0]",0.447214


## 2. Projects

#### How does it work?
***It checks if the applicant has made at least one project using the required skills or not.***

1. The job description skills are converted into a binary feature vector, where each element represents the presence or absence of a skill in the job description.

2. A helper function skill_present is defined to check if a skill is present in the project description using regular expression pattern matching to match the whole word.

3. The vectorize_projects function takes the applicants' projects as input and converts them into binary feature vectors, one per applicant. For each skill of the target job, it checks whether that skill is present in any of the applicant's project descriptions using the skill_present function.

4. The cosine similarity is calculated between the target job's skills_vector and each applicant's project vector using the cosine_similarity function. The similarity score represents how closely the applicant's projects match the skills required by the job.

In [58]:
df_resume_rankings['projects'][0]

"[{'title': 'Project X', 'description': 'Created a mobile app for social networking using Python'}, {'title': 'Project Y', 'description': 'Built a recommendation system for movie ratings using Java'}]"

This is supposed to be a list of dictionaries, but it is stored as a string.

In [59]:
import ast

In [60]:
# Convert the string representation to a list of dictionaries.
# Values that are already parsed are left untouched so this cell can be
# re-run safely (ast.literal_eval raises ValueError when given a list).
df_resume_rankings['projects'] = df_resume_rankings['projects'].apply(
    lambda projects: ast.literal_eval(projects) if isinstance(projects, str) else projects
)

In [61]:
df_resume_rankings['projects'][0][0]['title']

'Project X'

Now it is converted to a list of dictionaries.

In [62]:
# Helper function to check if a skill is present in the project description
def skill_present(skill, description):
    """Tell whether ``skill`` occurs as a whole token in ``description``.

    The match is case-insensitive. The skill must not be directly preceded or
    followed by a word character, so ``"Java"`` does not match inside
    ``"JavaScript"``. Look-arounds are used instead of ``\\b`` because ``\\b``
    can never match between two non-word characters, which made skills that
    end in punctuation (``"C++"``, ``"C#"``) impossible to find.

    Args:
        skill: The skill name to look for; it is matched literally.
        description: The text to search in.

    Returns:
        True if the skill is found, otherwise False. An empty skill is never
        considered present.
    """
    if not skill:
        return False
    pattern = r"(?<!\w)" + re.escape(skill) + r"(?!\w)"
    return bool(re.search(pattern, description, re.IGNORECASE))


def vectorize_projects(df_resume_projects):
    """Turn every applicant's projects into a binary vector over the job's skills.

    Args:
        df_resume_projects: Iterable with one entry per applicant; each entry
            is a list of project dictionaries that have a ``'description'``
            key.

    Returns:
        A list with one vector per applicant. Position *i* of a vector is 1
        when the *i*-th skill of ``target_job`` is found (via
        ``skill_present``) in at least one of that applicant's project
        descriptions, otherwise 0.
    """
    required_skills = [skill.lower() for skill in target_job['skills'].split(', ')]
    return [
        [
            int(any(
                skill_present(skill, project['description'].lower())
                for project in projects
            ))
            for skill in required_skills
        ]
        for projects in df_resume_projects
    ]


In [63]:
# Convert applicant projects into binary feature vectors
# (one vector per applicant, one position per skill of the target job)
df_resume_rankings['projects_vector'] = vectorize_projects(df_resume_rankings['projects'])

In [64]:
def get_projects_score():
    """Compute the projects score of every applicant.

    The cosine similarity between the target job's ``skills_vector`` and each
    applicant's ``projects_vector`` is stored in the ``projects_score`` column
    of ``df_resume_rankings``.
    """
    def score_against_job(projects_vector):
        return cosine_similarity(target_job['skills_vector'], projects_vector)

    df_resume_rankings['projects_score'] = list(
        map(score_against_job, df_resume_rankings['projects_vector'])
    )

In [65]:
get_projects_score()

In [66]:
df_resume_rankings[['projects', 'projects_vector', 'projects_score']]

Unnamed: 0,projects,projects_vector,projects_score
0,"[{'title': 'Project X', 'description': 'Create...","[1, 0, 0, 0, 0]",0.447214
1,"[{'title': 'Project A', 'description': 'Develo...","[1, 0, 0, 0, 0]",0.447214
2,"[{'title': 'Project Z', 'description': 'Design...","[1, 0, 0, 1, 0]",0.632456
3,"[{'title': 'Project K', 'description': 'Create...","[0, 1, 0, 0, 0]",0.447214
4,"[{'title': 'Project K', 'description': 'Create...","[0, 0, 0, 0, 0]",0.0


## 3. Education

#### How does it work?
It considers two factors separately: degree and field of study.

1. Degree
- Values are assigned to different degree levels. Eg: Bachelor=1, Master=2, PhD=3
- A lower degree score indicates better similarity, with 0 being the best possible score.
- Normalize the score to be between 0 and 1 and adjust such that a higher score indicates better similarity.

2. Field of Study
- measures the text similarity between the applicant's field of study and the job's required field of study.
3. Combining Degree and Field Scores
- The degree score is weighted by 0.7, and the field score is weighted by 0.3.


In [67]:
from difflib import SequenceMatcher

# Ordinal level of each recognised degree; a larger number means a higher degree
degree_mapping = dict(Bachelor=1, Master=2, PhD=3)

# Function to calculate text similarity using SequenceMatcher
def calculate_education_similarity(str1, str2):
    """Return the case-insensitive similarity ratio of two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``difflib.SequenceMatcher(...).ratio()`` of the lower-cased inputs,
        a float between 0 and 1.
    """
    matcher = SequenceMatcher(a=str1.lower(), b=str2.lower())
    return matcher.ratio()


# Function to extract the field of study from education
def extract_field_of_study(education):
    """Extract the field of study from an education string.

    The field is whatever follows the first stand-alone word "in"
    (case-insensitive), e.g. ``"Masters in IT"`` -> ``"IT"``. The word is
    matched with word boundaries so that the letters "in" inside another word
    (``"Engineering"``, ``"Business"``) are not mistaken for the separator.

    Args:
        education: Free-text education description.

    Returns:
        The stripped field of study, or None when the string has no
        stand-alone "in" or nothing follows it.
    """
    parts = re.split(r'\bin\b', education, maxsplit=1, flags=re.IGNORECASE)
    if len(parts) > 1:
        return parts[1].strip() or None
    return None


# Function to extract degree level and field of study from education
def extract_education_info(education):
    """Split an education string into a degree level and a field of study.

    Args:
        education: Free-text education description.

    Returns:
        A ``(degree, field_of_study)`` tuple. ``degree`` is the value of the
        first ``degree_mapping`` key found (case-insensitively) inside the
        string, or None if no key is found. ``field_of_study`` is the result
        of ``extract_field_of_study``.
    """
    lowered = education.lower()
    degree = next(
        (level for name, level in degree_mapping.items() if name.lower() in lowered),
        None,
    )
    return (degree, extract_field_of_study(education))

In [68]:
# Convert job description education into a (degree level, field of study) tuple
target_job['education_vector'] = extract_education_info(target_job['education'])

# Convert each applicant's education into the same (degree level, field of study) tuple
df_resume_rankings['education_vector'] = df_resume_rankings['education'].apply(extract_education_info)

def get_education_score():
    """Compute the education score of every applicant.

    The score combines two parts and is written to the ``education_score``
    column of ``df_resume_rankings``:

    * Degree (weight 0.7): the number of levels the applicant falls short of
      the job's degree, normalised so that 1 means "meets or exceeds the
      requirement" and 0 means "two or more levels short". If the job's
      degree is unknown there is nothing to fall short of, so the applicant
      gets full credit. If the job names a degree but the applicant's degree
      is unknown, the applicant gets no degree credit (previously an
      unparsable degree was silently treated as a perfect match).
    * Field of study (weight 0.3): text similarity between the two fields,
      or 0 when either field is missing.
    """
    # Weights for degree and field_of_study
    degree_weight = 0.7
    field_weight = 0.3
    # Largest possible shortfall with the current levels (e.g. PhD required, Bachelor held)
    max_degree_score = 2

    job_degree, job_field_of_study = target_job['education_vector']

    applicant_scores = []
    for applicant_degree, applicant_field_of_study in df_resume_rankings['education_vector']:
        if not job_degree:
            # No recognisable degree requirement
            degree_gap = 0
        elif not applicant_degree:
            # Requirement exists but the applicant's degree is unknown
            degree_gap = max_degree_score
        else:
            # Exceeding the requirement is not penalised; the gap is capped so
            # the normalised score stays within [0, 1]
            degree_gap = min(max(job_degree - applicant_degree, 0), max_degree_score)

        # Smaller gap means better match; flip and normalise to [0, 1]
        degree_score = (max_degree_score - degree_gap) / max_degree_score

        field_score = 0
        if applicant_field_of_study and job_field_of_study:
            field_score = calculate_education_similarity(applicant_field_of_study, job_field_of_study)

        applicant_scores.append(degree_weight * degree_score + field_weight * field_score)

    df_resume_rankings['education_score'] = applicant_scores


In [69]:
get_education_score()

In [70]:
target_job[['education', 'education_vector']]

education           Masters in IT
education_vector          (2, IT)
Name: 2, dtype: object

In [71]:
df_resume_rankings[['education', 'education_vector', 'education_score']]

Unnamed: 0,education,education_vector,education_score
0,Bachelor in Computer Engineering,"(1, Computer Engineering)",0.377273
1,Bachelor Degree in Computer Science,"(1, Computer Science)",0.383333
2,Masters in IT,"(2, IT)",1.0
3,PhD in Computer Science,"(3, Computer Science)",0.733333
4,Bachelor in Computer Engineering,"(1, Computer Engineering)",0.377273


## 4. Experience

- Extracts the 'X years' part from the target job experience.
- Calculates similarity scores for each applicant's experience, taking the ratio of their 'X years' to the target job's 'X years'. 
- The score is capped at a maximum of 1.0 to avoid giving excessive credit to applicants with more experience than needed.

In [72]:
# Function to extract 'X years' part from experience
def extract_years(experience):
    """Extract the number of years from an experience description.

    Recognises the first occurrence of a number followed by "year", "years",
    "yr" or "yrs" (case-insensitive), optionally with a "+" and/or spaces in
    between, e.g. ``"4 years"``, ``"10+ years"``, ``"3yrs"``.

    Args:
        experience: Free-text experience description. Non-string values
            (e.g. missing data) are treated as "no experience stated".

    Returns:
        The number of years as an int, or 0 when none is found.
    """
    if not isinstance(experience, str):
        return 0
    match = re.search(r'(\d+)\s*\+?\s*(?:years?|yrs?)', experience, re.IGNORECASE)
    return int(match.group(1)) if match else 0

# Number of years required by the target job (0 if its experience text states none).
# Computed once here; re-run this line after choosing a different target_job.
target_job_experience = extract_years(target_job['experience'])

def get_experience_score():
    """Compute the experience score of every applicant.

    The score is the ratio of the applicant's years of experience to the
    years required by the target job (``target_job_experience``), capped at
    1.0 so that extra experience earns no extra credit. When the job requires
    no years (``target_job_experience`` is 0) every applicant meets the
    requirement and scores 1.0; this also avoids a division by zero.

    The scores are written to the ``experience_score`` column of
    ``df_resume_rankings``.
    """
    if target_job_experience > 0:
        applicant_scores = [
            min(extract_years(applicant_experience) / target_job_experience, 1.0)
            for applicant_experience in df_resume_rankings['experience']
        ]
    else:
        applicant_scores = [1.0] * len(df_resume_rankings)

    df_resume_rankings['experience_score'] = applicant_scores

In [73]:
get_experience_score()

In [74]:
target_job['experience']

'4 years of marketing experience with a focus on social media marketing and SEO.'

In [75]:
df_resume_rankings[['experience', 'experience_score']]

Unnamed: 0,experience,experience_score
0,5 years of experience in software development ...,1.0
1,3 years of experience in data science and mach...,0.75
2,1 years of experience in project management an...,0.25
3,2 years of experience in mobile app developmen...,0.5
4,4 years of experience in financial analysis an...,1.0


#### Finally, so far

In [76]:
df_resume_rankings

Unnamed: 0,Category,Resume,education,skills,projects,experience,clean_resume,description_score,skills_vector,skills_score,projects_vector,projects_score,education_vector,education_score,experience_score
0,Java Developer,"Operating Systems Windows XP, 7, 10. Tools/Pac...",Bachelor in Computer Engineering,"C, Python, JavaScript","[{'title': 'Project X', 'description': 'Create...",5 years of experience in software development ...,operating systems windows xp tools pac...,0.215306,"[1, 0, 0, 0, 0]",0.447214,"[1, 0, 0, 0, 0]",0.447214,"(1, Computer Engineering)",0.377273,1.0
1,Java Developer,"Operating Systems Windows XP, 7, 10. Tools/Pac...",Bachelor Degree in Computer Science,"Python, Java, HTML, CSS","[{'title': 'Project A', 'description': 'Develo...",3 years of experience in data science and mach...,operating systems windows xp tools pac...,0.215306,"[1, 0, 0, 0, 0]",0.447214,"[1, 0, 0, 0, 0]",0.447214,"(1, Computer Science)",0.383333,0.75
2,Python Developer,â¢ Operating Systems: Windows â¢ Others: MS ...,Masters in IT,"C, Python, Java, SQL, Go","[{'title': 'Project Z', 'description': 'Design...",1 years of experience in project management an...,operating systems windows others ms ...,0.336201,"[1, 0, 0, 1, 0]",0.632456,"[1, 0, 0, 1, 0]",0.632456,"(2, IT)",1.0,0.25
3,Python Developer,â¢ Operating Systems: Windows â¢ Others: MS ...,PhD in Computer Science,"C, Python, Django, Pandas","[{'title': 'Project K', 'description': 'Create...",2 years of experience in mobile app developmen...,operating systems windows others ms ...,0.336201,"[1, 1, 0, 0, 0]",0.632456,"[0, 1, 0, 0, 0]",0.447214,"(3, Computer Science)",0.733333,0.5
4,Python Developer,â¢ Operating Systems: Windows â¢ Others: MS ...,Bachelor in Computer Engineering,"Python, JavaScript, ReactJS, Express, Node","[{'title': 'Project K', 'description': 'Create...",4 years of experience in financial analysis an...,operating systems windows others ms ...,0.336201,"[1, 0, 0, 0, 0]",0.447214,"[0, 0, 0, 0, 0]",0.0,"(1, Computer Engineering)",0.377273,1.0
