# Similarity analysis

## Main.py and processing

In [1]:
import os
from fastapi import FastAPI
from fastapi.encoders import jsonable_encoder
import pandas as pd
import json
from services.ResumeInfoExtraction import ResumeInfoExtraction
from services.JobInfoExtraction import JobInfoExtraction
from source.schemas.resumeextracted import ResumeExtractedModel
from source.schemas.jobextracted import JobExtractedModel
import ast
from pypdf import PdfReader
import warnings
warnings.filterwarnings("ignore")

def get_resumes(directory):
    """Load every ``.pdf`` file in *directory* into a DataFrame.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A DataFrame with one row per PDF and two columns: ``name`` (the
        file name without its ``.pdf`` extension) and ``raw`` (the text
        extracted from all pages). The frame is empty, but still has both
        columns, when the folder contains no PDF files.
    """

    def extract_pdf(path):
        """Return the concatenated text of all pages of the PDF at *path*."""
        reader = PdfReader(path)
        # Defensive: fall back to "" if a page yields no text object.
        return "".join(page.extract_text() or "" for page in reader.pages)

    suffix = ".pdf"
    rows = []

    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        if os.path.isfile(file_path) and filename.endswith(suffix):
            # Slice the extension off. str.strip(".pdf") would instead remove
            # any of the characters ".", "p", "d", "f" from BOTH ends and
            # mangle names such as "david.pdf" -> "avi".
            name = filename[:-len(suffix)]
            rows.append((name, extract_pdf(file_path)))

    # Explicit columns keep the schema stable even when no PDF was found.
    return pd.DataFrame(rows, columns=["name", "raw"])

def transform_dataframe_to_json(dataframe):
    """Serialize *dataframe* as a JSON array of row records, indented by 4.

    Args:
        dataframe: The pandas DataFrame to serialize.

    Returns:
        A JSON string holding one object per row (``orient="records"``).
    """
    # Round-trip through json so the final string is re-emitted with indent=4.
    records = json.loads(dataframe.to_json(orient="records"))
    return json.dumps(records, indent=4)


def extraction(resume):
    """Run resume entity extraction and return the resulting rows as JSON.

    Args:
        resume: DataFrame of resumes; it must contain a ``name`` column and
            is handed to ``ResumeInfoExtraction`` unchanged.

    Returns:
        The frame returned by ``extract_entities``, serialized with
        ``transform_dataframe_to_json``.
    """
    skills_file = 'Resources/data/skills.jsonl'
    majors_file = 'Resources/data/majors.jsonl'
    degrees_file = 'Resources/data/degrees.jsonl'

    names_json = transform_dataframe_to_json(resume[["name"]])
    extractor = ResumeInfoExtraction(skills_file, majors_file, degrees_file, resume, names_json)
    enriched = extractor.extract_entities(resume)

    for idx, record in enriched.iterrows():
        candidate = record["name"]
        degrees = enriched.loc[idx, 'Degrees']
        top_degree = enriched.loc[idx, 'Maximum degree level']
        majors = enriched.loc[idx, 'Acceptable majors']
        skills = enriched.loc[idx, 'Skills']

        # The schema object is built and encoded but the result is discarded,
        # so this loop only matters if construction or encoding raises.
        extracted = ResumeExtractedModel(
            maximum_degree_level=top_degree or '',
            acceptable_majors=majors or [],
            skills=skills or [],
            name=candidate or '',
            degrees=degrees or [],
        )
        jsonable_encoder(extracted)

    return transform_dataframe_to_json(enriched)

if __name__ == "__main__":
    from io import StringIO

    # Read every PDF under ./resumes, then run entity extraction on them.
    df = get_resumes("resumes")
    res = extraction(df)
    # `res` is a JSON string. Passing a literal JSON string to read_json is
    # deprecated since pandas 2.1, so hand it over as a file-like object,
    # which works on old and new pandas versions alike.
    df = pd.read_json(StringIO(res))



  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)


In [2]:
df

Unnamed: 0,name,raw,Degrees,Maximum degree level,Acceptable majors,Skills
0,hanna_pedersen,EDUCATION Nova School of Business an...,"[MS-LEVEL, BS-LEVEL]",MS-LEVEL,"[data analysis, artificial intelligence, progr...","[business, analytics, business administration,..."
1,irene_abbateli,"\n \n \nIRENE ABBATELLI \nRome, Italy \n(+39)...","[MS-LEVEL, BS-LEVEL]",MS-LEVEL,[data analysis],"[business, analytics, data analysis, machine l..."
2,victor_bjorsvik,"VICTOR BJORSVIK \n \nLisbon, Portugal | +47 ...","[BS-LEVEL, MS-LEVEL]",MS-LEVEL,"[programming, data analysis, data science, web...","[accounting, big data, communications, busines..."


In [3]:
%%bash
ls job_descriptions/

description.txt


In [4]:
# Read the job posting and wrap it in a one-row frame with a single "raw" column.
with open('job_descriptions/description.txt', 'r') as file:
    job_description = [file.read()]

df2 = pd.DataFrame({"raw": job_description})
df2

Unnamed: 0,raw
0,Data Scientist\nLisbon\n\n\n\nApply on employe...


In [5]:
def extraction(resume):
    """Run job-posting entity extraction and return the resulting rows as JSON.

    Note: this definition replaces any earlier ``extraction`` in the session.

    Args:
        resume: DataFrame of job postings (despite the parameter name); it is
            handed to ``JobInfoExtraction`` unchanged.

    Returns:
        The frame returned by ``extract_entities``, serialized with
        ``transform_dataframe_to_json``.
    """
    skills_file = 'Resources/data/skills.jsonl'
    majors_file = 'Resources/data/majors.jsonl'
    degrees_file = 'Resources/data/degrees.jsonl'

    extractor = JobInfoExtraction(skills_file, majors_file, degrees_file, resume)
    enriched = extractor.extract_entities(resume)

    for idx in enriched.index:
        min_degree = enriched['Minimum degree level'][idx]
        majors = enriched['Acceptable majors'][idx]
        skills = enriched['Skills'][idx]

        # The schema object is built and encoded but the result is discarded,
        # so this loop only matters if construction or encoding raises.
        extracted = JobExtractedModel(
            minimum_degree_level=min_degree or '',
            acceptable_majors=majors or [],
            skills=skills or [],
        )
        jsonable_encoder(extracted)
        # Persisting the encoded record is currently disabled:
        # new_job_extracted = database.get_collection("jobsextracted").insert_one(job_extracted)

    return transform_dataframe_to_json(enriched)

In [6]:
from io import StringIO

# Extract entities from the job posting(s) and reload the JSON result as a frame.
res = extraction(df2)
# `res` is a JSON string. Passing a literal JSON string to read_json is
# deprecated since pandas 2.1, so hand it over as a file-like object instead.
df2 = pd.read_json(StringIO(res))
df2

Unnamed: 0,raw,Minimum degree level,Acceptable majors,Skills
0,Data Scientist\nLisbon\n\n\n\nApply on employe...,BS-LEVEL,"[data analysis, information technology, comput...","[finance, business, visualization, design, mac..."


In [7]:
df2.Skills[0]

['finance',
 'business',
 'visualization',
 'design',
 'machine learning',
 'modelling',
 'data analysis',
 'python',
 'marketing',
 'operations research',
 'database',
 'data quality',
 'analytics',
 'agile project management',
 'training model',
 'computer engineering',
 'relational database',
 'oracle',
 'collaboration',
 'communications',
 'data science']

In [8]:
df.Skills[2]

['accounting',
 'big data',
 'communications',
 'business',
 'analytics',
 'finance',
 'data science',
 'modelling',
 'business administration',
 'python',
 'logistic regression',
 'web development',
 'flask',
 'html',
 'css',
 'javascript',
 'testing',
 'training model',
 'pandas',
 'sql',
 'apache spark',
 'machine learning',
 'tensorflow',
 'keras',
 'c',
 'r',
 'tableau']

## Models and scoring

In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast
#import openai
import time
import seaborn as sns
import matplotlib.pyplot as plt
import json

In [12]:
def semantic_similarity_sbert_base_v2(job, resume):
    """Calculate skill similarity with SBERT all-mpnet-base-v2.

    Each job skill contributes 1 when it appears verbatim in *resume*.
    Otherwise it contributes its best cosine similarity against the resume
    skills, but only when that similarity is at least 0.4. The total is
    averaged over the number of job skills.

    Args:
        job: List of skill strings required by the job.
        resume: List of skill strings found in the resume.

    Returns:
        The average score rounded to 3 decimals; 0.0 when either list is
        empty (previously a ZeroDivisionError / sklearn error).
    """
    # Nothing to match against: avoid dividing by zero or taking the max of
    # an empty similarity row.
    if not job or not resume:
        return 0.0

    resume_skills = set(resume)
    unmatched = [idx for idx, skill in enumerate(job) if skill not in resume_skills]
    # Every exact match is worth a full point.
    score = len(job) - len(unmatched)

    # Only load and run the model when some skill needs a semantic comparison.
    if unmatched:
        model = SentenceTransformer('all-mpnet-base-v2')
        # Encode job and resume skills in one call; job rows come first.
        embeddings = model.encode(job + resume)
        similarities = cosine_similarity(
            [embeddings[idx] for idx in unmatched],
            embeddings[len(job):],
        )
        # One similarity matrix instead of two cosine calls per skill.
        for best in similarities.max(axis=1):
            if best >= 0.4:
                score += best

    return round(float(score) / len(job), 3)

In [13]:
def semantic_similarity_sbert_paraphrase_minilm_l6_v2(job, resume):
    """Calculate skill similarity with SBERT paraphrase-MiniLM-L6-v2.

    Each job skill contributes 1 when it appears verbatim in *resume*.
    Otherwise it contributes its best cosine similarity against the resume
    skills, but only when that similarity is at least 0.4. The total is
    averaged over the number of job skills.

    Args:
        job: List of skill strings required by the job.
        resume: List of skill strings found in the resume.

    Returns:
        The average score rounded to 3 decimals; 0.0 when either list is
        empty (previously a ZeroDivisionError / sklearn error).
    """
    # Nothing to match against: avoid dividing by zero or taking the max of
    # an empty similarity row.
    if not job or not resume:
        return 0.0

    resume_skills = set(resume)
    unmatched = [idx for idx, skill in enumerate(job) if skill not in resume_skills]
    # Every exact match is worth a full point.
    score = len(job) - len(unmatched)

    # Only load and run the model when some skill needs a semantic comparison.
    if unmatched:
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        # Encode job and resume skills in one call; job rows come first.
        embeddings = model.encode(job + resume)
        similarities = cosine_similarity(
            [embeddings[idx] for idx in unmatched],
            embeddings[len(job):],
        )
        # One similarity matrix instead of two cosine calls per skill.
        for best in similarities.max(axis=1):
            if best >= 0.4:
                score += best

    return round(float(score) / len(job), 3)

In [55]:
# Load the Skill2Vec 50K data set as a plain DataFrame.
# NOTE(review): Word2Vec is imported but only referenced by the commented-out
# line below; the file is a CSV, so it is read with pandas instead.
from gensim.models import Word2Vec
# Load your pre-trained Skill2Vec model
# skill2vec_model = Word2Vec.load('Resources/data/skill2vec_50K/skill2vec_50K.csv')


# NOTE(review): the displayed frame has column labels such as "125720" and
# "HR Executive", so the first data row appears to have been consumed as the
# header. Presumably header=None is wanted here; confirm against the code
# that consumes `skill2vec` before changing it.
skill2vec = pd.read_csv("Resources/data/skill2vec_50K/skill2vec_50K.csv")

In [56]:
skill2vec

Unnamed: 0,125720,HR Executive,screening,selection,Interview,HR,Recruiter,IT Recruiter,Sourcing,recruitment executive,...,Unnamed: 951,Unnamed: 952,Unnamed: 953,Unnamed: 954,Unnamed: 955,Unnamed: 956,Unnamed: 957,Unnamed: 958,Unnamed: 959,Unnamed: 960
0,112708,Special Teacher,Teaching,Education,,,,,,,...,,,,,,,,,,
1,115226,consulting,fresher,IT helpdesk,Techincal Troubleshooting,international voice,international BPO,technical support,outsourcing,call center,...,,,,,,,,,,
2,19805,diploma,machining,cnc m,mould,conventional machines,die making,knowledge,tool,cipet,...,,,,,,,,,,
3,80208,Compensation,Benefits,HR Functions,Alm,Payroll,ESS,Core HR,QC,QA,...,,,,,,,,,,
4,64086,Storage Administrator,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49994,27374,Chief Engineer,,,,,,,,,...,,,,,,,,,,
49995,88457,Receptionist Activities,Front Desk,front office,front desk executive,front office executive,receptionist,reception,,,...,,,,,,,,,,
49996,34590,SQL Queries,Log Analysis,Hardware Networking,People Leadership,Technical Skills,Unix,Oracle,Service Delivery Management,Database Administration,...,,,,,,,,,,
49997,86171,Quality Analyst,,,,,,,,,...,,,,,,,,,,


## Analysis

In [15]:
df

Unnamed: 0,name,raw,Degrees,Maximum degree level,Acceptable majors,Skills
0,hanna_pedersen,EDUCATION Nova School of Business an...,"[MS-LEVEL, BS-LEVEL]",MS-LEVEL,"[data analysis, artificial intelligence, progr...","[business, analytics, business administration,..."
1,irene_abbateli,"\n \n \nIRENE ABBATELLI \nRome, Italy \n(+39)...","[MS-LEVEL, BS-LEVEL]",MS-LEVEL,[data analysis],"[business, analytics, data analysis, machine l..."
2,victor_bjorsvik,"VICTOR BJORSVIK \n \nLisbon, Portugal | +47 ...","[BS-LEVEL, MS-LEVEL]",MS-LEVEL,"[programming, data analysis, data science, web...","[accounting, big data, communications, busines..."


In [17]:
df2

Unnamed: 0,raw,Minimum degree level,Acceptable majors,Skills
0,Data Scientist\nLisbon\n\n\n\nApply on employe...,BS-LEVEL,"[data analysis, information technology, comput...","[finance, business, visualization, design, mac..."


In [16]:
# Two independent, empty result tables that share the same score columns.
columns = [
    'applicant',
    'job_id',
    'all-mpnet-base-v2_score',
    'paraphrase-MiniLM-L6-v2_score',
]
matching_dataframe, ranking_dataframe = (pd.DataFrame(columns=columns) for _ in range(2))

In [42]:
# Render the value at row 0, column 0 of df2 as Markdown in the notebook output.
# NOTE(review): Latex is imported but not used in this cell.
from IPython.display import display, Markdown, Latex
display(Markdown(df2.iloc[0,0]))

Data Scientist
Lisbon



Apply on employer site


Apply on employer site
GROUP BNP PARIBAS

Present in more than 30 countries, BNP Paribas Personal Finance leads the personal and consumer credit business. In some markets we are leaders, and we bet on innovation to open up new opportunities, in others we are a fast growing business. In all markets, we value relationships, knowledge sharing and responsible action.

The analytical team uses state of the art data analytical and visualization techniques to solve business issues and uncover opportunities for BNP Paribas Person Finance. We design and develop our own ML engines, and we make them available to all PF geographies. We have vast amounts of data in the Bank, and we know that the value will come with robust and secure solutions deployed in production.

ABOUT THE JOB
MISSION

As a Data Scientist you will use data and ML models to improve our interactions with our customers. As part of this, you will :

Collect data (internal & external) and investigate their business value through data analysis.
Run python-based analytical engines (direct marketing, inbound marketing, web navigation) to build or run existing models.
Use Statistical methodologies (such as clustering) to give better insights of our customer database.
Develop & improve data quality controls and standardize processes and analyses
Ensure high quality of delivery
TEAM

The Mission is important, but so is the Team and the workplace!

Welcome to BNP Paribas Personal Finance, where you will integrate the Analytics Hub, an innovative, international and creative team based in Lisbon and Porto. Our working language is English, and we offer a hybrid work model.

You will report hierarchically to the Team Leader. The marketing side of the Hub has three mid-sized teams (Data Scientists, Data Analysts and Python developers), working together on complex challenges.

The team is recent, agile, and everyone is growing together. More than 20 colleagues are working in the same field as you, sharing ideas and experiences, and you will be able to share your insights with the central teams in Paris.

We are prepared to welcome you with an initial onboarding plan, with on the job training, online learning and networking opportunities.

REQUIREMENTS

A degree in Information Technology, Computer Engineering, Physics, Mathematics, Statistics or a related field
Strong programming skills in Python
Practical knowledge in data analysis, applied mathematics, and statistics
Fluency in oral and written English
Knowledge of relational databases such as vertica, oracle, sqlserver
Good skills on collaboration, communication, adaptability, assertiveness and initiative spirit
Interest in continuous learning and new challenges (machine learning, dev)
Availability for occasional travel in Europe
Experience
Some experience in Data Science projects is appreciated

Languages
English - Fluent

Our commitments

BNP Paribas is an equal opportunity employer that is proud to provide equal employment opportunities to all job seekers. As a socially responsible company, we incorporate the principles of Diversity and Inclusion in our values and practices.

To achieve all our goals, we intend to attract, develop, and retain different profiles, assuming diversity as an enabler and differentiator of innovation, fundamental in our organization.

What makes us proud as reference employer?

Top Employer Portugal and Top Employer Europe certification, for the seventh consecutive year;
92% of our employees identify BNP Paribas as a company with "an inclusive management that supports all kinds of differences (age, origins, sexual orientation ...)";
93% of employees identify with and benefit from the "Smart Working" policy, feeling comfortable in a hybrid work environment, and with the digital tools and workspaces available;
71% of our customers are promoters of our brand.

In [37]:
# Score every applicant in `df` against every job posting in `df2`.
columns = ['applicant', 'job_id', 'all-mpnet-base-v2_score', 'paraphrase-MiniLM-L6-v2_score', 'Skill2Vec_score']
ranking_dataframe = pd.DataFrame(columns=columns)

# Rows are accumulated across ALL jobs. Previously the list and the result
# frame were re-created inside the job loop, so only the last job's scores
# survived.
matching_data = []

for job_index in range(df2.shape[0]):
    # Both loops iterate positions, so use positional access throughout.
    job_skills = df2['Skills'].iloc[job_index]

    for applicant_id in range(df.shape[0]):
        applicant_skills = df['Skills'].iloc[applicant_id]
        matching_data.append({
            # Column 0 of `df` holds the applicant name.
            "applicant": df.iloc[applicant_id, 0],
            "job_id": job_index,
            "all-mpnet-base-v2_score": semantic_similarity_sbert_base_v2(job_skills, applicant_skills),
            "paraphrase-MiniLM-L6-v2_score": semantic_similarity_sbert_paraphrase_minilm_l6_v2(job_skills, applicant_skills),
            # NOTE(review): skill2vec_similarity is not defined in the visible
            # part of this notebook; confirm it exists before running.
            "Skill2Vec_score": skill2vec_similarity(job_skills, applicant_skills),
        })

# Build the frame once instead of concatenating onto an empty frame.
matching_dataframe = pd.DataFrame(matching_data, columns=columns)

In [43]:
matching_dataframe

Unnamed: 0,applicant,job_id,all-mpnet-base-v2_score,paraphrase-MiniLM-L6-v2_score
0,hanna_pedersen,0,0.718,0.677
1,irene_abbateli,0,0.706,0.678
2,victor_bjorsvik,0,0.738,0.737
