# Matching using Cosine similariy algorithm

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets of end-of-study internships (PFE) and resumes for students
internships_offers = pd.read_excel('pfe_dataset.xlsx')
cvs = pd.read_csv('cv_dataset.csv')

# Remove whitespace and split skills column in both DataFrames
internships_offers["Skills"] = internships_offers["Skills"].apply(lambda x: x.split(",") if isinstance(x, str) else [])
cvs["skills"] = cvs["skills"].apply(lambda x: x.split(",") if isinstance(x, str) else [])

# Combine skills into a single string for each offer and resume
internships_offers["Skills_str"] = internships_offers["Skills"].apply(lambda x: " ".join(x))
cvs["skills_str"] = cvs["skills"].apply(lambda x: " ".join(x))

# Use CountVectorizer to create a matrix of skill frequencies for each offer and resume
count_vect = CountVectorizer()
offer_skill_matrix = count_vect.fit_transform(internships_offers["Skills_str"])
cv_skill_matrix = count_vect.transform(cvs["skills_str"])

# Calculate the cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(offer_skill_matrix, cv_skill_matrix)

# Sort the similarity matrix by similarity score
sorted_cosine_similarity_matrix = cosine_similarity_matrix.argsort(axis=1)[:, ::-1]

# Print the top 5 matches for each internship offer
for i, row in enumerate(sorted_cosine_similarity_matrix):
    print("The top 5 matches for internship offer {} are:".format(internships_offers.iloc[i]["Title"]))
    for j in range(5):
        similarity_score = cosine_similarity_matrix[i, row[j]]
        print("  - CV {} with similarity score {}".format(row[j], similarity_score))


The top 5 matches for internship offer DÉVELOPPEMENT D’UN SYSTÈME D'INFO DIVERTISSEMENT SUR UNE IMAGE AGL (AUTOMOTIVE GRADE LINUX) are:
  - CV 5580 with similarity score 0.0
  - CV 1862 with similarity score 0.0
  - CV 1854 with similarity score 0.0
  - CV 1855 with similarity score 0.0
  - CV 1856 with similarity score 0.0
The top 5 matches for internship offer DÉVELOPPEMENT D’UN OUTIL DE DIAGNOSTIC AVEC INTERFACE GRAPHIQUE PERMETTANT DE COMMUNIQUER AVEC UN CALCULATEUR AUTOMOBILE EMBARQUÉ DE TÉLÉMATIQUE (TCU) are:
  - CV 3993 with similarity score 0.14433756729740646
  - CV 1758 with similarity score 0.13608276348795437
  - CV 4067 with similarity score 0.13608276348795437
  - CV 4154 with similarity score 0.13245323570650439
  - CV 4083 with similarity score 0.13245323570650439
The top 5 matches for internship offer DÉVELOPPEMENT D’UNE SOLUTION DE DÉBOGAGE OTA are:
  - CV 5176 with similarity score 0.5773502691896258
  - CV 4931 with similarity score 0.5773502691896258
  - CV 3954 wi

  - CV 5215 with similarity score 0.35355339059327384
  - CV 5501 with similarity score 0.3333333333333334
  - CV 5369 with similarity score 0.2886751345948129
  - CV 5486 with similarity score 0.2886751345948129
The top 5 matches for internship offer Developing an application for Mobility workflow are:
  - CV 5020 with similarity score 0.7745966692414834
  - CV 5365 with similarity score 0.7745966692414834
  - CV 5396 with similarity score 0.7745966692414834
  - CV 4794 with similarity score 0.7745966692414834
  - CV 4609 with similarity score 0.7745966692414834
The top 5 matches for internship offer Digital Pre-onboarding and Onboarding are:
  - CV 4893 with similarity score 0.2182178902359924
  - CV 4886 with similarity score 0.2182178902359924
  - CV 4726 with similarity score 0.2182178902359924
  - CV 4719 with similarity score 0.1889822365046136
  - CV 4725 with similarity score 0.1889822365046136
The top 5 matches for internship offer Dynamic HR Dashboard are:
  - CV 4893 with s

# Matching using BERT algorithm

In [34]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel



# Load the datasets of end-of-study internships (PFE) and resumes for students

def find_top_matches(cv_skills, model):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    internships_offers = pd.read_excel('pfe_dataset.xlsx')
    internships_offers["Skills"] = internships_offers["Skills"].apply(lambda x: x.split(",") if isinstance(x, str) else [])
    internships_offers["Skills_str"] = internships_offers["Skills"].apply(lambda x: " ".join(x))
    # Compute the BERT embedding for the skills of the given resume
    cv_skill_embeddings = model(**tokenizer(cv_skills, padding=True, truncation=True, return_tensors="pt")).pooler_output[0].detach().numpy()

    # Calculate the cosine similarity scores
    cosine_similarity_scores = []
    for i in range(len(internships_offers)):
        offer_skill_embeddings = model(**tokenizer(internships_offers["Skills_str"][i], padding=True, truncation=True, return_tensors="pt")).pooler_output[0].detach().numpy()
        cosine_similarity_scores.append(np.dot(offer_skill_embeddings, cv_skill_embeddings.T) / (np.linalg.norm(offer_skill_embeddings) * np.linalg.norm(cv_skill_embeddings)))

    # Sort the similarity scores by descending order
    top_matches_indices = np.argsort(cosine_similarity_scores)[::-1][:5]

    # Print the top 5 matches with the job title and company
    print("The top 5 matches for the given CV are:")
    for i in top_matches_indices:
        job_title = internships_offers.iloc[i]["Title"]
        company = internships_offers.iloc[i]["CompanyName"]
        similarity_score = cosine_similarity_scores[i]
        print(f"- {job_title} at {company} with similarity score {similarity_score}")


In [35]:
model = AutoModel.from_pretrained('bert-base-uncased')
cv_skills = "Python, Machine Learning, Data Analysis, Problem Solving"
find_top_matches(cv_skills, model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The top 5 matches for the given CV are:
- Bounding box labeling on internal website at DRAXLMAIER with similarity score 0.9966569542884827
- Object tracking for production settings at DRAXLMAIER with similarity score 0.9966569542884827
- UX/UI MOBILE APPLICATION MYACP at AxeFinance with similarity score 0.9963304996490479
- PROOF OF CONCEPT (PROTOTYPE) OF A V2V COMMUNICATION SOLUTIONS / V2V NETWORK at Capgemini with similarity score 0.9960061311721802
- MLOPS - CREDIT SCORING MODELING AND MONITORING at AxeFinance with similarity score 0.9958928823471069


In [43]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-win_amd64.whl (977 kB)
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=ebb6de09fe678b8e2d68e441a0eacec07008e64fefbcab7d432af690074212d5
  Stored in directory: c:\users\barki\appdata\local\pip\cache\wheels\71\67\06\162a3760c40d74dd40bc855d527008d26341c2b0ecf3e8e11f
Successfully built sentence-transformers
Installing collected packages: sentencepiece, sentence-transformers
Successfully installed sentence-transformers-2.2.2 sentencepiece-0.1.98
Note: you may need to restart the kernel to use updated packages.




In [5]:
df

Unnamed: 0,Title,Description,Skills,Number of Interns,Duration,publication_date,Location,CompanyName
0,DÉVELOPPEMENT D’UN SYSTÈME D'INFO DIVERTISSEME...,IMPLÉMENTATION DES FONCTIONNALITÉS ( BLUETOOTH...,"C, BASH, YOCTO",1,4 À 6,2023-01-05,ARIANA,ACTIA
1,DÉVELOPPEMENT D’UN OUTIL DE DIAGNOSTIC AVEC IN...,INTERFACE GRAPHIQUE PERMETTANT DE COMMUNIQUER ...,"PYTHON, C, BASH, YOCTO",1,4 À 6,2023-01-22,ARIANA,ACTIA
2,DÉVELOPPEMENT D’UNE SOLUTION DE DÉBOGAGE OTA,APPLICATION WEB ET D’UN PROGRAMME C (QUI SERA ...,"WEB (PHP, HTML,...), C",1,4 À 6,2022-12-21,ARIANA,ACTIA
3,GPS NMEA GENERATOR FOR TRACKING SYSTEM VIRTUAL...,IL S'AGIT DE DEVELOPPER UN SIMULTAUER GPS L'AP...,"LINUX, C, C++, QT",1,4 À 6,2022-12-02,ARIANA,ACTIA
4,CONCEPTION ET DÉVELOPPEMENT D'UN SYSTÈME D'ÉLE...,L'ANALYSE ET CONCEPTION DE LA SOLUTION. - CONV...,"C EMBARQUÉ, CHORA, STRUCTURED TEXT",1,4 À 6,2023-01-14,ARIANA,ACTIA
...,...,...,...,...,...,...,...,...
551,Rédaction web pour le site de Stark Solutions,Proposition des améliorations sur le site web ...,"Marketing digital , ERP",1,4 à 6,2023-01-05,Sfax,Spark-it
552,Référencement naturel du site-web de Stark Sol...,Elaboration des différentes étapes du référenc...,"Marketing digital ,Référencement naturel ,SEO",1,4 à 6,2023-01-06,Sfax,Spark-it
553,Social media management de Stark Solutions,Développement de la notoriété et l'image de ma...,"Marketing digital , Social media management , ...",1,4 à 6,2023-01-30,Sfax,Spark-it
554,Proposition et conception d'une spécification ...,Etude des besoins fonctionnels d'un ERP pour l...,"Scrum , Analyse fonctionnelle , gestion de pro...",1,4 à 6,2022-12-19,Sfax,Spark-it
