In [1]:
import torch
import fitz
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn.functional import cosine_similarity
import os
import pandas as pd

In [2]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def extract_text_from_pdf(pdf_file):
    # Open the PDF file
    doc = fitz.open(pdf_file)
    text = ""
    # Extract text from each page
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text("text")
    return text

def get_text_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the mean of the hidden states to get sentence embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

def cosine_similarity_manual(embedding1, embedding2):
    # Compute cosine similarity manually
    return cosine_similarity(embedding1, embedding2).item()

def compare_resume_lor(resume_pdf, lor_txt_file):
    # Load resume (PDF) and LOR (txt) files
    resume_text = extract_text_from_pdf(resume_pdf)
    
    with open(lor_txt_file, 'r') as file:
        lor_text = file.read()

    # Get text embeddings for resume and LOR
    resume_embeddings = get_text_embeddings(resume_text, tokenizer, model)
    lor_embeddings = get_text_embeddings(lor_text, tokenizer, model)
    
    # Compute cosine similarity between resume and LOR
    similarity = cosine_similarity_manual(resume_embeddings, lor_embeddings)
    
    # Interpret similarity score
    return similarity

In [3]:
reco_dir="C:/Users/ojasm/Downloads/INNOV8-2.0-Finals-main/INNOV8-2.0-Finals-main/Dataset/Final_Recommendation_Letters"
cv_dir="C:/Users/ojasm/Downloads/INNOV8-2.0-Finals-main/INNOV8-2.0-Finals-main/Dataset/Final_Resumes"

In [5]:
df=pd.read_csv("C:/Users/ojasm/Downloads/FINAllllllll.csv")
list_=[]
for i in os.listdir(cv_dir):
    identifier = i[13:-4]
    cv_pdf_path=os.path.join(cv_dir+"/"+i)
    reco_path=os.path.join(reco_dir+"/"+"Recommendation_Letters_of_ID_"+i[13:-4])
    try:
        overall_=0
        count=0
        for j in os.listdir(reco_path):
            lor_txt_path=os.path.join(reco_path+"/"+j)
            result = compare_resume_lor(cv_pdf_path, lor_txt_path)
            overall_+=result
            count+=1
        similarity_score=overall_/count
        print(similarity_score)
        list_.append(similarity_score)
        print(len(list_))
        df.loc[df['ID'] == identifier, 'similarity_score'] = similarity_score
    except:
        pass

0.8916672468185425
1
0.8504253476858139
2
0.9023335576057434
3
0.8724178075790405
4
0.8759353756904602
5
0.8654346168041229
6
0.8643427193164825
7
0.8950602412223816
8
0.8748337179422379
9
0.863372415304184
10
0.8711957931518555
11
0.8757458130518595
12
0.8362785577774048
13
0.8376553952693939
14
0.8609067797660828
15
0.8847638517618179
16
0.8910247683525085
17
0.8452140688896179
18
0.8827096422513326
19
0.8708735770649381
20
0.8684097528457642
21
0.840105414390564
22
0.8914007663726806
23
0.8963165283203125
24
0.8690292537212372
25
0.9017884135246277
26
0.7027454972267151
27
0.8206374943256378
28
0.8971022367477417
29
0.8691570361455282
30
0.8365508317947388
31
0.8631131947040558
32
0.8178698793053627
33
0.8961756706237793
34
0.8350711911916733
35
0.9005799889564514
36
0.896591454744339
37
0.8683952987194061
38
0.8713889817396799
39
0.888187026977539
40
0.8767268359661102
41
0.8566340506076813
42
0.8643835783004761
43
0.8370620948927743
44
0.884259045124054
45
0.8847631216049194
46
0.

In [6]:
df

Unnamed: 0,ID,Recommenders ID,Total Number of LORs,Total Valid LORs,similarity_score
0,0,"[218, 391]",2,,
1,1,"[412, 869, 233, 289]",4,3.0,
2,2,"[582, 624, 592, 662, 469]",5,3.0,
3,3,"[194, 122]",2,1.0,
4,4,"[763, 726, 589, 977, 950, 543, 30]",7,6.0,
...,...,...,...,...,...
959,995,"[51, 290, 370]",3,2.0,
960,996,"[254, 890, 296]",3,2.0,
961,997,"[850, 228, 194]",3,2.0,
962,998,"[258, 471]",2,1.0,
