## 0. Import

In [1]:
# Mount Google Drive at /content/drive; later cells read the dataset and the
# saved model from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install -U sentence-transformers
!pip install openpyxl 

In [3]:
from torch.utils.data import DataLoader
import torch
import math
from zipfile import ZipFile
import json 
from sentence_transformers import SentenceTransformer,  SentencesDataset, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader, InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sklearn.metrics.pairwise import paired_cosine_distances
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

## 1. Prepare test data

In [23]:
# Load the evaluation sheet from the mounted Drive. Later cells treat
# 'sentences 1' as the CV text and 'sentences 2' as the JD text.
data = pd.read_excel("/content/drive/MyDrive/Akatsuki_T2/BERT/data_test_set.xlsx")
# Show the loaded frame as the cell output.
data

Unnamed: 0,id,sentences 1,sentences 2,Position,Score,split,Expect Position top 1,Unnamed: 7
0,1,ai engineer 06/2020 - hiện tại 1.autopilot pr...,job overview bap software co ltd 10f 81 quang ...,ADMINISTRATIVE OFFICER,0.0,train,AI ENGINEER,INTERN _ FRESHER
1,1,ai engineer 06/2020 - hiện tại 1.autopilot pr...,bap software co ltd 10f 81 quang trung st hai ...,AI ENGINEER,4.0,train,AI ENGINEER,INTERN _ FRESHER
2,1,ai engineer 06/2020 - hiện tại 1.autopilot pr...,job overview bap software co ltd 10f 81 quang ...,ANDROID DEV,0.0,train,AI ENGINEER,INTERN _ FRESHER
3,1,ai engineer 06/2020 - hiện tại 1.autopilot pr...,bap software co ltd 10f 81 quang trung st hai ...,BLOCKCHAIN DEVELOPER,0.0,train,AI ENGINEER,INTERN _ FRESHER
4,1,ai engineer 06/2020 - hiện tại 1.autopilot pr...,bap software co ltd 10f 81 quang trung st hai ...,C# DEV,0.0,train,AI ENGINEER,INTERN _ FRESHER
...,...,...,...,...,...,...,...,...
1394,100,education duy tan university major : finance b...,job overview bap software co ltd 10f 81 quang ...,NODE JS,0.0,train,JAVA DEV,FRONTEND DEV
1395,100,education duy tan university major : finance b...,job overview bap software co ltd 10f 81 quang ...,PHP DEV,0.0,train,JAVA DEV,FRONTEND DEV
1396,100,education duy tan university major : finance b...,bap software co ltd 10f 81 quang trung st hai ...,PYTHON DEVELOPER,0.0,train,JAVA DEV,FRONTEND DEV
1397,100,education duy tan university major : finance b...,job overview bap software co ltd 10f 81 quang ...,QC_ TESTER,0.0,train,JAVA DEV,FRONTEND DEV


In [None]:
# List the distinct values found in the 'sentences 1' column.
data['sentences 1'].unique()

In [24]:
# Regroup the flat table into one record per distinct `id`, keyed 0..k-1 in
# order of first appearance. Every record holds parallel lists with one entry
# per row belonging to that id, in the table's row order.
#
# A single groupby(sort=False) pass builds all groups; the column values are
# taken with .tolist() so nothing relies on side effects inside DataFrame.apply.
sentences_pair = {}

for i, (group_id, data_) in enumerate(data.groupby('id', sort=False)):
    sentences_pair[i] = {
        'CV': data_['sentences 1'].tolist(),
        'JD': data_['sentences 2'].tolist(),
        # Placeholder: the cosine scores are assigned by the scoring loop later on.
        'score': [],
        'Position': data_['Position'].tolist(),
        'Expect Position top 1': data_['Expect Position top 1'].tolist(),
        # The second expected position is read from the sheet's 'Unnamed: 7' column.
        'Expect Position top 2': data_['Unnamed: 7'].tolist(),
    }

## 3. Load model and calculate cosine scores

In [6]:
# Prefer the GPU, but fall back to CPU so the notebook still runs on a
# runtime where CUDA is not available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the saved SentenceTransformer model directory from the mounted Drive.
model = SentenceTransformer(model_name_or_path='/content/drive/MyDrive/Akatsuki_T2/BERT/BERT_step_2_(model)', device=device)

In [14]:
# Sample text used by the next cell to try out the encoder on a single string.
sentence = "experience using maven . experience using html , css , bootstrap , jquery , ajax , thymeleaf"

In [19]:
# Encode the single sample string with the loaded model.
sentence_embedding = model.encode(sentences=sentence, batch_size=8)
# Show the shape of the resulting embedding as the cell output.
sentence_embedding.shape

(768,)

In [7]:
def cal_cosine_score(sentences1: List[str], sentences2: List[str], batch_size: int = 8, show_progress_bar: bool = False) -> List[float]:
    """
    Compute the cosine similarity of each aligned sentence pair.

    Both lists are embedded with the notebook-level ``model`` and compared
    element-wise: entry ``i`` of the result is the similarity between
    ``sentences1[i]`` and ``sentences2[i]``.

    :param sentences1: List with the first sentence in a pair
    :param sentences2: List with the second sentence in a pair
    :param batch_size: Batch size forwarded to ``model.encode``
    :param show_progress_bar: Whether ``model.encode`` displays a progress bar
    :return: One cosine similarity (``1 - cosine distance``) per pair
    :raises ValueError: If the two lists do not have the same length
    """
    if len(sentences1) != len(sentences2):
        # Pairs are matched by position, so reject mismatched inputs before
        # spending time on encoding.
        raise ValueError(
            "sentences1 and sentences2 must have the same length, got {} and {}".format(
                len(sentences1), len(sentences2)
            )
        )
    if len(sentences1) == 0:
        # No pairs to score; skip the encoder and the pairwise distance call.
        return []

    embeddings1 = model.encode(sentences1, batch_size=batch_size, show_progress_bar=show_progress_bar, convert_to_numpy=True)
    embeddings2 = model.encode(sentences2, batch_size=batch_size, show_progress_bar=show_progress_bar, convert_to_numpy=True)

    # paired_cosine_distances returns distances; similarity is its complement.
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    return cosine_scores.tolist()

In [None]:
# Score every group: compare each CV entry with the JD entry at the same
# position and store the resulting similarities under the group's 'score' key.
for pair_group in sentences_pair.values():
    pair_group['score'] = cal_cosine_score(
        sentences1=pair_group['CV'],
        sentences2=pair_group['JD'],
        show_progress_bar=True,
    )

In [28]:
# For every group, rank its rows by cosine score (highest first) and keep the
# three best as a list of records.
list_top_3_score = []
for index in range(len(sentences_pair)):
    sentences_pair_ = pd.DataFrame.from_dict(sentences_pair[index])
    # Sort once and keep only the columns needed for the comparison below.
    top_3_score_position_only = sentences_pair_.sort_values(by=['score'], ascending = False).head(3)[['Position', 'score', 'Expect Position top 1', 'Expect Position top 2']]
    top_3_score = top_3_score_position_only.to_dict('records')
    list_top_3_score.append(top_3_score)

# Count how often the best-ranked position equals the first expected position,
# and how often the second-ranked position equals the second expected one.
count_top1 = 0
count_top2 = 0
for score in list_top_3_score:
    # The length guards let a group with fewer than two rows simply not count
    # instead of raising IndexError.
    if len(score) > 0 and score[0]['Position'] == score[0]['Expect Position top 1']:
        count_top1 += 1
    if len(score) > 1 and score[1]['Position'] == score[1]['Expect Position top 2']:
        count_top2 += 1

# NOTE: the printed values are hit counts over len(list_top_3_score) groups,
# not ratios.
print("Acc top 1 : {}, acc top 2 : {}".format(count_top1, count_top2))

Acc top 1 : 85, acc top 2 : 37
