In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import itertools
import pickle
from collections import defaultdict

In [2]:
# Search titles: the CSV carries its own header row (a 'job_title' column is read from it later).
list1 = pd.read_csv('search_titles.csv')

# Job titles: the CSV has no header row, so the column is labelled explicitly.
list2 = pd.read_csv('job_titles.csv', header=None).set_axis(['job_title'], axis=1)

#### SBERT

In [3]:
class SemanticJobMatcher:
    """
    Match search titles to job titles by sentence-embedding similarity.

    ``fit`` encodes two lists of titles with a SentenceTransformer model and
    builds a table of every (search title, job title) pair. ``predict`` scores
    each pair with cosine similarity, ranks the job titles within each search
    title and returns the best-ranked ones.

    Attributes:
        model: SentenceTransformer instance used to encode titles.
        topn: default number of ranks kept by ``predict``.
        embedded_titles: ``[embeddings of list1, embeddings of list2]``,
            set by ``fit``.
        permutations: DataFrame with one row per (SearchTitle, JobTitle)
            pair, set by ``fit``; ``predict`` adds the ``cosine_scores`` and
            ``ranking`` columns to it.
    """

    def __init__(self, topn=3):
        """
        Load the embedding model.

        Args:
            topn: number of best ranks to keep per search title when
                ``predict`` is called without an explicit ``topn``.
        """
        self.model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
        self.topn = topn

    def fit(self, list1, list2):
        """
        Embed both title lists and enumerate all title pairs.

        Args:
            list1: search titles (become the ``SearchTitle`` column).
            list2: job titles (become the ``JobTitle`` column).
        """
        self.embedded_titles = [
            self.model.encode(titles, show_progress_bar=True)
            for titles in (list1, list2)
        ]
        # product() varies list2 fastest, which is the same order as the
        # row-major flattening of the (len(list1), len(list2)) similarity
        # matrix computed in predict().
        self.permutations = pd.DataFrame(
            list(itertools.product(list1, list2)),
            columns=['SearchTitle', 'JobTitle'],
        )

    def predict(self, topn=None):
        """
        Return the best-matching job titles for every search title.

        Args:
            topn: number of ranks to keep per search title. ``None`` (the
                default) uses the ``topn`` given to the constructor.

        Returns:
            A ``defaultdict(list)`` mapping each search title to a two-item
            list ``[{rank: job title}, {rank: cosine score}]``. Ranks are the
            values produced by pandas ``rank`` (floats such as ``1.0``).
            Ranking is dense, so pairs with equal scores share a rank; for a
            shared rank the ``max`` job title / score is reported.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if not hasattr(self, 'permutations'):
            raise AttributeError('SemanticJobMatcher.predict() called before fit()')
        if topn is None:
            topn = self.topn

        output = defaultdict(list)
        # Nothing to score when either input list was empty.
        if self.permutations.empty:
            return output

        similarity = cosine_similarity(self.embedded_titles[0], self.embedded_titles[1])
        # Flatten to 1-D so the scores line up row-for-row with the pair table.
        self.permutations['cosine_scores'] = similarity.reshape(-1)
        self.permutations['ranking'] = (
            self.permutations
            .groupby('SearchTitle')['cosine_scores']
            .rank(method='dense', ascending=False)
        )

        within_topn = self.permutations['ranking'] <= topn
        filtered = self.permutations[within_topn].sort_values(['SearchTitle', 'ranking'])
        if filtered.empty:
            return output

        # Titles first, scores second: callers index the result as [0] / [1].
        for column in ('JobTitle', 'cosine_scores'):
            for search_title, by_rank in self._pivot_by_rank(filtered, column).items():
                output[search_title].append(by_rank)

        return output

    @staticmethod
    def _pivot_by_rank(filtered, column):
        """Pivot ``column`` into ``{search title: {rank: value}}``."""
        return filtered.pivot_table(column,
                                    index='SearchTitle',
                                    columns='ranking',
                                    aggfunc='max').to_dict('index')

        

In [4]:
# Build the matcher, then fit it on a subset of the data:
# the first 100 search titles against the first 1500 job titles.
model = SemanticJobMatcher()
model.fit(
    list(list1['job_title'].iloc[:100]),
    list(list2['job_title'].iloc[:1500]),
)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
# predict() returns a mapping: search title -> [{rank: job title}, {rank: cosine score}].
perms = model.predict(topn=3)

In [None]:
# Display the matches returned by predict().
perms