In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import itertools
import pickle
from collections import defaultdict

In [2]:
# Search titles: this CSV carries its own header row (a `job_title` column is read from it later).
list1 = pd.read_csv('List1.csv')

# Candidate job titles: the file has no header row, so its column is labelled explicitly.
list2 = pd.read_csv('List II.csv', header=None).set_axis(['job_title'], axis=1)

#### SBERT: semantic job-title matching with Sentence-BERT embeddings

In [3]:
class SemanticJobMatcher:
    """Match search titles to job titles by embedding cosine similarity.

    ``fit`` embeds both title lists and enumerates every
    (search title, job title) pair; ``predict`` scores those pairs and
    returns the best-ranked job titles for each search title.
    """

    def __init__(self, model_name='sentence-transformers/bert-base-nli-mean-tokens'):
        """Load the sentence-embedding model.

        Parameters
        ----------
        model_name : str, optional
            Identifier handed to ``SentenceTransformer``. The default is the
            checkpoint that used to be hard-coded here.
        """
        self.model = SentenceTransformer(model_name)

    def fit(self, list1, list2):
        """Embed both title collections and build the table of all pairings.

        Parameters
        ----------
        list1 : iterable
            Search titles (the queries).
        list2 : iterable
            Candidate job titles to rank against each search title.

        Notes
        -----
        Sets ``self.embedded_titles`` to ``[embeddings(list1), embeddings(list2)]``
        and ``self.permutations`` to a DataFrame with columns ``SearchTitle``
        and ``JobTitle`` holding the Cartesian product of the two inputs.
        """
        # Materialise once: each input is iterated twice below (encode + product),
        # so a generator or other one-shot iterable would otherwise come up empty.
        list1, list2 = list(list1), list(list2)

        self.embedded_titles = [
            self.model.encode(titles, show_progress_bar=True)
            for titles in (list1, list2)
        ]
        self.permutations = pd.DataFrame(
            list(itertools.product(list1, list2)),
            columns=['SearchTitle', 'JobTitle'],
        )

    def predict(self, topn=3):
        """Return the top-ranked job titles and their scores per search title.

        Parameters
        ----------
        topn : int, optional
            Highest dense rank to keep for each search title (default 3).

        Returns
        -------
        collections.defaultdict
            Maps each search title to a two-element list:
            ``[{rank: job title}, {rank: cosine score}]``. Ranks are the float
            values produced by dense ranking (1.0, 2.0, ...). Pairs with equal
            scores share a rank, and ``max`` keeps one title for that rank.

        Raises
        ------
        AttributeError
            If called before ``fit``.

        Notes
        -----
        Adds/overwrites the ``cosine_scores`` and ``ranking`` columns on
        ``self.permutations``.
        """
        # getattr: the attributes do not exist at all until fit() has run.
        if (getattr(self, 'embedded_titles', None) is None
                or getattr(self, 'permutations', None) is None):
            raise AttributeError(
                "SemanticJobMatcher.predict() called before fit(); "
                "call fit(list1, list2) first"
            )

        output = defaultdict(list)
        if self.permutations.empty:
            # One of the fitted lists was empty: there is nothing to score.
            return output

        search_vectors, job_vectors = self.embedded_titles
        # The similarity matrix is (n_search, n_job); flattening it row by row
        # yields the same ordering as itertools.product(list1, list2) in fit().
        self.permutations['cosine_scores'] = cosine_similarity(search_vectors, job_vectors).ravel()
        self.permutations['ranking'] = (
            self.permutations
            .groupby('SearchTitle')['cosine_scores']
            .rank(method='dense', ascending=False)
        )

        filtered = self.permutations[self.permutations['ranking'] <= topn]

        # Titles first, then scores, so every value is [titles_by_rank, scores_by_rank].
        for column in ('JobTitle', 'cosine_scores'):
            by_rank = filtered.pivot_table(column,
                                           index='SearchTitle',
                                           columns='ranking',
                                           aggfunc='max').to_dict('index')
            for key, value in by_rank.items():
                output[key].append(value)

        return output

        

In [4]:
# Only the leading rows of each list are matched in this run.
N_SEARCH_TITLES = 100
N_JOB_TITLES = 1500

search_titles = list(list1['job_title'].iloc[:N_SEARCH_TITLES])
job_titles = list(list2['job_title'].iloc[:N_JOB_TITLES])

model = SemanticJobMatcher()
model.fit(search_titles, job_titles)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [5]:
# Score every (search title, job title) pair and keep, per search title, the entries ranked 1-3.
perms = model.predict(topn=3)

In [6]:
# Show the result: search title -> [{rank: job title}, {rank: cosine score}].
perms

defaultdict(list,
            {'accounting intern': [{1.0: 'Accounting Intern',
               2.0: 'Accounting Services Intern',
               3.0: 'Accounting Department Intern'},
              {1.0: 0.9999998807907104,
               2.0: 0.9752693772315979,
               3.0: 0.9654386043548584}],
             'advanced account manager': [{1.0: 'Advanced Technology Account Executive',
               2.0: 'Account Development Manager',
               3.0: 'Account Support Analyst'},
              {1.0: 0.890179455280304,
               2.0: 0.8751572370529175,
               3.0: 0.8728467226028442}],
             'affiliate account manager': [{1.0: 'Affiliate Account Manager',
               2.0: 'Affiliate Marketing Manager',
               3.0: 'Affiliate Sales Manager'},
              {1.0: 0.9999998807907104,
               2.0: 0.8923882246017456,
               3.0: 0.8623831272125244}],
             'allocation analyst': [{1.0: 'Allocation Analyst',
               2.0: 'Ap