In [1]:
import joblib 
import pickle
import pandas as pd
import numpy as np

In [2]:
# Restore the trained tagging model from 'tagging_model.pkl' (path is relative
# to the notebook's working directory).
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code —
# only load artifacts from a trusted source.
loaded_model = joblib.load('tagging_model.pkl')

In [3]:
# Restore the collection of users that were known at training time.
# It is used further down to decide whether a user gets a real id or -1.
# (presumably a list of names such as 'User2' — verify against the training notebook)
with open('users_available.pkl', mode='rb') as users_file:
    loaded_users_available = pickle.load(users_file)

In [18]:
# One hand-written transaction used to exercise the inference path end to end.
sample_record = {'User': 'User2', 'Transaction': 'IKEA INDIA PVT L'}
sample_transaction = pd.DataFrame([sample_record])

In [19]:
# Display the sample row to confirm it was built as intended.
sample_transaction

Unnamed: 0,User,Transaction
0,User2,IKEA INDIA PVT L


In [20]:
## Let us try with all-MiniLM-L6-v2
from sentence_transformers import SentenceTransformer

In [21]:
# Embed each transaction description with the all-MiniLM-L6-v2 sentence encoder.
sentences = sample_transaction['Transaction'].to_list()
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
# pd.concat(axis=1) aligns rows on index labels, so the embedding frame must
# carry the same index as the transactions. Without this, a transaction frame
# with a non-default index (e.g. after filtering) would produce misaligned,
# NaN-padded rows instead of one embedding per transaction.
embeddings_frame = pd.DataFrame(embeddings, index=sample_transaction.index)
inference_dataset = pd.concat([sample_transaction, embeddings_frame], axis=1)

In [22]:
# Inspect the sample joined with its embedding columns (named 0, 1, 2, ... at this point).
inference_dataset

Unnamed: 0,User,Transaction,0,1,2,3,4,5,6,7,...,374,375,376,377,378,379,380,381,382,383
0,User2,IKEA INDIA PVT L,-0.078475,0.011922,-0.015605,8.5e-05,0.01587,0.040188,0.032911,0.035053,...,-0.032525,0.024515,-0.028897,0.046615,-0.009152,0.053157,0.072258,-0.053432,-0.093978,0.053792


In [23]:
# Encode each user as the integer suffix of its name (e.g. 'User2' -> 2).
# A user that is not in loaded_users_available (i.e. not seen at training time)
# gets the sentinel -1 instead.
# The suffix is cast to int so the column holds one numeric type; previously
# known users produced the string '2' while unknown users produced the int -1,
# leaving a mixed str/int object column.
# NOTE(review): assumes every known user name is 'User' followed by digits —
# confirm against how User_id was built for training.
inference_dataset['User_id'] = inference_dataset['User'].apply(
    lambda x: int(x.replace('User', '')) if x in loaded_users_available else -1
)

In [24]:
# Column names can't be of mixed datatype, so rename the integer embedding
# columns 0..N-1 to the strings 'Feature0'..'Feature{N-1}'.
# N is read from the embeddings array instead of being hard-coded to 384, and
# the range stops at N: the earlier range(N + 1) also created a mapping entry
# for a column that does not exist.
num_dimensions = embeddings.shape[1]
column_mapping = {i: f'Feature{i}' for i in range(num_dimensions)}
inference_dataset = inference_dataset.rename(columns=column_mapping)

In [25]:
# Keep only the model inputs: the raw text columns are dropped, leaving the
# Feature* embedding columns and User_id.
predictor_point = inference_dataset.drop(['User', 'Transaction'], axis=1)

In [26]:
# Inspect the exact feature row that will be handed to the model.
predictor_point

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature375,Feature376,Feature377,Feature378,Feature379,Feature380,Feature381,Feature382,Feature383,User_id
0,-0.078475,0.011922,-0.015605,8.5e-05,0.01587,0.040188,0.032911,0.035053,0.027151,-0.003045,...,0.024515,-0.028897,0.046615,-0.009152,0.053157,0.072258,-0.053432,-0.093978,0.053792,2


In [27]:
# Get the model's class-probability estimates for the sample; the top three
# tags are selected from these probabilities in the following cell.
# (presumably shape (n_samples, n_classes), since the later code sorts along axis=1 — confirm)
y_proba = loaded_model.predict_proba(predictor_point)

In [28]:
# Rank the classes of every row by probability and keep the three best.
# argsort is ascending, so the last three columns are the most likely classes.
top_three = np.argsort(y_proba, axis=1)[:, -3:]
top_three_labels = loaded_model.classes_[top_three]
top_three_probs = np.take_along_axis(y_proba, top_three, axis=1)

# Pair each label with its probability and order every row from most to least likely.
top_three_with_probs = []
for row_labels, row_probs in zip(top_three_labels, top_three_probs):
    label_prob_pairs = list(zip(row_labels, row_probs))
    label_prob_pairs.sort(key=lambda pair: pair[1], reverse=True)
    top_three_with_probs.append(label_prob_pairs)

In [29]:
# Show the (tag, probability) pairs for the sample, most likely first.
top_three_with_probs

[[('Shopping', 0.7585190287567656),
  ('Travel', 0.11824421107613903),
  ('Medical', 0.05227968806902819)]]