In [1]:
import pandas as pd 
import numpy as np
import random
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader



In [2]:
import random
import numpy as np
import torch

SEED = 42

# Python built-in RNG
random.seed(SEED)

# Numpy RNG
np.random.seed(SEED)

# PyTorch RNG
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ensure deterministic operations in PyTorch
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [3]:
# finetuning for ambiguous terms  
ambiguous_terms = pd.read_csv("data/ambiguous_terms.csv", header=0) 
X_text = ambiguous_terms['name'].tolist()
y = ambiguous_terms['category'].tolist()
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_classes = len(le.classes_)
print(le.classes_)

['entertainment' 'grocery' 'health & wellness' 'restaurant' 'shopping'
 'transportation']


In [4]:
train_examples = [InputExample(texts=[text,""], label=label) for text, label in zip(X_text, y_encoded)]

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=num_classes
)

In [6]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=10
)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
df = pd.read_csv("data/name_price.csv", header=0) 
# df.head()
df_X = df['name'].tolist()
df_y = df['category'].tolist()
df_le = LabelEncoder()
df_X_embedded = model.encode(df_X, convert_to_numpy=True)  # Embeddings from finetuned SBERT
df_y_encoded = df_le.fit_transform(df_y)

clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED)
clf.fit(df_X_embedded, df_y_encoded)
print(df_le.classes_)

['bills' 'entertainment' 'grocery' 'health & wellness' 'restaurant'
 'shopping' 'transportation']


In [8]:
def soft_pred(name): 
    embedding = model.encode([name])
    proba = clf.predict_proba(embedding)[0]
    
    # Get top 3 indices sorted by probability (descending)
    top3_idx = np.argsort(proba)[-3:][::-1]
    top_class = df_le.classes_[top3_idx[0]]
    top_proba = proba[top3_idx[0]]

    # Get runner-up classes (2nd and 3rd highest)
    runner_ups = [(df_le.classes_[i], proba[i]) for i in top3_idx[1:]]

    # Format result
    result = f"→ Predicted category: {top_class} ({top_proba:.2f})"
    if runner_ups:
        others_formatted = ", ".join([f"{cls} ({p:.2f})" for cls, p in runner_ups])
        result += f" | Other possible classes: {others_formatted}"
    
    return result


In [38]:
# try it out! 
# print(soft_pred("trattoria taverniti"))
# print(soft_pred("foam roller"))
# print(soft_pred("supermarket"))
print(soft_pred("uniqlo"))

→ Predicted category: restaurant (0.26) | Other possible classes: shopping (0.22), grocery (0.20)


In [10]:
def soft_pred_testing(name, margin=0.7): 
    embedding = model.encode([name])
    proba = clf.predict_proba(embedding)[0]
    
    # Get top two indices and their probabilities
    top2_idx = np.argsort(proba)[-2:][::-1]
    p1, p2 = proba[top2_idx[0]], proba[top2_idx[1]]
    c1, c2 = df_le.classes_[top2_idx[0]], df_le.classes_[top2_idx[1]]
    
    if p1 - p2 >= margin:
        return f"→ Predicted category: {c1} ({p1:.2f})"
    else:
        return f"→ Ambiguous. Top 2: {c1} ({p1:.2f}), {c2} ({p2:.2f})"


In [11]:
print(soft_pred_testing("osteria giulia"))
print(soft_pred_testing("freshway foodmart"))
print(soft_pred_testing("basketball"))
print(soft_pred_testing("basketball drop in"))
print(soft_pred_testing("physio"))
print(soft_pred_testing("physiology"))
print(soft_pred_testing("uber"))
print(soft_pred_testing("uber eats"))
print(soft_pred_testing("Subway"))
print(soft_pred_testing("metro"))


→ Ambiguous. Top 2: restaurant (0.36), grocery (0.15)
→ Ambiguous. Top 2: grocery (0.48), restaurant (0.28)
→ Ambiguous. Top 2: health & wellness (0.28), entertainment (0.23)
→ Ambiguous. Top 2: health & wellness (0.47), entertainment (0.14)
→ Ambiguous. Top 2: health & wellness (0.40), grocery (0.14)
→ Ambiguous. Top 2: health & wellness (0.56), shopping (0.10)
→ Ambiguous. Top 2: transportation (0.73), grocery (0.06)
→ Ambiguous. Top 2: transportation (0.34), restaurant (0.26)
→ Ambiguous. Top 2: transportation (0.57), restaurant (0.17)
→ Ambiguous. Top 2: transportation (0.41), grocery (0.36)


In [12]:
def print_all_probs(name):
    embedding = model.encode([name])
    proba = clf.predict_proba(embedding)[0]
    
    # top5_idx = np.argsort(proba)[-5:][::-1]
    
    result = [f"{df_le.classes_[i]} ({proba[i]:.2f})" for i in np.argsort(proba)[::-1]]
    return "→ predictions: " + ", ".join(result)

print(print_all_probs("freshway foodmart"))
print(print_all_probs("shell"))
print(print_all_probs("casa loma"))

→ predictions: grocery (0.48), restaurant (0.28), transportation (0.09), shopping (0.06), entertainment (0.04), health & wellness (0.03), bills (0.02)
→ predictions: shopping (0.25), restaurant (0.22), grocery (0.18), transportation (0.13), health & wellness (0.10), entertainment (0.07), bills (0.04)
→ predictions: restaurant (0.37), entertainment (0.25), grocery (0.14), health & wellness (0.08), shopping (0.07), transportation (0.05), bills (0.04)


In [13]:
# TODO
# 1. register model
# 2. fine tune model even more (now only have the ambiguous terms file; maybe there are more ways to fine tune it)
# 2. create a new file for bayesian updating (using dirichlet prior); likelihood will be the model predictions

In [14]:
# try updating with dirichlet prior

casaloma = model.encode(["casa loma"])
proba_casaloma = clf.predict_proba(casaloma)[0]
result = [f"{df_le.classes_[i]} ({proba_casaloma[i]:.2f})" for i in range(len(proba_casaloma))]
print('likelihood:')
print(f'{result}')

# prior_casaloma = np.array([0.1,1.1,0.1,0.1,0.1,0.1,0.1])
# prior_casaloma = np.array([0.5,1.5,0.5,0.5,0.5,0.5,0.5])
prior_casaloma = np.array([0.3,1.3,0.3,0.3,0.3,0.3,0.3])
prior_casaloma = prior_casaloma / np.sum(prior_casaloma)
print('prior:')
print(f'{np.round(prior_casaloma,3)}')

posterior_casaloma = (prior_casaloma * proba_casaloma) / np.sum(prior_casaloma * proba_casaloma)
print('posterior:')
print([f"{df_le.classes_[i]} ({posterior_casaloma[i]:.2f})" for i in range(len(posterior_casaloma))])


likelihood:
['bills (0.04)', 'entertainment (0.25)', 'grocery (0.14)', 'health & wellness (0.08)', 'restaurant (0.37)', 'shopping (0.07)', 'transportation (0.05)']
prior:
[0.097 0.419 0.097 0.097 0.097 0.097 0.097]
posterior:
['bills (0.02)', 'entertainment (0.59)', 'grocery (0.08)', 'health & wellness (0.05)', 'restaurant (0.20)', 'shopping (0.04)', 'transportation (0.02)']


In [15]:
# try the case: subway

subway = model.encode(["subway"])
proba_subway = clf.predict_proba(subway)[0]
result = [f"{df_le.classes_[i]} ({proba_subway[i]:.3f})" for i in range(len(proba_subway))]
print('likelihood:')
print(f'{result}')

prior_subway = np.array([0.1,0.1,0.1,0.1,0.1,0.1,1.1])
# prior_subway = np.array([0.5,0.5,0.5,0.5,0.5,0.5,1.5])
# prior_subway = np.array([0.3,0.3,0.3,0.3,0.3,0.3,1.3])
prior_subway = prior_subway / np.sum(prior_subway)
print('prior:') 
print(f'{np.round(prior_subway, 3)}')

posterior_subway = (prior_subway * proba_subway) / np.sum(prior_subway * proba_subway)
print('posterior:')
print([f"{df_le.classes_[i]} ({posterior_subway[i]:.3f})" for i in range(len(posterior_subway))])


# second iteration
print("")
print('second iteration')

prior_subway = np.array([0.1,0.1,0.1,0.1,1.1,0.1,1.1])
# prior_subway = np.array([0.3,0.3,0.3,0.3,1.3,0.3,1.3])
prior_subway = prior_subway / np.sum(prior_subway)
print('prior:') 
print(f'{np.round(prior_subway, 3)}')

posterior_subway = (prior_subway * proba_subway) / np.sum(prior_subway * proba_subway)
print('posterior:')
print([f"{df_le.classes_[i]} ({posterior_subway[i]:.3f})" for i in range(len(posterior_subway))])

likelihood:
['bills (0.013)', 'entertainment (0.053)', 'grocery (0.126)', 'health & wellness (0.024)', 'restaurant (0.166)', 'shopping (0.045)', 'transportation (0.573)']
prior:
[0.059 0.059 0.059 0.059 0.059 0.059 0.647]
posterior:
['bills (0.002)', 'entertainment (0.008)', 'grocery (0.019)', 'health & wellness (0.004)', 'restaurant (0.025)', 'shopping (0.007)', 'transportation (0.937)']

second iteration
prior:
[0.037 0.037 0.037 0.037 0.407 0.037 0.407]
posterior:
['bills (0.002)', 'entertainment (0.006)', 'grocery (0.015)', 'health & wellness (0.003)', 'restaurant (0.217)', 'shopping (0.005)', 'transportation (0.752)']
