In [2]:
import pandas as pd 
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report







In [3]:
# Load the labelled data; only the 'name' and 'category' columns are used below.
df = pd.read_csv("data/name_price.csv", header=0)

# A row missing either field would hand NaN (a float) to the sentence encoder
# or mix floats into the label set, so such rows are excluded here. `df` itself
# is left untouched so the raw frame stays available for inspection.
labelled_rows = df.dropna(subset=["name", "category"])

# astype(str) guarantees the encoder receives strings even if pandas parsed a
# purely numeric name/category as a number.
texts = labelled_rows["name"].astype(str).tolist()
labels = labelled_rows["category"].astype(str).tolist()

In [4]:
# Build SBERT sentence embeddings for every entry in `texts`.
# The checkpoint below was chosen by the original author as fast and accurate;
# other pretrained options: https://www.sbert.net/docs/pretrained_models.html
SBERT_CHECKPOINT = 'all-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_CHECKPOINT)
X_embeddings = model.encode(texts)



In [17]:
# Turn the category strings into integer ids. The fitted encoder `le` is kept
# so ids can be translated back into category names later on.
le = LabelEncoder().fit(labels)
y = le.transform(labels)

# Show the category names in id order (index i is the name for id i).
print(le.classes_)

['bills' 'entertainment' 'grocery' 'health & wellness' 'restaurant'
 'shopping' 'transportation']


In [6]:
# Hold out 20% of the samples for evaluation, stratified on the label so the
# split keeps the class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y, stratify=y, test_size=0.2, random_state=42
)

# Train the classifier.
# n_jobs is intentionally not set: with the default lbfgs solver a multiclass
# problem is fitted as one multinomial model, so there are no per-class fits to
# parallelise, and requesting worker processes after the tokenizer has already
# run appears to be what triggers the huggingface/tokenizers "process just got
# forked" warnings.
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)

# Predict and evaluate.
# Passing `labels` explicitly keeps the rows aligned with `target_names` even
# if some category happens to be absent from the test split; without it
# classification_report raises when the counts differ.
y_pred = clf.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    )
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

                   precision    recall  f1-score   support

            bills       1.00      1.00      1.00         3
    entertainment       0.89      0.89      0.89        18
          grocery       0.50      0.17      0.25         6
health & wellness       0.67      0.80      0.73         5
       restaurant       0.87      0.90      0.88        29
         shopping       0.78      0.88      0.82         8
   transportation       0.80      0.89      0.84         9

         accuracy                           0.83        78
        macro avg       0.79      0.79      0.77        78
     weighted avg       0.82      0.83      0.82        78



In [7]:
# Classify a single unseen string: embed it, predict the class id, then
# translate that id back into its category name.
new_text = ["something"]
new_embedding = model.encode(new_text)
predicted_ids = clf.predict(new_embedding)
predicted_class = le.inverse_transform(predicted_ids)
print(predicted_class[0])


shopping


In [14]:
# Soft prediction: commit to one category only when it clearly beats the runner-up.
def soft_pred(name, margin=0.2, encoder=None, classifier=None, label_encoder=None):
    """Predict a category for ``name``, flagging the result when it is a close call.

    Args:
        name: The text to classify.
        margin: Minimum gap between the best and second-best class
            probabilities required to report a single category.
        encoder: Object with ``encode(list_of_str)``; defaults to the
            notebook-level ``model``.
        classifier: Fitted classifier exposing ``predict_proba`` and
            ``classes_``; defaults to the notebook-level ``clf``.
        label_encoder: Fitted encoder exposing ``inverse_transform``;
            defaults to the notebook-level ``le``.

    Returns:
        A one-line string: either the predicted category with its probability,
        or the two leading candidates when their gap is below ``margin``.
    """
    # Fall back to the notebook globals so existing calls keep working.
    encoder = model if encoder is None else encoder
    classifier = clf if classifier is None else classifier
    label_encoder = le if label_encoder is None else label_encoder

    embedding = encoder.encode([name])
    proba = classifier.predict_proba(embedding)[0]

    # Column positions of the two largest probabilities, best first.
    top2_idx = np.argsort(proba)[-2:][::-1]
    p1, p2 = proba[top2_idx[0]], proba[top2_idx[1]]

    # predict_proba columns follow classifier.classes_, not the label encoder's
    # ordering. Going through classes_ keeps names correct even when the
    # classifier was trained on only a subset of the encoded categories.
    c1, c2 = label_encoder.inverse_transform(classifier.classes_[top2_idx])

    if p1 - p2 >= margin:
        return f"→ Predicted category: {c1} ({p1:.2f})"
    else:
        return f"→ Ambiguous. Top 2: {c1} ({p1:.2f}), {c2} ({p2:.2f})"


In [15]:

# Try the soft predictor on a few sample names, one result line per name.
for sample_name in ("Shell", "7 Eleven", "Subway", "metro"):
    print(soft_pred(sample_name))


→ Ambiguous. Top 2: restaurant (0.23), shopping (0.23)
→ Predicted category: grocery (0.50)
→ Predicted category: transportation (0.57)
→ Ambiguous. Top 2: transportation (0.44), grocery (0.30)


In [16]:
def print_top_5_probs(name, k=5, encoder=None, classifier=None, label_encoder=None):
    """Return a one-line summary of the ``k`` most probable categories for ``name``.

    Despite its name, this function returns the summary string rather than
    printing it.

    Args:
        name: The text to classify.
        k: How many categories to list (default 5). If the classifier has
            fewer classes than ``k``, all of them are listed.
        encoder: Object with ``encode(list_of_str)``; defaults to the
            notebook-level ``model``.
        classifier: Fitted classifier exposing ``predict_proba`` and
            ``classes_``; defaults to the notebook-level ``clf``.
        label_encoder: Fitted encoder exposing ``inverse_transform``;
            defaults to the notebook-level ``le``.

    Returns:
        A string listing the categories with their probabilities, most
        probable first.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Fall back to the notebook globals so existing calls keep working.
    encoder = model if encoder is None else encoder
    classifier = clf if classifier is None else classifier
    label_encoder = le if label_encoder is None else label_encoder

    embedding = encoder.encode([name])
    proba = classifier.predict_proba(embedding)[0]

    # Column positions sorted by descending probability, truncated to k.
    top_idx = np.argsort(proba)[::-1][:k]

    # predict_proba columns follow classifier.classes_, so translate positions
    # to encoded ids first and only then to category names.
    top_labels = label_encoder.inverse_transform(classifier.classes_[top_idx])

    result = [f"{label} ({proba[i]:.2f})" for label, i in zip(top_labels, top_idx)]
    return f"→ Top {k} predictions: " + ", ".join(result)

# Show the ranked category probabilities for one sample name.
subway_ranking = print_top_5_probs("Subway")
print(subway_ranking)

→ Top 5 predictions: transportation (0.57), restaurant (0.17), grocery (0.11), entertainment (0.06), shopping (0.05)


In [None]:
# TODO: fine-tune (not implemented yet)
