In [11]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import pandas as pd
from unidecode import unidecode
import re
import numpy as np

In [12]:
# Load the pretrained sentence-embedding model once; later cells reuse `model`.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [13]:
# Load the transactions CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
TRANSACTIONS_DATA_PATH = '../../data/transactions.csv'
df_transactions = pd.read_csv(TRANSACTIONS_DATA_PATH)

In [14]:
# Preview the raw description text before any cleaning is applied.
df_transactions['description']

0     From Liam J. Johnson for Deel, ref 4oJnVOMRLZf...
1     From Olivia Roland Smith for Deel, ref mhH2aFL...
2     From 杨陈 for Deel, ref 0Ckil9BX0zXMACC//1474884...
3     Transfer from Emma Brown for Deel, ref kKx5IWy...
4     From Oliver Talor for Deel, ref zMpiWFssuC1sAC...
                            ...                        
95    From Audrey Peterson for Deel, ref UWSpnVhFXGH...
96    Transfer from  奕辰 for Deel, ref 7bQYUZ1Bble5AC...
97    From Elena## BUTLET for Deel, ref oheOEVx,wfB1...
98    From Christian Griffin for Deel, ref NAJRqF8Wt...
99    From Grace Henderson for Deel, ref odjYain0Nn6...
Name: description, Length: 100, dtype: object

In [15]:
# Built once instead of on every call: maps each listed punctuation character
# to a space. '/' and the backslash are deliberately absent from the list, so
# they survive cleaning.
_PUNCT_TO_SPACE = str.maketrans(dict.fromkeys('!"#$%&\'()*+,-.:;<=>?@[]^_`{|}~', ' '))

# Raw string: '\s' inside a normal literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions.
_WHITESPACE_RUN = re.compile(r'\s\s+')


def clean_fields(s):
    """Normalize a free-text field before it is embedded.

    The text is transliterated to ASCII with ``unidecode``, case-folded,
    has the listed punctuation characters replaced by spaces, and finally
    has every run of two or more whitespace characters collapsed into a
    single space.

    Parameters
    ----------
    s : str
        The raw text to normalize.

    Returns
    -------
    str
        The normalized text. A lone whitespace character (for example a
        single tab) is left untouched, exactly as before.
    """
    s = unidecode(s).casefold()
    s = s.translate(_PUNCT_TO_SPACE)
    return _WHITESPACE_RUN.sub(' ', s)

In [16]:
# Normalize every description with clean_fields. Not all punctuation is
# stripped, because the reference codes inside the descriptions rely on some of it.
cleaned_descriptions = df_transactions['description'].map(clean_fields)
df_transactions['description'] = cleaned_descriptions

In [17]:
# Pull the cleaned descriptions and their IDs out as plain Python lists;
# they are paired back together once the embeddings have been computed.
descriptions = df_transactions['description'].tolist()
id_list = df_transactions['id'].tolist()

In [18]:
# Encode every cleaned description into an embedding vector with the loaded model.
# NOTE(review): the model repr earlier in the notebook reports
# word_embedding_dimension 384, so each vector presumably has 384 values — confirm.
embeddings = model.encode(descriptions)

384

In [19]:
# Rebuild df_transactions as an (id, embeddings) table: one row per transaction,
# pairing each ID with the embedding computed from its description.
id_embedding_pairs = list(zip(id_list, embeddings))
df_transactions = pd.DataFrame(id_embedding_pairs, columns=['id', 'embeddings'])

In [38]:
def calculate_similarity(row_embedding, query_embedding):
    """Return the cosine similarity between two embeddings as a Python scalar.

    Parameters
    ----------
    row_embedding
        Precomputed embedding of one stored description.
    query_embedding
        Embedding of the user's query.

    Returns
    -------
    The single value held by the ``cos_sim`` result, extracted with ``.item()``.
    """
    score = cos_sim(row_embedding, query_embedding)
    return score.item()

In [39]:
def search_closest_sentence(s, df_transactions):
    """Rank the precomputed transaction embeddings by similarity to a query.

    The query is passed through ``clean_fields`` before it is encoded with the
    notebook-level ``model``, then compared against every row's embedding with
    ``calculate_similarity``.

    Parameters
    ----------
    s : str
        Raw user query.
    df_transactions : pandas.DataFrame
        Frame holding an ``embeddings`` column with one precomputed embedding
        per row. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus a ``similarity`` column, sorted
        from most to least similar (missing scores last).
    """
    query_embedding = model.encode(clean_fields(s))
    similarity = df_transactions['embeddings'].apply(
        calculate_similarity, query_embedding=query_embedding
    )
    # assign() returns a new frame, so the caller's DataFrame is left untouched
    # (no hidden extra column, no SettingWithCopyWarning when a slice is passed).
    ranked = df_transactions.assign(similarity=similarity)
    return ranked.sort_values(by='similarity', ascending=False, na_position='last')

In [49]:
# Example search: rank all stored transactions against a sample query
query = "From Mia Lewis"
results = search_closest_sentence(query, df_transactions)

In [50]:
# Keep only the matches whose similarity is above the cutoff
SIMILARITY_THRESHOLD = 0.51
above_threshold = results['similarity'] > SIMILARITY_THRESHOLD
results_filtered = results.loc[above_threshold]

In [51]:
# Show at most the first ten matches that passed the filter
results_filtered.iloc[:10]

Unnamed: 0,id,embeddings,similarity
11,J7os8g9d,"[-0.17402273, 0.058560427, -0.17690979, 0.0154...",0.59568


In [56]:
# Inspect the embedding vector stored on the first (highest-ranked) filtered match
records = results_filtered.to_records()
first_match = records[0]
first_match['embeddings']

array([-1.74022734e-01,  5.85604273e-02, -1.76909789e-01,  1.54276202e-02,
        3.77101153e-02,  7.61425029e-03,  1.23299263e-01,  7.19177052e-02,
        6.91393241e-02, -1.67336259e-02,  4.16594557e-02, -3.85585167e-02,
        5.52373100e-03, -2.40408797e-02, -1.03121206e-01,  4.77273799e-02,
       -8.90637115e-02,  8.93453788e-03, -8.30889419e-02,  2.13276856e-02,
       -1.86541639e-02,  5.91308884e-02, -5.72981350e-02,  2.79016001e-03,
       -2.57867090e-02, -6.57052994e-02, -4.96601649e-02,  7.36713558e-02,
       -1.60218365e-02, -1.17007591e-01,  1.97197683e-02,  5.53456508e-02,
        1.13011652e-03,  1.43220779e-02,  8.53173360e-02,  5.39516024e-02,
       -3.00220475e-02, -7.46631157e-03, -1.12330113e-02,  2.76866481e-02,
        4.70100455e-02, -7.12068602e-02, -7.92683195e-03, -2.72459853e-02,
       -3.47832739e-02,  6.15384020e-02, -2.27282438e-02,  2.98125315e-02,
        3.54581065e-02,  8.32348503e-03, -8.85481760e-02,  6.84600770e-02,
       -2.08087787e-02,  