### Library

In [None]:
# !pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu

In [6]:
# Warning control: install a blanket "ignore" filter so that no warning of any
# category is shown for the rest of the session.
# NOTE(review): this also hides deprecation and numerical (e.g. divide-by-zero)
# warnings raised by the libraries used below -- confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [12]:
import torch
import numpy as np
import seaborn as sns
import pandas as pd
import scipy

from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer

### Dataset

In [47]:
# Candidate table names that each question is compared against.
# NOTE(review): "film category" is written with a space while "film_actor" uses an
# underscore -- confirm whether the space is intentional.
tables = [
    "inventory",
    "rental",
    "payment",
    "staff",
    "actor",
    "film_actor",
    "language",
    "film",
    "film category",
    "category",
    "customer",
    "address",
    "city",
    "country",
    "store",
]

# Natural-language questions used as similarity queries.
sentences = [
    "Which customers have rented films starring Tom Hanks?",
    "What are the total payments made by John Doe?",
    "Which films are available in English and what are their categories?",
    "Which staff members are associated with Store 1 and what are their roles?",
    "What are the most rented film categories in New York City?",
    "Which actors have appeared in the most films?",
    "How many different languages are spoken in the films?",
    "What is the total inventory count across all stores?",
    "Which city has the highest number of film rentals?",
    "What are the popular film categories in each country?",
]

### Embedding Model

In [None]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Dedicated references to the BERT pair. The generic global name `model` is
# rebound to a different model later in this notebook; binding the function to
# these names keeps it on BERT no matter which cells have been executed.
bert_tokenizer = tokenizer
bert_model = model

def get_sentence_embedding(sentence):
    """Return a mean-pooled BERT embedding for ``sentence`` as a flat list of floats.

    The input is tokenized (with padding and truncation enabled), passed through
    the BERT encoder without gradient tracking, and the last-layer token vectors
    are averaged using the attention mask so padding positions do not contribute.

    Args:
        sentence: Text to embed, passed straight to the tokenizer.

    Returns:
        list[float]: The pooled embedding, flattened with ``Tensor.flatten()``.
        NOTE(review): if a batch of several sentences is passed, flattening
        concatenates their embeddings into one list -- intended use appears to
        be a single string.
    """
    encoded_input = bert_tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    attention_mask = encoded_input['attention_mask']   # 1 for real tokens, 0 for padding

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        output = bert_model(**encoded_input)

    token_embeddings = output.last_hidden_state
    # Broadcast the mask over the hidden dimension so it can weight every token vector.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    # Masked mean pooling: sum the unmasked token vectors and divide by their count;
    # the clamp prevents a division by zero if the mask is entirely zero.
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    sentence_embedding = summed / token_counts

    return sentence_embedding.flatten().tolist()

### Helper Function

In [18]:
def cosine_similarity_matrix(features):
    """Compute the pairwise cosine similarity between the rows of ``features``.

    Args:
        features: 2-D array-like where each row is one vector.

    Returns:
        numpy.ndarray: Square matrix whose entry ``[i, j]`` is the cosine
        similarity of row ``i`` and row ``j``, rounded to 4 decimals. A row with
        zero norm has no direction, so its similarity to every row (including
        itself) is reported as 0 instead of NaN.
    """
    vectors = np.asarray(features)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Avoid 0/0 for all-zero rows: dividing by 1 leaves them as zero vectors,
    # which then yield a similarity of 0 everywhere.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_features = vectors / safe_norms
    similarity_matrix = np.inner(normalized_features, normalized_features)
    return np.round(similarity_matrix, 4)

In [33]:
# Helper function to plot similarity matrix
def plot_similarity(labels, features, rotation):
    """Draw a heatmap of the pairwise cosine similarities of ``features``.

    Args:
        labels: Tick labels used for both axes, one per row of ``features``.
        features: Row vectors passed to ``cosine_similarity_matrix``.
        rotation: Rotation applied to the x-axis tick labels.

    Returns:
        The object returned by ``sns.heatmap``, titled
        "Semantic Textual Similarity".
    """
    similarity = cosine_similarity_matrix(features)
    sns.set_theme(font_scale=1.2)
    # Colour scale is pinned to [0, 1].
    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    heatmap = sns.heatmap(similarity, **heatmap_options)
    heatmap.set_xticklabels(labels, rotation=rotation)
    heatmap.set_title("Semantic Textual Similarity")
    return heatmap

### Similarity Function

In [48]:
def sim_two_sentences_word_embedding(s1, s2):
    """Cosine similarity of two texts using ``get_sentence_embedding``.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        The off-diagonal entry ``[0, 1]`` of the 2x2 matrix produced by
        ``cosine_similarity_matrix`` for the two embeddings.
    """
    stacked = np.vstack([get_sentence_embedding(text) for text in (s1, s2)])
    return cosine_similarity_matrix(stacked)[0, 1]


# NOTE(review): this rebinds the notebook-global name `model`; any code that looks
# up `model` at call time sees this SentenceTransformer from here on.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def sim_two_sentences_sentence_embedding(s1, s2, model=model):
    """Cosine similarity of two texts using a sentence-embedding model.

    Args:
        s1: First text.
        s2: Second text.
        model: Object exposing ``encode(text)``; defaults to the
            SentenceTransformer loaded just above (bound at definition time).

    Returns:
        The off-diagonal entry ``[0, 1]`` of the 2x2 matrix produced by
        ``cosine_similarity_matrix`` for the two embeddings.
    """
    embeddings = [list(model.encode(text)) for text in (s1, s2)]
    pairwise = cosine_similarity_matrix(np.vstack(embeddings))
    return pairwise[0, 1]

In [52]:
# Rank every table name by its similarity to one question.
sentence = sentences[8]
result = {table: sim_two_sentences_sentence_embedding(sentence, table) for table in tables}

# Build a (table, similarity) frame from the score mapping, tag each row with the
# question, and sort so the best-matching table comes first.
df = (
    pd.DataFrame(list(result.items()), columns=['table', 'similarity'])
    .assign(sentence=sentence)
    .sort_values("similarity", ascending=False)
)

# Display settings so the whole frame is shown: no row/column/width limits, and
# no truncation of long cell text (max_colwidth is what keeps every word visible).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

df

Unnamed: 0,table,similarity,sentence
1,rental,0.5648,Which city has the highest number of film rentals?
8,film category,0.4956,Which city has the highest number of film rentals?
12,city,0.4625,Which city has the highest number of film rentals?
7,film,0.4057,Which city has the highest number of film rentals?
5,film_actor,0.3808,Which city has the highest number of film rentals?
4,actor,0.2181,Which city has the highest number of film rentals?
13,country,0.2098,Which city has the highest number of film rentals?
14,store,0.2065,Which city has the highest number of film rentals?
11,address,0.1355,Which city has the highest number of film rentals?
6,language,0.1001,Which city has the highest number of film rentals?
