In [1]:
!pip install transformers
!pip install azure-storage-blob
!pip install singlestoredb



In [2]:
import torch
from transformers import BertModel, BertTokenizer
import json
from azure.storage.blob import BlobServiceClient
import singlestoredb as s2
import pandas as pd
from tqdm import tqdm
import concurrent

torch.set_printoptions(threshold=10_000)

CONTAINER = "CONTAINER"
CONNECTION_STRING = "CONNECTION_STRING"
HOST = "SINGLESTORE_HOST"
PASSWORD = "PASSWORD"
PORT = "PORT"
USER = "ADMIN"
DATABASE = "DATABASE"

In [3]:
cocktails_df = pd.read_csv('./cocktails_recipe.csv')
cocktails_df.head()

Unnamed: 0,title,glass,garnish,recipe,ingredients
0,Abacaxi Ricaço,Pineapple shell (frozen) glass,Cut a straw sized hole in the top of the pinea...,Cut the top off a small pineapple and carefull...,"[['1 whole', 'Pineapple (fresh)'], ['9 cl', 'H..."
1,Abbey,Coupe glass,Orange zest twist,SHAKE all ingredients with ice and fine strain...,"[['4.5 cl', 'Rutte Dry Gin'], ['2.25 cl', 'Lil..."
2,A.B.C. Cocktail,Nick & Nora glass,Lemon zest twist & Luxardo Maraschino cherry,TEAR mint and place in shaker. Add other ingre...,"[['7 fresh', 'Mint leaves'], ['3 cl', 'Tawny p..."
3,Absinthe Cocktail,Coupe glass,Mint leaf,SHAKE all ingredients with ice and fine strain...,"[['3 cl', 'La Fée Parisienne absinthe'], ['7.5..."
4,Absinthe Frappé,Old-fashioned glass,Mint sprig,SHAKE all ingredients with ice and fine strain...,"[['4.5 cl', 'La Fée Parisienne absinthe'], ['1..."


In [4]:
cocktails_df['text'] = cocktails_df['garnish']+cocktails_df['recipe']+cocktails_df['ingredients']
del cocktails_df['glass'], cocktails_df['garnish'], cocktails_df['ingredients'], cocktails_df['recipe']
cocktails_df.head()

Unnamed: 0,title,text
0,Abacaxi Ricaço,Cut a straw sized hole in the top of the pinea...
1,Abbey,Orange zest twistSHAKE all ingredients with ic...
2,A.B.C. Cocktail,Lemon zest twist & Luxardo Maraschino cherryTE...
3,Absinthe Cocktail,Mint leafSHAKE all ingredients with ice and fi...
4,Absinthe Frappé,Mint sprigSHAKE all ingredients with ice and f...


In [5]:
def initialize_bert(model_name='bert-base-uncased'):
    """
    Initialize a BERT model and tokenizer.

    Args:
        model_name (str): Name of the BERT model (default: 'bert-base-uncased').

    Returns:
        tuple: A tuple containing the BERT model and tokenizer.
    """
    # Initialize BERT model and tokenizer
    model = BertModel.from_pretrained(model_name, output_hidden_states=True)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    return model, tokenizer

def extract_cocktail_embedding(model, tokenizer, cocktails):
    """
    Extracts BERT-based embeddings for cocktail recipe text using multithreading.

    Args:
        model (BertModel): The BERT model.
        tokenizer (BertTokenizer): The BERT tokenizer.
        cocktails (pd.DataFrame): A DataFrame containing cocktail recipes.

    Returns:
        list: List of extracted cocktail embeddings.
    """

    data_list = list()
    def process_cocktail(cocktail):
        # Extract BERT embeddings for a single cocktail recipe
        text = str(cocktail["text"])
        text = text.replace("\n", "").replace("[", "").replace("]", ",").replace("\'", "")
        marked_text = "[CLS]" + text + "[SEP]"
        tokenize = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenize)
        segments_ids = [1] * len(tokenize)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segments_ids])

        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensor)
        bert_hidden_states = outputs[2]
        token_vecs = bert_hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
        embedding = sentence_embedding.numpy().tolist()

        data = {'name': cocktail['title'], 'text': text, 'vector': embedding}
        return data

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Use concurrent futures for multithreading
        futures = [executor.submit(process_cocktail, cocktail) for _, cocktail in cocktails.iterrows()]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            data_list.append(future.result())

    return data_list

def upload_vectors_to_azure(data_list, name="CocktailsVector.json"):
    """
    Uploads cocktail vectors to Azure Blob Storage.

    Args:
        data_list (list): List of cocktail data with embeddings.
        container_name (str): Name of the Azure Blob container for storage.
        connection_string (str): Connection string for Azure Blob Storage.

    Returns:
        list: List of uploaded cocktail data.
    """
    # Create a service client and container client for Azure Blob Storage
    blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    container_client = blob_service_client.get_container_client(CONTAINER)

    # Upload data to Azure Blob Storage
    for i, data_dict in tqdm(enumerate(data_list), total=len(data_list)):
        blob_name = name + f"{i}.json"
        data_json = json.dumps(data_dict)
        blob_client = container_client.get_blob_client(blob_name)
        blob_client.upload_blob(data_json, overwrite=True)

    print(f"Uploaded all vectors to Azure Blob Storage")

    return data_list

def retrieve_top_cocktail_matches(name, embedding):
    """
    Retrieves the top cocktail matches based on a given embedding using SingleStore.

    Args:
        name (str): Name of the cocktail.
        embedding (list): BERT-based embedding to compare against stored cocktail embeddings.

    Returns:
        list: List of top cocktail matches.
    """
    # Establish a connection to SingleStore database

    conn = s2.connect(host=HOST,
                      port=PORT, user=USER, password=PASSWORD, database=DATABASE)

    # Select top cocktail matches based on dot product similarity
    conn.autocommit(True)
    with conn.cursor() as cur:
        query = f'SELECT DISTINCT name, text, dot_product(vector, JSON_ARRAY_PACK("'+str(embedding)+'")) as score FROM CocktailVectors WHERE name != "'+str(name)+'" ORDER BY score DESC limit 10;'
        cur.execute(query)
        top_matches = cur.fetchall()
        return top_matches


In [6]:
model, tokenizer = initialize_bert()

In [8]:
vectors = extract_cocktail_embedding(model, tokenizer, cocktails_df)
print("Generated Vector for "+str(len(vectors))+" data points.")
data = upload_vectors_to_azure(vectors)

Generated Vector for 6956 data points.


100%|██████████| 6956/6956 [08:56<00:00, 12.95it/s]

Uploaded all vectors to Azure Blob Storage





In [9]:
result_df = cocktails_df[cocktails_df['title'].str.contains('Blue Margarita', case=False, na=False)]
result_df

Unnamed: 0,title,text
247,Blue Margarita,Lime sliceBLEND all ingredients with 6oz scoop...


In [10]:
embedding = extract_cocktail_embedding(model, tokenizer, result_df)
result = retrieve_top_cocktail_matches(result_df['title'].values[0], embedding[0]['vector'])

100%|██████████| 1/1 [00:00<00:00,  4.57it/s]


In [11]:
pretty_json = json.dumps(result, indent=4)
print(pretty_json)


[
    [
        "The Med",
        "Basil sprigMUDDLE basil in base of shaker. Add next 3 ingredients, SHAKE with ice and fine strain into ice-filled glass. TOP with soda.7 fresh, Basil leaves,, 5 cl, Skinos Mastiha,, 2 cl, Lemon juice (freshly squeezed),, 0.5 cl, Sugar syrup (65.0\u00b0brix, 2 sugar to 1 water rich syrup),, Top up with, Thomas Henry Soda Water,,",
        209.283935546875
    ],
    [
        "Summer Orchard",
        "Juniper berriesMUDDLE apple in base of shaker. Add other ingredients, SHAKE with ice and fine stain into chilled glass. TOP with sparking wine.4 slice, Red apple,, 4.5 cl, Caorunn Gin,, 1 cl, Red wine vinegar,, 1 cl, Watermelon sugar syrup,, Top up with, Brut Champagne,,",
        208.2766876220703
    ],
    [
        "Whiskey smash",
        "Mint leavesPut ingredients in glass and muddle.Fill glass with ice6 unit, Mint leaves,, 4 wedge, Lemon (fresh),, 1.5 cl, Sugar syrup (65.0\u00b0brix, 2 sugar to 1 water rich syrup),, 6 cl, Bourbon whiskey,,",
   