Zuerst werden die benötigten Pakete importiert. Zusätzlich wird der Seed auf 0 festgelegt, um reproduzierbare Ergebnisse zu erhalten.

In [7]:
# für Deep Learning
import torch

# Für reproduzierbare Ergebnisse, den Seed festlegen
torch.manual_seed(0)

import tensorboard

# für Matrizen
import numpy as np
# Für Grafiken und Abbildungen
import matplotlib.pyplot as plt
# Zum Laden und vorverarbeiten von Dateien
from torch.utils.data import Dataset, DataLoader
#Für Tabellen
import pandas as pd

# Ignore warnings
import warnings

# Für Ladebalken
from tqdm.notebook import tqdm

# Für Transformer Modelle
from transformers import (
    BertTokenizerFast,
    BertForMaskedLM,
    pipeline,
)

warnings.filterwarnings("ignore")


In [8]:
# Install the required packages with the interpreter that runs this kernel
import sys
!{sys.executable} -m pip install sentence-transformers
!{sys.executable} -m pip install unidecode





[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from sentence_transformers import SentenceTransformer

In [9]:
# For lemmatization
from nltk.stem import WordNetLemmatizer
# For regular expressions
import re
# For a uniform character representation
import unidecode
# For text processing
import nltk
# Download the WordNet corpus (skipped by nltk when it is already up to date)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Die nächsten Schritte beinhalten das Laden der Daten und das On-the-fly-Preprocessing. Die folgende Funktion wurde aus [1] übernommen, um Mengenangaben und Sonderzeichen zu entfernen. Zusätzlich werden die Zutaten lemmatisiert.

In [25]:
lemmatizer = WordNetLemmatizer()

# Unit and filler tokens that are dropped from the ingredient text.
_UNIT_TOKENS = frozenset({'oz', 'ounc', 'ounce', 'pound', 'lb', 'inch', 'inches', 'kg', 'to'})


def preprocess(ingredients):
    """Turn a list of ingredient strings into a list of cleaned, lemmatized words.

    Steps, in order:
      1. join all ingredients into one space-separated string,
      2. delete the punctuation marks , . ! ? : ( ) " without inserting a space,
      3. transliterate to ASCII with unidecode and lowercase,
      4. replace every remaining non-letter (digits, hyphens, ...) by a space,
      5. drop words of at most two characters and the unit tokens,
      6. lemmatize the remaining words with WordNet.

    Args:
        ingredients: iterable of ingredient strings of one recipe.

    Returns:
        list[str]: the cleaned words in their original order.
    """
    text = ' '.join(ingredients)
    text = re.sub(r'[,.!?:()"]', '', text)
    # Transliterate BEFORE filtering for letters: filtering first would replace
    # every accented character by a space and cut such words into fragments.
    text = unidecode.unidecode(text).lower()
    # Everything that is not a plain letter becomes a word separator; this also
    # covers digits and hyphens, so they need no separate handling.
    text = re.sub(r'[^a-z]', ' ', text)

    words = []
    for word in text.split():
        if len(word) <= 2 or word in _UNIT_TOKENS:
            continue
        lemma = lemmatizer.lemmatize(word)
        if lemma:
            words.append(lemma)
    return words

In [53]:
# Detect whether a GPU is available in order to speed up the embedding step
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Taken from the sBERT documentation:
# https://www.sbert.net/examples/applications/computing-embeddings/README.html
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: indexable object whose first element holds the token embeddings.
        attention_mask: tensor that is expanded to the shape of the token
            embeddings; positions with value 0 do not contribute to the average.

    Returns:
        Tensor of masked sums divided by the (clamped) mask sums.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # Clamp so that a fully masked sequence does not cause a division by zero
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts

class IngredientsEmbeddedDataset(Dataset):
    """Dataset that yields one embedding and one cuisine label index per recipe.

    The recipes are read from a JSON file with the columns ``cuisine`` and
    ``ingredients``. Embeddings are computed lazily on first access and kept in
    an in-memory cache, so every recipe is encoded at most once.
    """

    # Supported ways of merging an ingredient list into a single vector.
    EMBEDDING_MODES = ('mean', 'combine')

    def __init__(self, csv_file, embedding_mode='mean', bert_model = None, transform=None):
        """
        Args:
            csv_file (string): Path to the JSON file that contains the data.
            embedding_mode (string): How the ingredient list is merged into one vector.
                                    'mean' averages the per-ingredient embeddings element-wise,
                                    'combine' joins the ingredients into one sentence.
            bert_model (string): Encoder model.
                                Either 'CookBert' (https://github.com/paschistrobel/CookBERT)
                                or None for bert-base-nli-mean-tokens.
                                Any other value falls back to bert-base-nli-mean-tokens.
            transform (callable, optional): Optional transformation applied to each sample.

        Raises:
            ValueError: if embedding_mode is not one of EMBEDDING_MODES.
        """
        if embedding_mode not in self.EMBEDDING_MODES:
            raise ValueError(
                f"unknown embedding_mode {embedding_mode!r}, expected one of {self.EMBEDDING_MODES}"
            )

        self.data_frame = pd.read_json(csv_file)

        classes = self.data_frame['cuisine'].unique()

        # Maps cuisine name -> label index (order of first appearance in the data)
        self.one_hot_dict = {item : index for index,item in enumerate(classes)}
        self.class_counts = self.data_frame.cuisine.value_counts()

        # "Balanced" class weights as a list: class_weights[i] belongs to label index i
        self.class_weights = self._balanced_class_weights(len(self.data_frame), classes, self.class_counts)

        self.cached_embeddings = [None] * len(self.data_frame)

        self.transform = transform
        self.embedding_mode = embedding_mode

        self.encoder = bert_model

        if self.encoder == 'CookBert': #https://github.com/paschistrobel/CookBERT
            self.CookBERT_tokenizer = BertTokenizerFast.from_pretrained("CookBERT-checkpoint", use_fast=True)
            # Only the bare BERT encoder is needed, not the masked-LM head
            self.CookBERT = BertForMaskedLM.from_pretrained("CookBERT-checkpoint")._modules['bert']

            self.CookBERT.to(device)#use gpu for big speedup

            self.model = pipeline(task="feature-extraction", model=self.CookBERT, tokenizer=self.CookBERT_tokenizer)
        elif self.encoder is None:
            self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        else:
            print(f"could not find requested model {bert_model}. falling back to default bert-base-nli-mean-tokens")
            self.model = SentenceTransformer('bert-base-nli-mean-tokens')
            self.encoder = None

    @staticmethod
    def _balanced_class_weights(n_samples, classes, class_counts):
        """Return n_samples / (n_classes * count) for every class, in the order of ``classes``."""
        n_classes = len(classes)
        return [float(n_samples / (n_classes * class_counts[item])) for item in classes]

    def encode_sentence(self, sentence):
        """Encode a string or a list of strings with the configured encoder.

        Returns a CPU tensor for CookBert and the result of
        SentenceTransformer.encode otherwise.
        """
        if self.encoder == 'CookBert': #https://github.com/paschistrobel/CookBERT
            # Tokenize the sentence.
            encoded_inputs = self.CookBERT_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
            self.encoded_inputs = encoded_inputs

            # The encoder is only used for feature extraction, so no gradients are needed
            with torch.no_grad():
                embedding = self.CookBERT(**encoded_inputs)

            mean_pooled_embedding = mean_pooling(embedding, encoded_inputs['attention_mask'])

            return mean_pooled_embedding.cpu().detach()
        elif self.encoder is None:
            return self.model.encode(sentence, device=device)

    def _compute_embedding(self, ingredients):
        """Merge the preprocessed ingredient words into one vector according to embedding_mode."""
        if self.embedding_mode == 'mean':
            # One embedding per word, averaged element-wise
            return self.encode_sentence(ingredients).mean(axis = 0)
        if self.embedding_mode == 'combine':
            # One sentence for the whole recipe; the trailing space keeps the
            # first ingredient separate from the word "ingredients"
            sentence = 'The kitchen uses the following ingredients ' + ' '.join(ingredients)
            return self.encode_sentence(sentence).squeeze()
        raise ValueError(
            f"unknown embedding_mode {self.embedding_mode!r}, expected one of {self.EMBEDDING_MODES}"
        )

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Return {'embedding': ..., 'target': label index} for recipe ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.data_frame.iloc[idx]

        embedding = self.cached_embeddings[idx]
        if embedding is None:
            # Cache miss: preprocess and encode once, then remember the result
            embedding = self._compute_embedding(preprocess(row.ingredients))
            self.cached_embeddings[idx] = embedding

        sample = {'embedding': embedding,'target' : self.one_hot_dict[row.cuisine]}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [54]:
experiment_index = 0 # model bert-base-nli-mean-tokens; the individual embeddings are merged by element-wise averaging
#experiment_index = 1 # model bert-base-nli-mean-tokens; the ingredients are joined into one sentence which is embedded
#experiment_index = 2 # model CookBERT; the individual embeddings are merged by element-wise averaging
#experiment_index = 3 # model CookBERT; the ingredients are joined into one sentence which is embedded

In [55]:
# Maps experiment_index -> (run name, keyword arguments for the dataset).
EXPERIMENT_CONFIGS = {
    0: ("sBERTMean", {"embedding_mode": "mean"}),
    1: ("sBERTCombine", {"embedding_mode": "combine"}),
    2: ("CookBERTMean", {"embedding_mode": "mean", "bert_model": "CookBert"}),
    3: ("CookBERTCombine", {"embedding_mode": "combine", "bert_model": "CookBert"}),
}

# Fail right here on an unknown index instead of leaving `experiment` and
# `full_training_set` undefined (or stale from an earlier run) for later cells.
if experiment_index not in EXPERIMENT_CONFIGS:
    raise ValueError(
        f"unknown experiment_index {experiment_index!r}, expected one of {sorted(EXPERIMENT_CONFIGS)}"
    )

experiment, dataset_kwargs = EXPERIMENT_CONFIGS[experiment_index]
full_training_set = IngredientsEmbeddedDataset("cleaned_common_words.json", **dataset_kwargs)

In [56]:

# Split the dataset into a training and a validation part and wrap both in
# data loaders; only the training data is shuffled.
validation_percentage = 5.0 # use 5% of the data for validation

validation_count = int(len(full_training_set) * (validation_percentage / 100))
training_count = len(full_training_set) - validation_count

# Random split into a training subset and a validation subset
train_set, val_set = torch.utils.data.random_split(
    full_training_set,
    [training_count, validation_count],
)

# The loaders deliver the samples in batches.
# Training: reshuffle the data
training_subset_loader = DataLoader(train_set, batch_size=512, shuffle=True)
# Validation: keep the order
validation_subset_loader = DataLoader(val_set, batch_size=512, shuffle=False)


In [57]:
#full_training_set.CookBERT.__dict__

In [58]:
# Show the mapping from cuisine name to label index
print(full_training_set.one_hot_dict)

# Number of samples to print before the loop stops
printed_embeddings = 1
for i, sample in enumerate(tqdm(full_training_set)):

    #print(i, sample['cuisine'],sample['ingredients'])
    print(sample['embedding'].shape, sample['target'])
    print(sample['embedding'])
    # Stop after the requested number of samples instead of walking the whole dataset
    if i == printed_embeddings -1:        
        break

{'greek': 0, 'southern_us': 1, 'filipino': 2, 'indian': 3, 'jamaican': 4, 'spanish': 5, 'italian': 6, 'mexican': 7, 'chinese': 8, 'thai': 9, 'vietnamese': 10, 'cajun_creole': 11, 'french': 12, 'japanese': 13, 'irish': 14, 'korean': 15, 'moroccan': 16, 'british': 17, 'russian': 18, 'brazilian': 19}


  0%|          | 0/36825 [00:00<?, ?it/s]

(768,) 0
[-1.92879960e-01  8.42401385e-02  1.47018790e+00 -2.31107816e-01
  2.78237134e-01  4.92624849e-01 -1.23385955e-02  6.94262564e-01
  8.91929865e-02 -3.23365897e-01 -5.68237424e-01  1.40241235e-01
  1.89152732e-01  7.07353234e-01  8.79593432e-01 -4.57285978e-02
 -7.66454041e-01 -4.66185287e-02  3.11386675e-01 -3.99705410e-01
  9.36670750e-02  3.59431326e-01 -3.25278580e-01 -9.04336512e-01
 -1.12629190e-01 -5.35671175e-01  1.48645312e-01 -1.10582328e+00
 -1.59317121e-01 -6.19924366e-02  1.85565297e-02  2.04996780e-01
  8.43308032e-01 -2.60736234e-02 -3.00469011e-01  3.31688821e-01
 -3.87700856e-01  1.11413002e-01 -5.80832437e-02  1.85916737e-01
  1.23377883e+00  2.77279913e-01  7.52449870e-01  3.74130309e-01
 -1.95533633e-01 -5.94785772e-02 -7.85126016e-02  5.96378803e-01
 -4.89116341e-01 -7.67008662e-01 -8.14348310e-02 -8.73381436e-01
  9.46973145e-01  4.09914017e-01 -5.18964112e-01  1.49229646e-01
  3.51720750e-01 -1.27169266e-01  6.42876267e-01  5.48004091e-01
  8.40419680e-02

In [59]:

import torch.nn as nn
import torch.nn.functional as F


class NetDeeper2(nn.Module):
    """Fully connected classifier with five linear layers and log-softmax output.

    Dropout is applied to the input of each of the first four linear layers,
    each of which is followed by a leaky ReLU. The output holds
    log-probabilities over the last dimension, as expected by NLLLoss.

    Args:
        in_features (int): size of the input embedding (default 768).
        num_classes (int): number of output classes (default 20).
        dropout (float): dropout probability used by all dropout layers (default 0.2).
    """

    def __init__(self, in_features=768, num_classes=20, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 1024)
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(1024, 800)
        self.dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(800, 512)
        self.dropout3 = nn.Dropout(dropout)
        self.fc4 = nn.Linear(512, 256)
        self.dropout4 = nn.Dropout(dropout)
        self.fc5 = nn.Linear(256, num_classes)
        # Normalize explicitly over the class dimension; leaving dim unset relies
        # on a deprecated implicit choice that depends on the input rank.
        self.log_soft_max = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities over the classes for the input embeddings ``x``."""
        x = self.dropout1(x)
        x = F.leaky_relu(self.fc1(x)) #leaky relu because of negative input values
        x = self.dropout2(x)
        x = F.leaky_relu(self.fc2(x))
        x = self.dropout3(x)
        x = F.leaky_relu(self.fc3(x))
        x = self.dropout4(x)
        x = F.leaky_relu(self.fc4(x))
        x = self.fc5(x)
        return self.log_soft_max(x)



# Instantiate the classifier
model = NetDeeper2()


# Print the layer overview of the model
print(model)

NetDeeper2(
  (fc1): Linear(in_features=768, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=1024, out_features=800, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=800, out_features=512, bias=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (fc4): Linear(in_features=512, out_features=256, bias=True)
  (dropout4): Dropout(p=0.2, inplace=False)
  (fc5): Linear(in_features=256, out_features=20, bias=True)
  (log_soft_max): LogSoftmax(dim=None)
)


In [60]:
# NLLLoss applies weight[i] to the class with label index i, so the balanced
# class weights n_samples / (n_classes * class_count) are built here explicitly
# in label-index order, with exactly one entry per class.
_labels_by_index = sorted(full_training_set.one_hot_dict, key=full_training_set.one_hot_dict.get)
class_weight_tensor = torch.tensor(
    [
        float(len(full_training_set) / (len(_labels_by_index) * full_training_set.class_counts[label]))
        for label in _labels_by_index
    ],
    dtype=torch.float32,
)
loss_fn = torch.nn.NLLLoss(weight=class_weight_tensor)

In [61]:
# Adam optimizer over all model parameters, using the library's default settings
optimizer = torch.optim.Adam(model.parameters())

In [62]:
def train_one_epoch(epoch_index, tb_writer, epochs, report_every=1000):
    """Train the global ``model`` for one pass over ``training_subset_loader``.

    The mean batch loss is printed and written to TensorBoard ('Loss/train')
    after every ``report_every`` batches and once more for the remaining
    batches at the end of the epoch. Without that final report an epoch with
    fewer than ``report_every`` batches would never report and return 0.

    Args:
        epoch_index (int): zero-based index of the current epoch.
        tb_writer: TensorBoard writer providing ``add_scalar``.
        epochs (int): total number of epochs, only used for the progress bar text.
        report_every (int): number of batches per reporting window.

    Returns:
        float: mean batch loss of the most recently reported window
        (0.0 if the loader yielded no batches).
    """
    running_loss = 0.
    batches_in_window = 0
    last_loss = 0.
    batches_per_epoch = len(training_subset_loader)

    def report(batch_number):
        # Average the current window, log it and start a new window
        nonlocal running_loss, batches_in_window, last_loss
        last_loss = running_loss / batches_in_window # loss per batch
        print('  batch {} loss: {}'.format(batch_number, last_loss))
        tb_x = epoch_index * batches_per_epoch + batch_number
        tb_writer.add_scalar('Loss/train', last_loss, tb_x)
        running_loss = 0.
        batches_in_window = 0

    batch_number = 0
    loop = tqdm(training_subset_loader,desc=f"Epoch [{epoch_index}/{epochs}]")
    for batch_number, data in enumerate(loop, start=1):

        # Every data instance is an input + label pair
        inputs, labels = data['embedding'],data['target']

        # Zero the gradients for every batch
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Read the scalar loss once and reuse it for statistics and progress bar
        loss_item = loss.item()
        running_loss += loss_item
        batches_in_window += 1
        if batches_in_window == report_every:
            report(batch_number)

        loop.set_postfix(loss=loss_item)

    # Report the batches that did not fill a complete window
    if batches_in_window > 0:
        report(batch_number)

    return last_loss

In [63]:
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os

# exist_ok keeps the cell re-runnable once the directory has been created
os.makedirs("Models", exist_ok=True)

# Bookkeeping for this run: timestamped TensorBoard directory and epoch counter
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/{experiment}_{timestamp}')
epoch_number = 0

EPOCHS = 100

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer, EPOCHS)

    # Set the model to evaluation mode, which disables dropout
    model.eval()

    running_vloss = 0.0
    validation_batches = 0
    correct = 0
    validation_samples = 0
    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vdata in validation_subset_loader:
            vinputs, vlabels = vdata['embedding'],vdata['target']
            voutputs = model(vinputs)
            # .item() keeps the running statistics as plain Python numbers
            running_vloss += loss_fn(voutputs, vlabels).item()
            validation_batches += 1
            correct += (voutputs.argmax(dim=1) == vlabels).sum().item()
            # Count the samples actually seen: the last batch may be smaller
            # than batch_size, so len(loader) * batch_size would over-count
            validation_samples += vlabels.size(0)

    accuracy = correct / validation_samples
    print("Accuracy = {}".format(accuracy))

    avg_vloss = running_vloss / validation_batches
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)

    #writer.add_scalars('Accuracy',
    #                { 'Training' : 0, 'Validation' : accuracy },
    #                epoch_number + 1)

    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = f'Models/{experiment}_{timestamp}_{epoch_number}'
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:


Epoch [0/100]:   0%|          | 0/69 [00:00<?, ?it/s]


KeyboardInterrupt

