# FeedForward Neural Network

## Imports

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import tensorflow as tf
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from typing import List, Tuple, Dict
import nltk
import os
import optuna
import sys
import pickle
import random
import ast
import math

print(tf.config.list_physical_devices('GPU'))

sys.path.append(os.path.abspath("/kaggle/input/recipebatchfeedforwardgenerator/tensorflow2/default/1"))
sys.path.append(os.path.abspath("/kaggle/input/optimizedffnnwithcustomloss/keras/default/1/optimized_feedforward_nn.py"))

#from feedforward_nn import BatchFeedForwardNN

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


## ⭐ Base Preprocessing

In [48]:
data = pd.read_csv("/kaggle/input/preprocessed-recipe/preprocessed_recipe.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,id,name,minutes,n_steps,description,n_ingredients,steps_string_standardize,ingredients_text,tags_text,cuisine,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,0,137739,arriba baked squash mexican,55,11,autumn is my favorite time of year to cook! th...,7,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",North America – United States,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,1,31490,breakfast pizza,30,9,this recipe calls for the crust to be prebaked...,6,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",North America – United States,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,4,44061,amish tomato ketchup canning,190,5,my dh's amish mother raised him on this recipe...,8,"mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",North America – United States,352.9,1.0,337.0,23.0,3.0,0.0,28.0
3,5,25274,marinated olive,15,4,my italian mil was thoroughly impressed by my ...,9,toast the fennel seed and lightli crush them p...,"['fennel seeds', 'green olives', 'ripe olives'...","['15-minutes-or-less', 'time-to-make', 'course...",North America – United States,380.7,53.0,7.0,24.0,6.0,24.0,6.0
4,6,67888,barbecued rib,120,10,this recipe is posted by request and was origi...,22,in a medium saucepan combin all the ingredi fo...,"['pork spareribs', 'soy sauce', 'fresh garlic'...","['weeknight', 'time-to-make', 'course', 'main-...",North America – United States,1109.5,83.0,378.0,275.0,96.0,86.0,36.0


## Simple FFNN

This is a simple feed forward neural network that learns to predict the next token using a fixed context

#### Training Process (context of 3)

Input: `<s> ing1 ing2 <STEPS> w1 w2 </s>`

| Sample | Input (`x`)                 | Target token (`y`) |
|------|-----------------------------|--------------|
| x1   | `<s> ing1 ing2`             | `<STEPS>`    |
| x2   | `ing1 ing2 <STEPS>`         | `w1`         |
| x3   | `ing2 <STEPS> w1`           | `w2`         |
| x4   | `<STEPS> w1 w2`             | `</s>`       |

Unknown words are encoded as `<UNKNOWN>`

#### Model Architecture

- One Hot encoding (`vocab_size`)
- Embdedding layer (`vocab_size`, `embedding_dim`)
- Dense layer (`embedding_dim`, `hidden_dim`)
- Dense layer (`hidden_dim`, `vocab_size`)
- Softmax layer (`vocab_size`)

#### Preprocessing

In [49]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from typing import List, Dict, Set, Tuple
import string
import pickle
import warnings
import ast
from collections import Counter, defaultdict
from tensorflow.keras.preprocessing import text

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

class KerasTokenizerGeneratorPreprocessing:
    def __init__(self,
                 ingredients_col='ingredients_text',
                 steps_col='steps_string_standardize',
                 drop_uncommon=None,
                 max_num_words=10000):
        self.ingredients_col = ingredients_col
        self.steps_col = steps_col
        self.stemmer = LancasterStemmer()
        self.drop_uncommon = drop_uncommon
        self.max_num_words = max_num_words
        self.tokenizer = text.Tokenizer(num_words=self.max_num_words)

    def _extract_ingredients(self, row) -> List[str]:
        if pd.isna(row[self.ingredients_col]):
            return []

        ingredients = ast.literal_eval(row[self.ingredients_col])
        return [self.stemmer.stem(ing) for ing in ingredients if ing]

    def _extract_steps(self, row) -> List[str]:
        if pd.isna(row[self.steps_col]):
            return []
    
        steps = row[self.steps_col].split(',')
        steps = [step.strip().lower() for step in steps]
    
        cleaned_stems = []
        for step in steps:
            cleaned = re.sub(r'[^a-zA-Z0-9 .]', '', step)
    
            if not cleaned or cleaned.isdigit():
                continue
    
            stemmed = self.stemmer.stem(cleaned)
            cleaned_stems.append(stemmed)
    
        return cleaned_stems
    
    def _collect_corpus(self, X: pd.DataFrame) -> List[str]:
        corpus = []
        for _, row in X.iterrows():
            ingredients = self._extract_ingredients(row)
            steps = self._extract_steps(row)
            
            if ingredients:
                corpus.extend(ingredients)
            if steps:
                corpus.extend(steps)
        return corpus

    def tokenize(self, text: str) -> List[str]:
        return text.split()
    
    def tokenize_list(self, text_list: List[str]) -> List[List[str]]:
        flattened = []
        for text in text_list:
            flattened.extend(self.tokenize(text))
        return flattened

    def fit(self, X: pd.DataFrame, y=None):
        missing_cols = [col for col in [self.ingredients_col, self.steps_col] if col not in X.columns]
        if missing_cols:
            raise ValueError(f"Columns {missing_cols} not found in the DataFrame")
        
        # Collect all text to fit the tokenizer
        all_steps = []
        for _, row in X.iterrows():
            steps = self._extract_steps(row)
            all_steps.extend(steps)
        
        # Fit the tokenizer on the corpus
        self.tokenizer.fit_on_texts(all_steps)
        return self

    def transform(self, X: pd.DataFrame) -> Tuple[List[List[str]], List[List[str]]]:
        X = X.dropna(subset=[self.ingredients_col, self.steps_col])
        ingredients_lists = []
        steps_lists = []
        ingredients_counter = defaultdict(int)

        for _, row in X.iterrows():
            ingredients = self._extract_ingredients(row)
            steps = self._extract_steps(row)
            
            for ing in ingredients:
                ingredients_counter[ing] += 1
                
            ingredients_lists.append(ingredients)
            steps_lists.append(steps)

        if self.drop_uncommon is not None:
            uncommon_ingredients = {
                ing for ing, count in ingredients_counter.items()
                if count < self.drop_uncommon
            }
    
            filtered_ingredients_lists = []
            filtered_steps_lists = []
            for ingredients, steps in zip(ingredients_lists, steps_lists):
                if not any(ing in uncommon_ingredients for ing in ingredients):
                    filtered_ingredients_lists.append(ingredients)
                    filtered_steps_lists.append(steps)
    
            ingredients_lists = filtered_ingredients_lists
            steps_lists = filtered_steps_lists

        tokenized_steps_lists = []
        for steps in steps_lists:
            all_words = []
            for step in steps:
                all_words.extend(step.split())
            tokenized_steps_lists.append(all_words)

        return ingredients_lists, tokenized_steps_lists

    def fit_transform(self, X: pd.DataFrame, y=None) -> Tuple[List[List[str]], List[List[str]]]:
        return self.fit(X).transform(X)


In [50]:
keras_generator_preprocessing = KerasTokenizerGeneratorPreprocessing()
ingredients, steps = keras_generator_preprocessing.fit_transform(data)

In [7]:
ingredients[0], steps[0]

(['winter squash',
  'mexican seasoning',
  'mixed spice',
  'honey',
  'but',
  'olive oil',
  'salt'],
 ['make',
  'a',
  'choic',
  'and',
  'proceed',
  'with',
  'recip',
  'depend',
  'on',
  'size',
  'of',
  'squash',
  'cut',
  'into',
  'half',
  'or',
  'fourth',
  'remov',
  'seed',
  'for',
  'spici',
  'squash',
  'drizzl',
  'oliv',
  'oil',
  'or',
  'melt',
  'butter',
  'over',
  'each',
  'cut',
  'squash',
  'piec',
  'season',
  'with',
  'mexican',
  'season',
  'mix',
  'ii',
  'for',
  'sweet',
  'squash',
  'drizzl',
  'melt',
  'honey',
  'but',
  'grate',
  'piloncillo',
  'over',
  'each',
  'cut',
  'squash',
  'piec',
  'season',
  'with',
  'sweet',
  'mexican',
  'spice',
  'mix',
  'bake',
  'at',
  '176.67',
  'celsius',
  'again',
  'depend',
  'on',
  'size',
  'for',
  '40.0',
  'minute',
  'up',
  'to',
  'an',
  'ho',
  'until',
  'a',
  'fork',
  'can',
  'easili',
  'pierc',
  'the',
  'skin',
  'be',
  'care',
  'not',
  'to',
  'burn',
  'the'

## Optimized FNN

#### Training

In [51]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from typing import List, Tuple, Dict
from collections import Counter, defaultdict
import time

class OptimizedBatchFeedForwardNN:
    def __init__(self):
        self.model = None
        self.vocab = None
        self.token_to_id = None
        self.id_to_token = None
        self.context = None
        self.batch_size = 32

    def _build_vocabulary(self, ingredients: List[List[str]], steps: List[List[str]]) -> None:
        print("Building vocabulary...")
        all_tokens = [token for lst in ingredients + steps for token in lst] + ['<UNKNOWN>', '<s>', '</s>', "<STEPS>"]
        counter = Counter(all_tokens)
        self.vocab = [token for token, _ in sorted(counter.items(), key=lambda x: (-x[1], x[0]))]
        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}

    def _prepare_train(self, ingredients: List[List[str]], steps: List[List[str]]) -> Tuple[np.ndarray, np.ndarray]:        
        if self.vocab is None:
            self._build_vocabulary(ingredients, steps)

        context_targets = defaultdict(Counter)
        
        for i in range(len(ingredients)):
            ingredient_ids = [self.token_to_id["<s>"]] + \
                             [self.token_to_id.get(token, self.token_to_id['<UNKNOWN>']) for token in ingredients[i]] + \
                             [self.token_to_id['<STEPS>']]
            step_ids = [self.token_to_id.get(step_token, self.token_to_id['<UNKNOWN>']) for step_token in steps[i]] + \
                       [self.token_to_id["</s>"]]

            tokens_ids = ingredient_ids + step_ids

            for k in range(len(tokens_ids) - self.context):
                context_window = tuple(tokens_ids[k:k+self.context])
                context_targets[context_window][tokens_ids[k+self.context]] += 1

        print(f"Found {len(context_targets)} unique context-grams")

        X_data = []
        y_data = []
        sample_weights = []

        print("Building sample weights using context gram...")
        for context_gram, target_counts in context_targets.items():
            total_count = sum(target_counts.values())
            for target_id, count in target_counts.items():
                X_data.append(list(context_gram))
                y_data.append(target_id)
                sample_weights.append(count / total_count)

        return np.array(X_data, dtype=np.int32), np.array(y_data, dtype=np.int32), np.array(sample_weights, dtype=np.float32)
        
    def fit(self, ingredients: List[List[str]], steps: List[List[str]],
            embedding_dim=256, hidden_dim=512, context=3,
            epochs=10, batch_size=32, validation_split=0.1, dropout=None,
            learning_rate=1e-2, custom_loss=None
    ):
        if len(ingredients) != len(steps):
            raise ValueError(f"dimension mismatch {len(ingredients)} vs {len(steps)}")

        start_time = time.time()
        self.batch_size = batch_size
        self.context = context

        print("Preparing unique context-grams for training...")
        X_train, y_train, sample_weights = self._prepare_train(ingredients, steps)

        vocab_size = len(self.vocab)
        print(f"Vocab size: {vocab_size}")
        print(f"Unique training samples: {len(X_train)}")
        print(f"Data preparation took {time.time() - start_time:.2f} seconds")
        for i in range(10):
            print(f"Sample {i}: {X_train[i]} - {y_train[i]}")

        print("Building model...")
        self.model = keras.Sequential()
        self.model.add(keras.layers.Input(shape=(context,), dtype=tf.int32))
        self.model.add(keras.layers.Embedding(vocab_size * context, embedding_dim, name="embedding"))
        self.model.add(keras.layers.Flatten())
        self.model.add(keras.layers.BatchNormalization())
        self.model.add(keras.layers.Dense(hidden_dim, activation='relu', name="hidden"))

        if dropout is not None:
            self.model.add(keras.layers.Dropout(dropout))
        self.model.add(keras.layers.BatchNormalization())
        self.model.add(keras.layers.Dense(vocab_size, activation='softmax', name="output"))

        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss=(custom_loss if custom_loss != None else tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

        self.model.build(input_shape=(None, context))

        print(self.model.summary())
        
        train_dataset = tf.data.Dataset.from_tensor_slices(
            (X_train, y_train, sample_weights)
        ).shuffle(buffer_size=10000)
        
        val_size = int(len(X_train) * validation_split)
        train_size = len(X_train) - val_size
        
        train_ds = train_dataset.take(train_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
        val_ds = train_dataset.skip(train_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)

        start_train_time = time.time()
        history = self.model.fit(
            train_ds,
            epochs=epochs,
            validation_data=val_ds,
            callbacks= [
                tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
                tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=2)
            ]
        )

        print(f"Training took {time.time() - start_train_time:.2f} seconds")
        print(f"Total process took {time.time() - start_time:.2f} seconds")

        return self

    def _prepare_test(self, ingredients: List[List[str]]):
        if self.vocab is None:
            raise ValueError("Vocabulary not built. Fit the model first!")
        
        all_contexts = []
        for ingredient_list in ingredients:
            ingredient_ids = [self.token_to_id.get(token, self.token_to_id['<UNKNOWN>']) 
                              for token in ingredient_list] + [self.token_to_id['<STEPS>']]
            
            if len(ingredient_ids) >= self.context - 1:
                context_tokens = ingredient_ids[-(self.context - 1):]
            else:
                padding = [self.token_to_id['<UNKNOWN>']] * (self.context - 1 - len(ingredient_ids))
                context_tokens = padding + ingredient_ids
            
            all_contexts.append(context_tokens)
        
        return np.array(all_contexts)

    def predict(self, ingredients: List[List[str]], max_steps=20) -> List[List[str]]:
        if self.model is None:
            raise ValueError("Model not fitted!")
    
        all_contexts = self._prepare_test(ingredients)
    
        num_ingredients = len(ingredients)
        current_tokens = [['<s>'] for _ in range(num_ingredients)]
        completed = [False] * num_ingredients
        
        for step in range(max_steps):
            if all(completed):
                break
            
            active_indices = [i for i, is_complete in enumerate(completed) if not is_complete]
            
            # Process in batches of self.batch_size
            for batch_start in range(0, len(active_indices), self.batch_size):
                batch_indices = active_indices[batch_start:batch_start + self.batch_size]
                
                batch_contexts = []
                for idx in batch_indices:
                    if len(current_tokens[idx]) == 1:
                        context = all_contexts[idx]
                    else:
                        recent_tokens = [self.token_to_id.get(token, self.token_to_id['<UNKNOWN>']) 
                                         for token in current_tokens[idx][1:]]
                        context_size = self.context - 1
                        
                        if len(recent_tokens) >= context_size:
                            context = recent_tokens[-context_size:]
                        else:
                            orig_context_needed = context_size - len(recent_tokens)
                            context = list(all_contexts[idx][-orig_context_needed:]) + recent_tokens
                    
                    batch_contexts.append(context)
                
                batch_input = np.array(batch_contexts)
                
                predictions = self.model.predict_on_batch(batch_input)
                
                for i, pred_idx in enumerate(batch_indices):
                    next_token_id = np.argmax(predictions[i])
                    next_token = self.id_to_token[next_token_id]
                    
                    current_tokens[pred_idx].append(next_token)
                    
                    if next_token == '</s>':
                        completed[pred_idx] = True
        
        all_predictions = []
        for tokens in current_tokens:
            if tokens[0] == '<s>':
                tokens = tokens[1:]
            
            if tokens and tokens[-1] == '</s>':
                tokens = tokens[:-1]
                
            all_predictions.append(tokens)
        
        return all_predictions

    def save(self, filepath: str):
        if self.model is None:
            raise ValueError("No model to save!")

        self.model.save(filepath)

        np.savez(f"{filepath}_vocab.npz", 
                 vocab=np.array(self.vocab, dtype=object),
                 context=np.array([self.context]),
                 batch_size=np.array([self.batch_size]),
        )

    def load(self, filepath: str):
        self.model = keras.models.load_model(filepath)

        data = np.load(f"{filepath}_vocab.npz", allow_pickle=True)

        self.vocab = data['vocab'].tolist()
        self.batch_size = int(data['batch_size'][0])
        self.context = int(data['context'][0])
        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}
        return self

In [54]:
model = OptimizedBatchFeedForwardNN()
model.fit(ingredients, steps, 
          embedding_dim=512, hidden_dim=256, 
          context=8, epochs=5, dropout=0.2, 
          batch_size=2048, learning_rate=1e-1,
)
model.save("/kaggle/working/kerastok_optimized_ffnn_512_embedding_256_hidden_8_context_1e-1_lr.keras")

Preparing unique context-grams for training...
Building vocabulary...
Found 8419399 unique context-grams
Building sample weights using context gram...
Vocab size: 39734
Unique training samples: 8570864
Data preparation took 45.26 seconds
Sample 0: [22750  6184  8942  3087   234    51   119    15] - 22748
Sample 1: [ 6184  8942  3087   234    51   119    15 22748] - 91
Sample 2: [ 8942  3087   234    51   119    15 22748    91] - 2
Sample 3: [ 3087   234    51   119    15 22748    91     2] - 914
Sample 4: [  234    51   119    15 22748    91     2   914] - 1
Sample 5: [   51   119    15 22748    91     2   914     1] - 1970
Sample 6: [  119    15 22748    91     2   914     1  1970] - 5
Sample 7: [   15 22748    91     2   914     1  1970     5] - 550
Sample 8: [22748    91     2   914     1  1970     5   550] - 696
Sample 9: [  91    2  914    1 1970    5  550  696] - 11
Building model...


None
Epoch 1/5




[1m3767/3767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 98ms/step - loss: 4.5940 - sparse_categorical_accuracy: 0.2416 - val_loss: 5.0003 - val_sparse_categorical_accuracy: 0.2230 - learning_rate: 0.1000
Epoch 2/5
[1m3767/3767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 99ms/step - loss: 4.5697 - sparse_categorical_accuracy: 0.2203 - val_loss: 4.2193 - val_sparse_categorical_accuracy: 0.2669 - learning_rate: 0.1000
Epoch 3/5
[1m3767/3767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 98ms/step - loss: 6.1209 - sparse_categorical_accuracy: 0.2319 - val_loss: 6.4841 - val_sparse_categorical_accuracy: 0.1543 - learning_rate: 0.1000
Epoch 4/5
[1m3767/3767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 99ms/step - loss: 5.0623 - sparse_categorical_accuracy: 0.2402 - val_loss: 6.5508 - val_sparse_categorical_accuracy: 0.1571 - learning_rate: 0.1000
Epoch 5/5
[1m3767/3767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 98ms/step - loss: 

## BPE FFNN

In [36]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from typing import List, Dict, Set, Tuple
import string
import pickle
import warnings
import ast
from collections import Counter, defaultdict
from transformers import AutoTokenizer

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

class BPEGeneratorPreprocessing:
    def __init__(self,
                 ingredients_col='ingredients_text',
                 steps_col='steps_string_standardize',
                 drop_uncommon=None,
                 vocab_size=256):
        self.ingredients_col = ingredients_col
        self.steps_col = steps_col
        self.stemmer = LancasterStemmer()
        self.drop_uncommon = drop_uncommon
        self.vocab_size = vocab_size
        
        self.vocab = []
        self.merges = {}
        self.word_freqs = defaultdict(int)
        self.splits = {}

    def get_savepath(self, prefix: str):
        return f"{prefix}bpe_preprocessing_{self.vocab_size}_vocab_size";

    def _extract_ingredients(self, row) -> List[str]:
        if pd.isna(row[self.ingredients_col]):
            print("  - No ingredients found (NaN value)")
            return []

        ingredients = ast.literal_eval(row[self.ingredients_col])
        stemmed = [self.stemmer.stem(ing) for ing in ingredients if ing]
        return stemmed

    def _extract_steps(self, row) -> List[str]:
        if pd.isna(row[self.steps_col]):
            print("  - No steps found (NaN value)")
            return []
    
        steps = row[self.steps_col].split(',')
        steps = [step.strip().lower() for step in steps]
    
        cleaned_stems = []
        for step in steps:
            cleaned = re.sub(r'[^a-zA-Z0-9 .]', '', step)
    
            if not cleaned or cleaned.isdigit():
                continue
    
            stemmed = self.stemmer.stem(cleaned)
            cleaned_stems.append(stemmed)
    
        return cleaned_stems
    
    def _collect_corpus(self, X: pd.DataFrame) -> List[str]:
        print(f"\nCollecting corpus from {len(X)} rows...")
        corpus = []
        for idx, row in X.iterrows():
            if idx % 100 == 0 and idx > 0:
                print(f"  - Processed {idx} rows, corpus size: {len(corpus)}")
            
            ingredients = self._extract_ingredients(row)
            steps = self._extract_steps(row)
            
            if ingredients:
                corpus.extend(ingredients)
            if steps:
                corpus.extend(steps)
                
        print(f"Final corpus size: {len(corpus)} tokens")
        print(f"Sample corpus entries: {corpus[:5]}")
        return corpus

    def _train_bpe(self, corpus: List[str]):
        print("\nTraining BPE tokenizer...")
        print("Building initial word frequencies...")
        for text in corpus:
            self.word_freqs[text] += 1
        
        print(f"Unique words in corpus: {len(self.word_freqs)}")
        print(f"Top 5 most common words: {Counter(self.word_freqs).most_common(5)}")
        
        print("\nBuilding initial alphabet...")
        alphabet = set()
        for word in self.word_freqs.keys():
            for letter in word:
                alphabet.add(letter)
        
        print(f"Alphabet size: {len(alphabet)}")
        
        self.vocab = ["<s>", "</s>", "<RECIPE>"] + sorted(list(alphabet))
        print(f"Initial vocabulary size: {len(self.vocab)}")
        
        print("\nInitializing splits (words into characters)...")
        self.splits = {word: [c for c in word] for word in self.word_freqs.keys()}
        
        # Apply BPE algorithm
        print("\nStarting BPE merge operations...")
        iteration = 0
        while len(self.vocab) < self.vocab_size:
            iteration += 1
            print(f"\nIteration {iteration}: Current vocab size = {len(self.vocab)}")
            
            print("  Computing pair frequencies...")
            pair_freqs = self._compute_pair_freqs()
            if not pair_freqs:
                print("  No more pairs to merge, stopping early")
                break
                
            best_pair = max(pair_freqs.items(), key=lambda x: x[1])[0]
            best_pair_freq = pair_freqs[best_pair]
            print(f"  Best pair: ('{best_pair[0]}', '{best_pair[1]}') with frequency {best_pair_freq}")
            
            print(f"  Merging pair '{best_pair[0]}' + '{best_pair[1]}'...")
            self.splits = self._merge_pair(best_pair[0], best_pair[1])
            
            self.merges[best_pair] = best_pair[0] + best_pair[1]
            self.vocab.append(best_pair[0] + best_pair[1])
            
            if iteration % 100 == 0:
                print(f"  Current vocab size: {len(self.vocab)}")
                print(f"  Latest 5 tokens: {self.vocab[-5:]}")
        
        print(f"\nBPE training complete. Final vocabulary size: {len(self.vocab)}")
        print(f"Sample vocabulary entries: {self.vocab[:10]}...{self.vocab[-10:]}")
    
    def _compute_pair_freqs(self):
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            split = self.splits[word]
            if len(split) == 1:
                continue
            for i in range(len(split) - 1):
                pair = (split[i], split[i + 1])
                pair_freqs[pair] += freq
        
        return pair_freqs
    
    def _merge_pair(self, a: str, b: str):
        new_splits = {}
        for word, split in self.splits.items():
            if len(split) == 1:
                new_splits[word] = split
                continue

            new_split = []
            i = 0
            while i < len(split):
                if i < len(split) - 1 and split[i] == a and split[i + 1] == b:
                    new_split.append(a + b)
                    i += 2
                else:
                    new_split.append(split[i])
                    i += 1
            new_splits[word] = new_split
        return new_splits

    def flatmap_tokens(self, nested_lists: List[List[List[str]]]) -> List[List[str]]:
        print(f"\nFlattening nested token structure...")
        result = []

        for recipe_idx, recipe_tokens in enumerate(nested_lists):
            if recipe_idx % 100 == 0 and recipe_idx > 0:
                print(f"  - Flattened {recipe_idx} recipes")
                
            flattened = []
            for item_tokens in recipe_tokens:
                flattened.extend(item_tokens)
            
            result.append(flattened)
            
        print(f"Flattening complete. Converted {len(nested_lists)} nested lists to flat token lists.")
        return result

    def tokenize(self, text: str) -> List[str]:
        chars = [c for c in text]
        
        i = 0
        merges_applied = 0
        while i < len(chars) - 1:
            pair = (chars[i], chars[i + 1])
            if pair in self.merges:
                chars = chars[:i] + [self.merges[pair]] + chars[i+2:]
                merges_applied += 1
            else:
                i += 1
        
        return chars

    def tokenize_list(self, text_list: List[str]) -> List[List[str]]:
        return [self.tokenize(text) for text in text_list]

    def fit(self, X: pd.DataFrame, y=None):
        print(f"\n{'='*50}")
        print(f"FITTING BPE TOKENIZER ON {len(X)} SAMPLES")
        print(f"{'='*50}")
        
        missing_cols = [col for col in [self.ingredients_col, self.steps_col] if col not in X.columns]
        if missing_cols:
            raise ValueError(f"Columns {missing_cols} not found in the DataFrame")

        corpus = self._collect_corpus(X)
        
        self._train_bpe(corpus)

        print(f"\nFit complete!")
        return self

    def transform(self, X: pd.DataFrame) -> Tuple[List[List[str]], List[List[str]]]:
        print(f"\n{'='*50}")
        print(f"TRANSFORMING {len(X)} SAMPLES")
        print(f"{'='*50}")
        
        X = X.dropna(subset=[self.ingredients_col, self.steps_col])
        print(f"After dropping NaN values: {len(X)} samples remain")
        
        ingredients_lists = []
        steps_lists = []
        ingredients_counter = defaultdict(int)

        print("\nExtracting ingredients and steps...")
        for idx, row in X.iterrows():
            if idx % 100 == 0 and idx > 0:
                print(f"  - Processed {idx} rows")
                
            ingredients = self._extract_ingredients(row)
            steps = self._extract_steps(row)
            
            for ing in ingredients:
                ingredients_counter[ing] += 1
                
            ingredients_lists.append(ingredients)
            steps_lists.append(steps)

        print(f"\nExtracted {len(ingredients_lists)} ingredient lists and {len(steps_lists)} step lists")
        
        if self.drop_uncommon is not None:
            print(f"\nFiltering uncommon ingredients (threshold: {self.drop_uncommon})...")
            total_ingredients = sum(len(ing_list) for ing_list in ingredients_lists)
            print(f"Total ingredients before filtering: {total_ingredients}")
            
            uncommon_ingredients = {
                ing for ing, count in ingredients_counter.items()
                if count < self.drop_uncommon
            }
            
            print(f"Found {len(uncommon_ingredients)} uncommon ingredients to filter")
            print(f"Examples of uncommon ingredients: {list(uncommon_ingredients)[:5]}")
    
            filtered_ingredients_lists = []
            filtered_steps_lists = []
            for ingredients, steps in zip(ingredients_lists, steps_lists):
                if not any(ing in uncommon_ingredients for ing in ingredients):
                    filtered_ingredients_lists.append(ingredients)
                    filtered_steps_lists.append(steps)
    
            print(f"After filtering: {len(filtered_ingredients_lists)} samples remain")
            ingredients_lists = filtered_ingredients_lists
            steps_lists = filtered_steps_lists

        print("\nTokenizing ingredients and steps...")
        tokenized_ingredients_lists = [self.tokenize_list(ingredients) for ingredients in ingredients_lists]
        tokenized_steps_lists = [self.tokenize_list(steps) for steps in steps_lists]
        
        print("\nFlattening token structures...")
        flat_ingredients_lists = self.flatmap_tokens(tokenized_ingredients_lists)
        flat_steps_lists = self.flatmap_tokens(tokenized_steps_lists)
        
        print(f"\nFinal result: {len(flat_ingredients_lists)} ingredient token lists, {len(flat_steps_lists)} step token lists")
        if flat_ingredients_lists and flat_steps_lists:
            print(f"Example ingredients tokens (first recipe): {flat_ingredients_lists[0][:10]}...")
            print(f"Example steps tokens (first recipe): {flat_steps_lists[0][:10]}...")
        
        print("\nTransform complete!")
        return flat_ingredients_lists, flat_steps_lists

    def fit_transform(self, X: pd.DataFrame, y=None) -> Tuple[List[List[str]], List[List[str]]]:
        print("\nPerforming fit_transform...")
        return self.fit(X).transform(X)

    def save_in_dir(self, directory: str):
        self.save(self.get_savepath(directory))

    def save(self, path: str):
        print(f"\nSaving tokenizer to {path}...")
        tokenizer_data = {
            'vocab': self.vocab,
            'merges': self.merges,
            'word_freqs': dict(self.word_freqs),
            'splits': self.splits
        }
        with open(path, 'wb') as f:
            pickle.dump(tokenizer_data, f)
        print("Tokenizer saved successfully!")
    
    def load(self, path: str):
        print(f"\nLoading tokenizer from {path}...")
        with open(path, 'rb') as f:
            tokenizer_data = pickle.load(f)
        self.vocab = tokenizer_data['vocab']
        self.merges = tokenizer_data['merges']
        self.word_freqs = defaultdict(int)
        self.word_freqs.update(tokenizer_data['word_freqs'])
        self.splits = tokenizer_data['splits']
        print(f"Tokenizer loaded successfully!")
        print(f"Vocabulary size: {len(self.vocab)}")
        print(f"Number of merges: {len(self.merges)}")


In [38]:
sample = 20000
bpe_generator_preprocessing = BPEGeneratorPreprocessing(vocab_size=80)
bpe_generator_preprocessing.fit(data.iloc[:sample])
ingredients, steps = bpe_generator_preprocessing.transform(data)
ingredients_array = np.array(ingredients, dtype=object)
steps_array = np.array(steps, dtype=object)
ingredients_array = np.array(ingredients, dtype=object)
steps_array = np.array(steps, dtype=object)
np.savez_compressed("/kaggle/working/bpe_80_tokenized_data.npz", ingredients=ingredients_array, steps=steps_array)


FITTING BPE TOKENIZER ON 20000 SAMPLES

Collecting corpus from 20000 rows...
  - Processed 100 rows, corpus size: 1534
  - Processed 200 rows, corpus size: 3248
  - Processed 300 rows, corpus size: 4758
  - Processed 400 rows, corpus size: 6269
  - Processed 500 rows, corpus size: 7839
  - Processed 600 rows, corpus size: 9543
  - Processed 700 rows, corpus size: 11095
  - Processed 800 rows, corpus size: 12896
  - Processed 900 rows, corpus size: 14504
  - Processed 1000 rows, corpus size: 16079
  - Processed 1100 rows, corpus size: 17720
  - No steps found (NaN value)
  - Processed 1200 rows, corpus size: 19370
  - Processed 1300 rows, corpus size: 20923
  - Processed 1400 rows, corpus size: 22663
  - Processed 1500 rows, corpus size: 24215
  - Processed 1600 rows, corpus size: 25995
  - Processed 1700 rows, corpus size: 27748
  - Processed 1800 rows, corpus size: 29324
  - Processed 1900 rows, corpus size: 30848
  - Processed 2000 rows, corpus size: 32427
  - Processed 2100 rows, c

In [39]:
data = np.load("/kaggle/working/bpe_80_tokenized_data.npz", allow_pickle=True)
ingredients, steps= data['ingredients'], data['steps']

In [43]:
data = np.load("/kaggle/working/bpe_tokenized_data.npz", allow_pickle=True)
ingredients, steps= data['ingredients'], data['steps']

In [45]:
model = OptimizedBatchFeedForwardNN()
model.fit(ingredients, steps, 
          embedding_dim=256, hidden_dim=128, 
          context=5, epochs=5, dropout=0.2, 
          batch_size=2048, learning_rate=1e-1,
)
model.save("/kaggle/working/bpe_64_optimized_ffnn_256_embedding_128_hidden_5_context_1e-1_lr.keras")

Preparing unique context-grams for training...
Building vocabulary...
Found 816158 unique context-grams
Building sample weights using context gram...
Vocab size: 64
Unique training samples: 1898344
Data preparation took 43.17 seconds
Sample 0: [63 26 17  9  1] - 16
Sample 1: [26 17  9  1 16] - 5
Sample 2: [26 17  9  1 16] - 3
Sample 3: [26 17  9  1 16] - 2
Sample 4: [26 17  9  1 16] - 18
Sample 5: [26 17  9  1 16] - 0
Sample 6: [26 17  9  1 16] - 12
Sample 7: [26 17  9  1 16] - 19
Sample 8: [26 17  9  1 16] - 1
Sample 9: [26 17  9  1 16] - 11
Building model...


None
Epoch 1/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 0.5329 - sparse_categorical_accuracy: 0.1172 - val_loss: 2.1940 - val_sparse_categorical_accuracy: 0.3395 - learning_rate: 0.1000
Epoch 2/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - loss: 0.4637 - sparse_categorical_accuracy: 0.1369 - val_loss: 2.1077 - val_sparse_categorical_accuracy: 0.3507 - learning_rate: 0.1000
Epoch 3/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - loss: 0.4598 - sparse_categorical_accuracy: 0.1380 - val_loss: 2.3128 - val_sparse_categorical_accuracy: 0.3412 - learning_rate: 0.1000
Epoch 4/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - loss: 0.4511 - sparse_categorical_accuracy: 0.1410 - val_loss: 2.0384 - val_sparse_categorical_accuracy: 0.3535 - learning_rate: 0.1000
Epoch 5/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - loss: 

#### Evaluate

In [None]:
pred_steps = model.predict(ingredients[1])