# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [3]:
# Download necessary NLTK data (quiet=True suppresses the download log output).
# NOTE(review): the preprocessing visible in this notebook tokenizes with
# str.split() rather than an NLTK tokenizer, so the 'punkt' package may be
# unnecessary - confirm nothing else relies on it before removing.
nltk.download('punkt', quiet=True)
# Stop-word corpus read later via stopwords.words('english').
nltk.download('stopwords', quiet=True)

True

# 1. Data Exploration and Preprocessing


In [4]:
def load_and_preprocess_data(file_path, stop_words=None):
    """Load a recipe CSV and add a cleaned ``processed_directions`` column.

    The directions column is looked up as ``'Directions'``; if that exact name
    is absent, the first column whose name contains ``direction`` or
    ``instruction`` (case-insensitive) is used instead. Rows with missing or
    duplicated directions are dropped (the original index labels are kept).
    Each remaining text is lower-cased, stripped of every character that is
    not an ASCII letter or whitespace, split on whitespace, filtered against
    the stop-word set and re-joined with single spaces.

    Args:
        file_path: Path (or any object ``pandas.read_csv`` accepts) of the CSV
            file to load.
        stop_words: Optional iterable of words to remove. When ``None`` the
            NLTK English stop-word list is used.

    Returns:
        pandas.DataFrame: the cleaned frame with an extra
        ``processed_directions`` column.

    Raises:
        ValueError: if no column that looks like recipe directions is found.
    """
    # Honour the caller-supplied path; the argument used to be ignored in
    # favour of a hard-coded Kaggle location.
    df = pd.read_csv(file_path)

    print("Columns in the dataset:", df.columns.tolist())

    # Check if 'Directions' column exists, if not, try to find a similar column
    directions_column = 'Directions'
    if directions_column not in df.columns:
        # str() guards against non-string column labels (e.g. integers).
        possible_columns = [
            col for col in df.columns
            if 'direction' in str(col).lower() or 'instruction' in str(col).lower()
        ]
        if not possible_columns:
            raise ValueError("Could not find a suitable column for recipe directions.")
        directions_column = possible_columns[0]
        print(f"Using '{directions_column}' as the directions column.")

    # Data Cleaning: chained calls return new frames instead of mutating in place.
    df = (
        df
        .dropna(subset=[directions_column])
        .drop_duplicates(subset=[directions_column])
    )

    # Text Processing
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)
    # Compiled once instead of being re-parsed for every row.
    non_letters = re.compile(r'[^a-zA-Z\s]')

    def preprocess_text(text):
        """Lower-case, strip non-letters, and drop stop words from one text."""
        # str() keeps non-string cells (e.g. numbers) from raising AttributeError.
        lowered = str(text).lower()
        letters_only = non_letters.sub('', lowered)
        return ' '.join(word for word in letters_only.split() if word not in stop_words)

    # assign() builds the new column on a fresh copy, avoiding chained-assignment
    # warnings on the filtered frame.
    return df.assign(processed_directions=df[directions_column].apply(preprocess_text))

# 2. Data Preparation


In [5]:
def prepare_sequences(texts, max_sequence_length, max_words):
    """Tokenize ``texts`` and return fixed-length integer sequences.

    A Keras ``Tokenizer`` limited to ``max_words`` (with ``"<OOV>"`` as the
    out-of-vocabulary token) is fitted on ``texts``. The texts are then encoded
    and padded/truncated to ``max_sequence_length``; both padding and
    truncation are applied at the front of each sequence.

    Args:
        texts: Iterable of strings to fit on and encode.
        max_sequence_length: Length every returned sequence is forced to.
        max_words: ``num_words`` limit handed to the tokenizer.

    Returns:
        tuple: ``(padded_sequences, tokenizer)``.
    """
    word_tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
    word_tokenizer.fit_on_texts(texts)

    padded = pad_sequences(
        word_tokenizer.texts_to_sequences(texts),
        maxlen=max_sequence_length,
        truncating='pre',
        padding='pre',
    )
    return padded, word_tokenizer

# 3. Model Building


In [6]:
def build_model(vocab_size, embedding_dim, max_sequence_length):
    """Build and compile the stacked-LSTM next-word model.

    Architecture: Embedding -> LSTM(128, returning sequences) -> Dropout(0.2)
    -> LSTM(128) -> Dropout(0.2) -> Dense(vocab_size, softmax).

    Args:
        vocab_size: Size of the embedding vocabulary and of the softmax output.
        embedding_dim: Dimension of the embedding vectors.
        max_sequence_length: Number of token ids in each input sequence.

    Returns:
        A compiled Keras ``Sequential`` model using categorical cross-entropy
        loss, Adam (learning rate 0.001) and the ``accuracy`` metric.
    """
    model = Sequential([
        # The input shape is declared with an explicit Input layer: the
        # Embedding `input_length` argument is deprecated in Keras 3, while an
        # Input layer is accepted by both Keras 2 and Keras 3.
        tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
        Embedding(vocab_size, embedding_dim),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(128),
        Dropout(0.2),
        Dense(vocab_size, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model


# 4. Training


In [7]:
def train_model(model, X_train, y_train, epochs, batch_size):
    """Fit ``model`` on the training data and return what ``model.fit`` returns.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train: Training inputs, passed through to ``fit`` unchanged.
        y_train: Training targets, passed through to ``fit`` unchanged.
        epochs: Number of epochs to train for.
        batch_size: Mini-batch size.

    Returns:
        The value returned by ``model.fit`` (the training history for Keras
        models).
    """
    fit_options = {
        "epochs": epochs,
        "batch_size": batch_size,
        # A fixed 20% validation split is requested from fit().
        "validation_split": 0.2,
    }
    return model.fit(X_train, y_train, **fit_options)

# 5. Recipe Generation


In [8]:
def generate_recipe(model, tokenizer, seed_text, max_sequence_length, num_words):
    """Greedily extend ``seed_text`` one predicted word at a time.

    On each step the text generated so far is encoded with ``tokenizer``,
    front-padded to ``max_sequence_length`` and fed to ``model.predict``; the
    highest-scoring index is mapped back to a word and appended.

    Args:
        model: Object exposing a Keras-style ``predict`` method.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and an
            ``index_word`` mapping.
        seed_text: Initial text to continue.
        max_sequence_length: Input length expected by ``model``.
        num_words: Maximum number of words to append.

    Returns:
        str: the seed text followed by the generated words.
    """
    generated_text = seed_text
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([generated_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_sequence_length, padding='pre')

        pred = model.predict(encoded, verbose=0)
        pred_index = int(np.argmax(pred))

        # Index 0 is the padding id and has no entry in index_word; direct
        # indexing used to raise KeyError here. The input would not change on
        # the next step either, so stop instead of looping on the same output.
        pred_word = tokenizer.index_word.get(pred_index)
        if pred_word is None:
            break

        generated_text += ' ' + pred_word

        # NOTE(review): presumably unreachable if the tokenizer strips
        # punctuation (the Keras default filters do) - kept as an early exit.
        if pred_word == '.':
            break

    return generated_text

# Visualization function


In [9]:
def plot_training_history(history):
    """Save side-by-side loss and accuracy curves to ``training_history.png``.

    Args:
        history: Object whose ``history`` mapping holds the ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` series.

    The figure is written to disk and then closed; nothing is returned.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # (axes, train key, validation key, train label, validation label, title, y label)
    panels = (
        (loss_ax, 'loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
        (acc_ax, 'accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    )
    for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
        ax.plot(history.history[train_key], label=train_label)
        ax.plot(history.history[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.tight_layout()
    fig.savefig('training_history.png')
    plt.close(fig)

# Main execution


In [None]:
if __name__ == "__main__":
    # End-to-end pipeline: load and clean the recipes, build next-word training
    # pairs, train the LSTM model, evaluate it, generate a sample recipe and
    # persist the model plus tokenizer.
    import pickle
    import traceback

    # Parameters
    file_path = '/kaggle/input/recipenlg-dataset/full_dataset.csv'
    max_sequence_length = 100
    max_words = 10000
    embedding_dim = 100
    epochs = 50
    batch_size = 128

    try:
        # 1. Data Exploration and Preprocessing
        df = load_and_preprocess_data(file_path)

        # Print some information about the dataset
        print("\nDataset Info:")
        # info() prints its report itself and returns None, so it is not
        # wrapped in print() (which used to emit a stray "None").
        df.info()
        print("\nSample processed directions:")
        print(df['processed_directions'].head())

        # 2. Data Preparation
        padded_sequences, tokenizer = prepare_sequences(df['processed_directions'], max_sequence_length, max_words)

        # Sequences are front-padded, so a final id of 0 means the text was
        # empty after cleaning; drop those rows so the model is never trained
        # to predict the padding id.
        has_target = padded_sequences[:, -1] != 0
        padded_sequences = padded_sequences[has_target]

        # Prepare input sequences and target words: every token but the last
        # is the input, and the last token id is the target.
        X = padded_sequences[:, :-1]
        # Targets stay as integer ids. One-hot encoding them would allocate an
        # (n_samples, max_words) matrix - tens of GB for this dataset.
        y = padded_sequences[:, -1]

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # 3. Model Building
        model = build_model(max_words, embedding_dim, max_sequence_length-1)
        # build_model compiles with categorical cross-entropy (one-hot
        # targets); re-compile with the sparse variant so the integer targets
        # above can be used directly.
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(learning_rate=0.001),
            metrics=['accuracy'],
        )

        # 4. Training
        history = train_model(model, X_train, y_train, epochs, batch_size)

        # Visualize training history
        plot_training_history(history)
        print("Training history plot saved as 'training_history.png'")

        # 5. Evaluation and Recipe Generation
        # Evaluate the model
        test_loss, test_accuracy = model.evaluate(X_test, y_test)
        print(f"\nTest Accuracy: {test_accuracy:.4f}")

        # Generate a recipe
        seed_text = "to make chicken soup"
        generated_recipe = generate_recipe(model, tokenizer, seed_text, max_sequence_length-1, 50)
        print("\nGenerated Recipe:")
        print(generated_recipe)

        # 6. Documentation and Reporting
        # Save the model
        model.save('recipe_generation_model.h5')

        # Save the tokenizer
        # NOTE: only unpickle this file from a trusted source; pickle.load can
        # execute arbitrary code.
        with open('tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print("\nModel and tokenizer saved. Don't forget to write a detailed report!")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("Please check the dataset and column names.")
        # Show the full traceback so failures unrelated to the dataset (e.g.
        # out-of-memory or shape errors) can still be diagnosed.
        traceback.print_exc()

Columns in the dataset: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER']
Using 'directions' as the directions column.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2211644 entries, 0 to 2231141
Data columns (total 8 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   Unnamed: 0            int64 
 1   title                 object
 2   ingredients           object
 3   directions            object
 4   link                  object
 5   source                object
 6   NER                   object
 7   processed_directions  object
dtypes: int64(1), object(7)
memory usage: 151.9+ MB
None

Sample processed directions:
0    heavy quart saucepan mix brown sugar nuts evap...
1    place chipped beef bottom baking dish place ch...
2    slow cooker combine ingredients cover cook low...
3    boil debone chicken put bite size pieces avera...
4    combine first four ingredients press x inch un...
Name: processed_directions, d