# **Importing Libraries**

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [12]:
# Download necessary NLTK data
# 'stopwords' backs the stopwords.words('english') lookup used during preprocessing.
# NOTE(review): 'punkt' does not appear to be used by the cells shown here
# (tokenizing is done with str.split) -- confirm before removing.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

# 1. Data Exploration and Preprocessing

In [13]:
def load_and_preprocess_data(file_path):
    """Load the recipe CSV and add a cleaned ``processed_directions`` column.

    NOTE: a later cell defines ``load_and_preprocess_data`` again (adding a
    ``sample_fraction`` parameter); once that cell runs it shadows this one.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with rows lacking directions and rows with duplicate
        directions removed, plus a ``processed_directions`` column holding the
        lower-cased, letters-only, stop-word-free text.

    Raises:
        ValueError: If no column named ``Directions`` exists and no column name
            contains "direction" or "instruction" (case-insensitive).
    """
    # Load the dataset from the path the caller passed in (it used to be
    # hard-coded, which silently ignored the argument).
    df = pd.read_csv(file_path)

    print("Columns in the dataset:", df.columns.tolist())

    # Check if 'Directions' column exists, if not, try to find a similar column
    directions_column = 'Directions'
    if directions_column not in df.columns:
        possible_columns = [
            col for col in df.columns
            if 'direction' in col.lower() or 'instruction' in col.lower()
        ]
        if not possible_columns:
            raise ValueError("Could not find a suitable column for recipe directions.")
        directions_column = possible_columns[0]
        print(f"Using '{directions_column}' as the directions column.")

    # Data cleaning: build a new frame instead of mutating in place.
    df = (
        df.dropna(subset=[directions_column])
          .drop_duplicates(subset=[directions_column])
    )

    # Text processing
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile(r'[^a-zA-Z\s]')  # compiled once, reused per row

    def preprocess_text(text):
        # Lowercase, strip everything except letters/whitespace, then drop stop words.
        tokens = non_letters.sub('', text.lower()).split()
        return ' '.join(word for word in tokens if word not in stop_words)

    df['processed_directions'] = df[directions_column].apply(preprocess_text)

    return df

In [14]:
def load_and_preprocess_data(file_path, sample_fraction=0.05):
    """Load a random sample of the recipe CSV and clean its directions text.

    Args:
        file_path: Path of the CSV file to read.
        sample_fraction: Fraction of rows to keep, drawn with
            ``random_state=42`` before any cleaning. Pass ``None`` to keep
            every row in file order.

    Returns:
        pandas.DataFrame with rows lacking directions and rows with duplicate
        directions removed, plus a ``processed_directions`` column holding the
        lower-cased, letters-only, stop-word-free text.

    Raises:
        ValueError: If no column named ``Directions`` exists and no column name
            contains "direction" or "instruction" (case-insensitive).
    """
    # Load the dataset from the path the caller passed in (it used to be
    # hard-coded, which silently ignored the argument).
    df = pd.read_csv(file_path)

    # Sample a fraction of the data
    if sample_fraction is not None:
        df = df.sample(frac=sample_fraction, random_state=42)

    print("Columns in the dataset:", df.columns.tolist())

    # Check if 'Directions' column exists, if not, try to find a similar column
    directions_column = 'Directions'
    if directions_column not in df.columns:
        possible_columns = [
            col for col in df.columns
            if 'direction' in col.lower() or 'instruction' in col.lower()
        ]
        if not possible_columns:
            # Fail with a clear message instead of a KeyError from dropna below.
            raise ValueError("Could not find a suitable column for recipe directions.")
        directions_column = possible_columns[0]

    # Data cleaning: build a new frame instead of mutating in place.
    df = (
        df.dropna(subset=[directions_column])
          .drop_duplicates(subset=[directions_column])
    )

    # Text processing
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile(r'[^a-zA-Z\s]')  # compiled once, reused per row

    def preprocess_text(text):
        # Lowercase, strip everything except letters/whitespace, then drop stop words.
        tokens = non_letters.sub('', text.lower()).split()
        return ' '.join(word for word in tokens if word not in stop_words)

    df['processed_directions'] = df[directions_column].apply(preprocess_text)

    return df


# 2. Data Preparation

In [15]:
def prepare_sequences(texts, max_sequence_length, max_words):
    """Fit a word-level tokenizer on ``texts`` and return fixed-length id sequences.

    Args:
        texts: Iterable of strings to fit on and encode.
        max_sequence_length: Length every encoded sequence is padded/truncated to.
        max_words: ``num_words`` limit handed to the tokenizer.

    Returns:
        Tuple ``(padded_sequences, tokenizer)``: the padded id array and the
        fitted tokenizer.
    """
    word_tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    word_tokenizer.fit_on_texts(texts)

    # Pad and truncate on the left so the last tokens of each text are kept.
    encoded_texts = word_tokenizer.texts_to_sequences(texts)
    fixed_length = pad_sequences(
        encoded_texts,
        maxlen=max_sequence_length,
        padding='pre',
        truncating='pre',
    )
    return fixed_length, word_tokenizer

# 3. Model Building

In [16]:
def build_model(vocab_size, embedding_dim, max_sequence_length):
    """Build and compile the stacked-LSTM next-word classifier.

    Args:
        vocab_size: Size of the embedding vocabulary and of the softmax output.
        embedding_dim: Width of the embedding vectors.
        max_sequence_length: Number of token ids in each input sequence.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy (so
        targets must be one-hot encoded), Adam with learning rate 0.001, and
        an accuracy metric.
    """
    model = Sequential([
        # Declare the input shape explicitly rather than through the
        # Embedding ``input_length`` argument, which newer Keras deprecates.
        tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
        Embedding(vocab_size, embedding_dim),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(128),
        Dropout(0.2),
        Dense(vocab_size, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# 4. Training

In [17]:
def train_model(model, X_train, y_train, epochs, batch_size):
    """Fit ``model`` on the training data and return whatever ``model.fit`` returns.

    20% of the supplied training data is handed to ``fit`` as a validation split.
    """
    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_split': 0.2,
    }
    return model.fit(X_train, y_train, **fit_options)

# 5. Recipe Generation

In [18]:
def generate_recipe(model, tokenizer, seed_text, max_sequence_length, num_words):
    """Greedily extend ``seed_text`` one predicted word at a time.

    Args:
        model: Trained model whose ``predict`` returns one probability row over
            token ids for a padded batch of one sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Starting text.
        max_sequence_length: Input length the model expects.
        num_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, space-separated.
        Generation stops early if the model predicts an id that has no word
        (for example the padding id 0) or predicts ``'.'``.
    """
    generated_text = seed_text
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([generated_text])[0]
        # Keep the most recent tokens when the text outgrows the model input.
        encoded = pad_sequences([encoded], maxlen=max_sequence_length, padding='pre', truncating='pre')

        pred = model.predict(encoded, verbose=0)
        pred_index = int(np.argmax(pred, axis=-1)[0])

        # index_word has no entry for the padding id 0; direct indexing used to
        # raise KeyError whenever the model predicted it.
        pred_word = tokenizer.index_word.get(pred_index)
        if pred_word is None:
            break

        generated_text += ' ' + pred_word

        # Only reachable with a tokenizer configured to keep '.' as a token.
        if pred_word == '.':
            break

    return generated_text

# Visualization function

In [19]:
def plot_training_history(history):
    """Save side-by-side loss and accuracy curves to ``training_history.png``.

    Reads the 'loss', 'val_loss', 'accuracy' and 'val_accuracy' series from
    ``history.history``. The figure is closed after saving, so nothing is
    displayed inline.
    """
    metrics = history.history
    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: loss per epoch.
    loss_ax.plot(metrics['loss'], label='Training Loss')
    loss_ax.plot(metrics['val_loss'], label='Validation Loss')
    loss_ax.set_title('Model Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Right panel: accuracy per epoch.
    accuracy_ax.plot(metrics['accuracy'], label='Training Accuracy')
    accuracy_ax.plot(metrics['val_accuracy'], label='Validation Accuracy')
    accuracy_ax.set_title('Model Accuracy')
    accuracy_ax.set_xlabel('Epoch')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.legend()

    fig.tight_layout()
    fig.savefig('training_history.png')
    plt.close(fig)

# Main execution

In [20]:
if __name__ == "__main__":
    # End-to-end run: load and clean the data, build the training arrays, train,
    # evaluate, generate a sample, and save the model and tokenizer.
    import pickle  # used only to persist the tokenizer at the end of this cell

    # Parameters
    file_path = '/kaggle/input/recipenlg-dataset/full_dataset.csv'
    max_sequence_length = 100
    max_words = 10000
    embedding_dim = 100
    epochs = 30
    batch_size = 128

    # 1. Data Exploration and Preprocessing
    # Only data-loading problems get the "check the dataset" hint, and they are
    # re-raised so the traceback stays visible. Training/saving errors used to
    # be swallowed by a blanket `except Exception` with the same message.
    try:
        df = load_and_preprocess_data(file_path)
    except (OSError, KeyError, ValueError) as e:
        print(f"An error occurred: {str(e)}")
        print("Please check the dataset and column names.")
        raise

    # Print some information about the dataset
    print("\nDataset Info:")
    print(df.info())
    print("\nSample processed directions:")
    print(df['processed_directions'].head())

    # 2. Data Preparation
    padded_sequences, tokenizer = prepare_sequences(df['processed_directions'], max_sequence_length, max_words)

    # Sequences are left-padded, so a 0 in the last position means the text was
    # empty after preprocessing. Drop those rows rather than train on the
    # padding id as a target.
    has_target = padded_sequences[:, -1] != 0
    padded_sequences = padded_sequences[has_target]

    # Prepare input sequences and target words: the last token of each recipe
    # is the label, everything before it is the input.
    X = padded_sequences[:, :-1]
    y = padded_sequences[:, -1]
    # NOTE(review): this one-hot matrix is (n_samples, max_words) and is large
    # in memory; integer targets with a sparse loss would avoid it, but that
    # needs a matching change in build_model.
    y = tf.keras.utils.to_categorical(y, num_classes=max_words)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 3. Model Building
    model = build_model(max_words, embedding_dim, max_sequence_length-1)

    # 4. Training
    history = train_model(model, X_train, y_train, epochs, batch_size)

    # Visualize training history
    plot_training_history(history)
    print("Training history plot saved as 'training_history.png'")

    # 5. Evaluation and Recipe Generation
    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"\nTest Accuracy: {test_accuracy:.4f}")

    # Generate a recipe
    seed_text = "to make chicken soup"
    generated_recipe = generate_recipe(model, tokenizer, seed_text, max_sequence_length-1, 50)
    print("\nGenerated Recipe:")
    print(generated_recipe)

    # 6. Documentation and Reporting
    # Save the model
    model.save('recipe_generation_model.h5')

    # Save the tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("\nModel and tokenizer saved. Don't forget to write a detailed report!")

Columns in the dataset: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 111368 entries, 2015528 to 333594
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            111368 non-null  int64 
 1   title                 111368 non-null  object
 2   ingredients           111368 non-null  object
 3   directions            111368 non-null  object
 4   link                  111368 non-null  object
 5   source                111368 non-null  object
 6   NER                   111368 non-null  object
 7   processed_directions  111368 non-null  object
dtypes: int64(1), object(7)
memory usage: 7.6+ MB
None

Sample processed directions:
2015528    remove tenderloin steak score meat combine rem...
1608734    combine ingredients slow cooker quarts bury ch...
778500     cook carrots cut crosswise inch pieces add b



Epoch 1/30
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.0744 - loss: 6.4374 - val_accuracy: 0.1012 - val_loss: 5.5738
Epoch 2/30
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.1091 - loss: 5.4502 - val_accuracy: 0.1528 - val_loss: 5.1555
Epoch 3/30
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.1566 - loss: 5.0283 - val_accuracy: 0.1943 - val_loss: 4.7967
Epoch 4/30
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.2004 - loss: 4.6603 - val_accuracy: 0.2330 - val_loss: 4.5322
Epoch 5/30
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.2375 - loss: 4.3750 - val_accuracy: 0.2610 - val_loss: 4.3531
Epoch 6/30
[1m557/557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.2633 - loss: 4.1783 - val_accuracy: 0.2777 - val_loss: 4.2460
Epoch 7/30
[1m5

In [21]:
!pip install streamlit


  pid, fd = os.forkpty()


Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Downloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.9/82.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: watchdog, pydeck, streamlit
Successfully instal

In [25]:
import pickle

# Persist the fitted tokenizer under the file name the Streamlit cell loads.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [26]:
# Streamlit front end: loads the saved model and tokenizer and predicts the next
# word for the text the user enters. It has to be launched with
# `streamlit run <script>`; executed as a plain notebook cell it only builds the
# widgets without serving them.
import pickle

import numpy as np
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model and tokenizer
model = tf.keras.models.load_model('/kaggle/working/recipe_generation_model.h5')
# pickle.load executes code embedded in the file; only load a tokenizer file
# that this notebook wrote itself.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

max_sequence_len = 100  # Set this to the value used during training

st.title('Recipe Generation App')

input_text = st.text_input('Enter the start of your recipe:')

if st.button('Generate Recipe'):
    if input_text:
        # Preprocess the input text (the model input is one token shorter than
        # the training sequence length).
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Generate the next word. `predict_classes` no longer exists on Keras
        # models, so take the argmax of the predicted probabilities instead.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Decode the prediction to text; ids without a word (e.g. padding id 0)
        # decode to an empty string, as the original reverse scan did.
        output_word = tokenizer.index_word.get(predicted, "")

        st.write('Generated Recipe:', input_text + " " + output_word)
    else:
        st.write('Please enter some text to start the recipe.')


2024-09-04 06:24:24.753 
  command:

    streamlit run /opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-09-04 06:24:24.757 Session state does not function when running a script without `streamlit run`


In [30]:
# NOTE(review): the target below is the IPython kernel launcher module, not a
# Streamlit app script -- confirm the intended script path.
!streamlit run /opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.19.2.2:8501[0m
[34m  External URL: [0m[1mhttp://35.230.2.226:8501[0m
[0m
^C
[34m  Stopping...[0m


In [29]:
# Launches the Streamlit app from app.py in the working directory. The recorded
# output of this cell reports that the file did not exist when it was run.
!streamlit run app.py


Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py
