In [None]:
import PyPDF2
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

<h3>Text extraction and Preprocessing</h3>

In [None]:
def extract_text_from_pdf(pdf_paths):
    text = ""
    for path in pdf_paths:
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    return text

def clean_text(text):
    text = text.lower()   
    text = re.sub(r'(?<![a-zA-Z0-9])[^a-zA-Z0-9]+(?![a-zA-Z0-9])', '', text)
    
    # 2. Collapse multiple spaces into one
    # text = re.sub(r'\s+', ' ', text).strip()
    
    return text
#If you want to add more pdf text data, do it here:
pdf_paths = ["text_data/Fundamentals of a healthy and sustainable diet.pdf", "text_data/Essentials of Healthy Eating.pdf", "text_data/Dietary_Guidelines_for_Americans_2020-2025.pdf"]
raw_text = extract_text_from_pdf(pdf_paths)

#get the web scrapped text data here
text_file = open("webtext.txt", "r")
webtext = text_file.read()
raw_text += webtext


raw_text = clean_text(raw_text)

raw_text = raw_text.replace('\n', '')
raw_text = re.split(r'(?<=[.!?]) +', raw_text)




<h3>Tokenize the words</h3>
<p>Note: Its worth it to try and write our own tokenizer</p>

In [None]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(raw_text)
word_counts = tokenizer.word_counts
words = 5000#len(tokenizer.word_index) + 1
rare_words = [word for word, count in word_counts.items() if count == 1]


print(words)


<h3>Create N-gram sequences and input/output sequences</h3>

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

inputs = []
for line in raw_text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram = token_list[:i+1]
        inputs.append(n_gram)

max_sequence_len = 40 #max([len(seq) for seq in inputs])
print(max_sequence_len)
inputs = np.array(pad_sequences(inputs, maxlen=max_sequence_len, padding='pre'))


display(inputs)
X = inputs[:, :-1]
y = inputs[:, -1]
#For cross_entropy uncomment below  
#y = np.array(tf.keras.utils.to_categorical(y, num_classes=words))




<h3>Define the model</h3>

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# model = Sequential()
# model.add(Embedding(words, 50, input_length=max_sequence_len-1))
# model.add(LSTM(150))
# model.add(Dense(words, activation='softmax'))
# model.build(input_shape=(None, max_sequence_len - 1)) 

model = Sequential()
model.add(Embedding(words, 100, input_length=max_sequence_len-1))  # embedding size 100
model.add(LSTM(150, return_sequences=False))  # still 1 LSTM, no stacking
model.add(Dropout(0.2))  # small dropout to prevent overfitting
model.add(Dense(words, activation='softmax'))
model.build(input_shape=(None, max_sequence_len-1))
print(model.summary())




None


<h3>Train the model</h3>

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam
optimizer = Adam(learning_rate=0.001)

#model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


#split into training sets and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=42)

history = model.fit(
  X_train, y_train,
  validation_data=(X_val, y_val),
  batch_size=32, #you can up this number to 168 will add a boost in training time, but drop in accuracy
  epochs=50, # If youre adding more text data, good idea to start w lower number to see how effecieint it is training 
  callbacks=[EarlyStopping(patience=5, restore_best_weights=True)] #This should stop the training if underfitting is happening
)

Epoch 1/50
[1m2929/2929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 29ms/step - accuracy: 0.0478 - loss: 6.8178 - val_accuracy: 0.1137 - val_loss: 5.9310
Epoch 2/50
[1m1550/2929[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m36s[0m 27ms/step - accuracy: 0.1274 - loss: 5.7066

<h3>Plot the metrics</h3>
<h>The goal here is to keep the Validatin Accuracy close to the Training accuracy so that model doesnt underfit aka just perform well on the training data and bad on any unknown data</h>

In [None]:
import matplotlib.pyplot as plt
history = model.history
plt.plot(history.history['accuracy'], label='Training Accuracy')

plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training vs Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

<h3>Save and test the model</h3>

In [None]:
import pickle

#NOTE*: to save the model so you dont have to retrain , remember to uncomment here and save it , make sure you change the name
# pickle is how you save the tokenizer so that the model can utilize it in future use
# Example of how to load it up is in TestingModel.ipynb
#*-----------------UNCOMMENT ONCE TRAINING IS DONE---------------*
model.save('nutri_model_v1.keras')

with open('nutri_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

seed_text = "Good Sourcs of Protein include"

print("Input Sentence: " , seed_text)
next_words = 20

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word


print("Next predicted words:", seed_text)

In [13]:
from keras.models import load_model
model2 = load_model('my_model.keras')

  saveable.load_own_variables(weights_store.get(inner_path))
