<a href="https://colab.research.google.com/github/Sreejith-nair511/Summer_course_Ai/blob/main/Document_from__.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk tensorflow pandas numpy

import nltk
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer # calling the library
# Custom tokenizer creation for LSTM and Transformer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fetch the NLTK resources used below: the Brown corpus and the Punkt tokenizer data
for nltk_resource in ('brown', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

from nltk.corpus import brown
from nltk.tokenize import word_tokenize

# Pull paragraphs from a single Brown category ('news' here); pick another
# category, or several, to train on different or more data.
paragraphs = brown.paras(categories='news')
# Keep the example small: only the first few paragraphs are used.
num_paragraphs_to_use = 50
# Every Brown sentence is a list of tokens; rebuild each one as a plain string.
sentences = [
    " ".join(sent)
    for para in paragraphs[:num_paragraphs_to_use]
    for sent in para
]

# Merge all sentence strings into one text blob
text = " ".join(sentences)

# Lowercase for consistency, then split the blob into word-level tokens
words = word_tokenize(text.lower())
print("Number of tokenized Words:", len(words), sep="\n")

# Fit the vocabulary on the token list; words unseen at fit time map to "<OOV>"
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([words])
# word_index already contains the "<OOV>" entry; the extra 1 accounts for
# index 0, which the tokenizer never assigns and which padding uses below.
total_words = 1 + len(tokenizer.word_index)

print("Number of tokenizer tokenized words", total_words, sep="\n")

# Build the training windows. Each window holds up to `context_window`
# consecutive words; its last word is the prediction target and the words
# before it are the model input.
context_window = 20

if len(words) < 2:
    # One context word plus one target word is the minimum for a training pair.
    raise ValueError("Need at least two tokens to build (context, next word) training pairs.")

# Window ends run from 2 to len(words) inclusive so that
#   * every sample has at least one real context word (no all-padding input), and
#   * the final word of the corpus is also used as a target.
input_sequences = [
    words[max(0, end - context_window):end]
    for end in range(2, len(words) + 1)
]

# Convert sequences of words to sequences of integers
sequences = tokenizer.texts_to_sequences(input_sequences)

# Longest window; equals `context_window` once the corpus has that many words
max_sequence_len = max(len(seq) for seq in sequences)

# Left-pad shorter windows so every row has the same length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

# Input: every token of a window except the last one
xs = padded_sequences[:, :-1]
# Target: the last token of the window
ys = padded_sequences[:, -1]

# One-hot encode the targets to match the softmax / categorical_crossentropy setup
ys = to_categorical(ys, num_classes=total_words)

# Build the LSTM next-word model.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument (it is ignored with a warning) and the input shape is inferred
# from the data on the first call to fit().
model = Sequential([
    Embedding(total_words, 100),                # 100-dimensional word embeddings
    LSTM(150),                                  # LSTM layer with 150 units
    Dense(total_words, activation='softmax'),   # probability for every vocabulary index
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model (using fewer epochs for a quick example)
model.fit(xs, ys, epochs=20, verbose=1)

print("Model training complete.")



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Number of tokenized Words:
1662
Number of tokenizer tokenized words
631
Epoch 1/20




[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.0552 - loss: 6.2943
Epoch 2/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.0695 - loss: 5.5627
Epoch 3/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.0705 - loss: 5.6003
Epoch 4/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.0773 - loss: 5.4701
Epoch 5/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.0697 - loss: 5.4516
Epoch 6/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.0720 - loss: 5.2545
Epoch 7/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.1004 - loss: 5.1537
Epoch 8/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step - accuracy: 0.1143 - loss: 4.9610
Epoch 9/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [2]:
# Persist everything needed to reuse the model later: the trained weights,
# the fitted tokenizer, and the sequence length used during training.
import pickle

model.save_weights('brown_lstm.weights.h5')

with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

with open('max_sequence_len.pkl', 'wb') as length_file:  # .pkl = pickle file
    pickle.dump(max_sequence_len, length_file, protocol=pickle.HIGHEST_PROTOCOL)

print("Model weights, tokenizer, and sequence length saved.")

Model weights, tokenizer, and sequence length saved.


In [3]:
import numpy as np
# Restore the saved artifacts: model weights, tokenizer, and max_sequence_len.
# A missing file is reported instead of raised.
try:
    model.load_weights('brown_lstm.weights.h5')
    with open('tokenizer.pkl', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    with open('max_sequence_len.pkl', 'rb') as length_file:
        max_sequence_len = pickle.load(length_file)
except FileNotFoundError:
    print("Error loading saved files. Please run the training code first.")
else:
    print("Model weights, tokenizer, and sequence length loaded successfully.")

# English stop words (downloaded from NLTK), kept as a set for fast membership tests
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

def generate_next_words(seed_text, num_words_to_generate, verbose=True):
    """Greedily extend ``seed_text`` with predicted next words.

    At each step the model scores every vocabulary index and the most
    probable candidate that is purely alphabetic and not an English stop
    word is appended. Other decoding strategies (beam search, nucleus
    sampling, top-k sampling) are not implemented here.

    Args:
        seed_text: Text to continue.
        num_words_to_generate: Maximum number of words to append.
        verbose: When True (the default), print the full probability vector
            and its length at every step.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Fewer than ``num_words_to_generate`` words are appended if a step
        yields no acceptable candidate.
    """
    generated_text = seed_text

    # Tokenize the seed the same way the training text was tokenized
    # (lowercase + word_tokenize) and feed token lists to the tokenizer.
    # Passing a raw string would make the Keras tokenizer apply its own
    # filtering/splitting, which does not match the training vocabulary.
    context_tokens = word_tokenize(seed_text.lower())

    for _ in range(num_words_to_generate):
        # Convert the running context to integer ids and left-pad to the model's input length
        sequence = tokenizer.texts_to_sequences([context_tokens])[0]
        padded_sequence = pad_sequences([sequence], maxlen=max_sequence_len-1, padding='pre')

        # Predict the next word probabilities (one entry per vocabulary index)
        predicted_probabilities = model.predict(padded_sequence, verbose=0)[0]
        if verbose:
            print("Nicely Printing Predicted Probabilities:")
            print(predicted_probabilities)
            print("Length of Predicted Probabilities:")
            print(len(predicted_probabilities))

        # Walk the candidates from most to least probable
        predicted_word = None
        for idx in np.argsort(predicted_probabilities)[::-1]:
            # index_word has no entry for the padding index, so .get() returns None there
            next_word = tokenizer.index_word.get(int(idx))

            # Accept the first candidate that exists, is alphabetic, and is not a stop word
            if next_word and next_word.isalpha() and next_word not in stop_words:
                predicted_word = next_word
                break

        if predicted_word is None:
            # The context did not change, so repeating the prediction would
            # give the same result; stop instead of looping.
            break

        generated_text += " " + predicted_word
        context_tokens.append(predicted_word)

    return generated_text

# Gather lowercase seed sentences from the first 5 'news' paragraphs of Brown;
# adjust the category or slice to test with other sentences.
sample_sentences = [
    " ".join(sent).lower()
    for para in brown.paras(categories='news')[:5]
    for sent in para
]

# How many words to append after each seed text
num_words_to_generate = 5

print("\nTesting the trained model:")
# Only the first 3 sample sentences are used as seeds
seed_sentences = sample_sentences[:3]
for i, seed_sentence in enumerate(seed_sentences):
    print(f"\nSeed text {i+1}: '{seed_sentence}'")
    generated_sequence = generate_next_words(seed_sentence, num_words_to_generate)
    print(f"Generated sequence: '{generated_sequence}'")


Model weights, tokenizer, and sequence length loaded successfully.

Testing the trained model:

Seed text 1: 'the fulton county grand jury said friday an investigation of atlanta's recent primary election produced `` no evidence '' that any irregularities took place .'


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Nicely Printing Predicted Probabilities:
[1.50279249e-07 1.92821076e-07 1.61750969e-02 2.69559115e-01
 1.18064828e-01 5.36995521e-03 2.91573890e-02 3.26024815e-02
 2.91421078e-02 6.86509013e-02 1.79628544e-02 2.49290764e-02
 2.04944238e-02 1.17098163e-04 5.17111621e-04 4.99125104e-03
 8.91929085e-04 8.02474283e-03 2.76184897e-03 2.58800667e-02
 7.97268993e-04 6.66131731e-04 2.64200680e-05 1.74813526e-04
 7.87024765e-05 3.72798974e-03 8.80770013e-03 1.63695868e-03
 1.92639593e-03 3.86096118e-03 8.64789108e-05 1.06882158e-04
 3.85407446e-04 4.37938143e-04 2.24682894e-02 7.62769851e-05
 5.05742501e-05 2.17020176e-02 2.88510639e-02 6.89490407e-04
 7.24482466e-04 8.72167933e-04 5.94378170e-03 1.32461777e-04
 3.37508012e-04 4.17819945e-03 1.76681986e-03 2.32224609e-03
 3.41032934e-03 7.81506591e-04 1.37390736e-02 6.86267763e-03
 3.46366403e-04 6.13106880e-04 1.39223175e-05 4.59820090e-04
 1.00611622e-04 1.13495262e-05 5.53990481e-03 6.39948575e-03
 2.56821426e-04 8.51104924e-05 2.82904762e-0