RNN for Language Model

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.models import save_model, load_model
import re

# Step 1: Prepare the Dataset
corpus_folder = '/content/drive/MyDrive/Datasets/Budget_speech/TXT'

def read_corpus(corpus_folder='/content/drive/MyDrive/Datasets/Budget_speech/TXT'):
    corpus_text = ''
    for filename in os.listdir(corpus_folder):
        file_path = os.path.join(corpus_folder, filename)
        with open(file_path, "r") as f:
            corpus_text += f.read().lower()
    return corpus_text

# Step 2: Text Preprocessing

def process_text():
    corpus_text = read_corpus()
    filters = '!"#$%&()*+/:;<=>?@[\\]^_`{|}~'
    tokenizer = Tokenizer(filters=filters)
    tokenizer.fit_on_texts([corpus_text])

    vocab_size = len(tokenizer.word_index) + 1
    sequences = tokenizer.texts_to_sequences([corpus_text])[0]
    return vocab_size, sequences, tokenizer

# Step 3: Prepare Training Data


def build_model():
    vocab_size, sequences,tokenizer = process_text()
    sequence_length = 10
    sequences = np.array(sequences)
    X = []
    y = []

    for i in range(sequence_length, len(sequences)):
        sequence = sequences[i-sequence_length:i]
        target = sequences[i]
        X.append(sequence)
        y.append(target)

    X = np.array(X)
    y = np.array(y)

    # Pad sequences if needed
    X = pad_sequences(X, maxlen=sequence_length)

    # Step 4: Build the Language Model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=sequence_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return X,y, model

def train():

    X,y,model = build_model()
    #we can add {epoch:02d}-{loss:.4f} to the filename to create epoch-wise chp files
    check_point_file = '/content/drive/MyDrive/Datasets/Budget_speech/Models/model_checkpoint.30+{epoch:02d}-{loss:.4f}.h5'
    if os.path.isfile('/content/drive/MyDrive/Datasets/Budget_speech/Models/model_checkpoint.10+20-1.3278.h5'):
      model.load_weights('/content/drive/MyDrive/Datasets/Budget_speech/Models/model_checkpoint.10+20-1.3278.h5')
    # Step 5: Train the Language Model

    checkpoint_callback = ModelCheckpoint(check_point_file,
                                        save_weights_only=False,
                                        save_best_only=True,
                                        monitor='val_loss')

    model.summary()

    # checkpoint_callback = ModelCheckpoint('./Models/model_checkpoint.h5',
    #                                       save_weights_only=False, save_best_only=True)
    # model.fit(X, y, batch_size=128, epochs=2, callbacks=[checkpoint_callback])
    model.fit(X, y, batch_size=128, epochs=10,
            validation_data=(X, y),
            callbacks=[checkpoint_callback])

    # model.save('/content/drive/MyDrive/Datasets/Budget_speech/Models/Keras_RNN.LM')

def generate_sentences(seed_text, num_sentences, sequence_length):
    _,_,tokenizer = process_text()
    _,_,model = build_model()
    check_point_file = '/content/drive/MyDrive/Datasets/Budget_speech/Models/model_checkpoint.h5'
    if os.path.isfile('/content/drive/MyDrive/Datasets/Budget_speech/Models/model_checkpoint.10+20-1.3278.h5'):
      model.load_weights(check_point_file)


    generated_text = seed_text + ': '
    for _ in range(num_sentences):
        for _ in range(sequence_length):
            input_sequence = tokenizer.texts_to_sequences([seed_text])[0]
            input_sequence = np.array(input_sequence)
            input_sequence = pad_sequences([input_sequence], maxlen=sequence_length)

            predicted_index = np.argmax(model.predict(input_sequence))
            predicted_word = tokenizer.index_word[predicted_index]

            seed_text += " " + predicted_word

            if predicted_word == '.':
                break

        generated_text += seed_text
        seed_text = predicted_word

    return generated_text


train()

seed_text = "budget speech"
num_sentences = 5
generated_text_length = 100
sequence_length = 10

# model = load_model('/content/drive/MyDrive/Datasets/Budget_speech/Models/Keras_RNN.LM')
generated_text = generate_sentences(seed_text, num_sentences, sequence_length)
print(generated_text)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           8927100   
                                                                 
 lstm (LSTM)                 (None, 10, 128)           117248    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 89271)             11515959  
                                                                 
Total params: 20,691,891
Trainable params: 20,691,891
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
1490/7079 [=====>........................] - ETA: 2:45 - loss: 1.1826

**After 1 iteration**

budget speech: budget speech the government is proposed to provide that the government isis being made to the extent of the states and thethe manufacture of duty on the income tax act is beingbeing reduced from 10 to 20 per cent to 10 perper cent to 10 per cent to 10 per cent to

**After 10 iterations**

budget speech: budget speech with india’s respective things, provide substantial
employment. a new rural developmentdevelopment of andhra pradesh and other
disadvantaged women shgs have been completedcompleted recovery of jammu kashmir farmers have been completed. the
flagship newnew ‘updated city city blocks and assam and tamil nadu, uttaruttar pradesh, jammu petrochemicals, gas technology development of
cctv and is notified