Penn Treebank (PTB) Dataset:

This is a well-known dataset for language modeling tasks.
It consists of a large collection of text from Wall Street Journal articles, making it suitable for predicting the next word in a sentence.

In [7]:
import numpy as np
import pandas as pd

In [8]:

import nltk
nltk.download('brown')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [9]:
from nltk.corpus import brown

# Load sentences from the Brown corpus
sentences = brown.sents()

# Example: Print the first sentence
print(' '.join(sentences[0]))


The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .


In [10]:
sentences[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [11]:
# Join sentences to form a single text
final_sentence = '\n'.join([' '.join(sentence) for sentence in sentences])

In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
# Tokenizer setup
tokenizer = Tokenizer()
tokenizer.fit_on_texts([final_sentence])

In [14]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'a': 5,
 'in': 6,
 'that': 7,
 'is': 8,
 'was': 9,
 'he': 10,
 'for': 11,
 "''": 12,
 'it': 13,
 'with': 14,
 'as': 15,
 'his': 16,
 'on': 17,
 'be': 18,
 'at': 19,
 'by': 20,
 'i': 21,
 'this': 22,
 'had': 23,
 'not': 24,
 'are': 25,
 'but': 26,
 'from': 27,
 'or': 28,
 'have': 29,
 'an': 30,
 'they': 31,
 'which': 32,
 'one': 33,
 'you': 34,
 'were': 35,
 'all': 36,
 'her': 37,
 'she': 38,
 'there': 39,
 'would': 40,
 'their': 41,
 'we': 42,
 'him': 43,
 'been': 44,
 'has': 45,
 'when': 46,
 'who': 47,
 'will': 48,
 'no': 49,
 'more': 50,
 'if': 51,
 'out': 52,
 'so': 53,
 'up': 54,
 'said': 55,
 'what': 56,
 'its': 57,
 'about': 58,
 'than': 59,
 'into': 60,
 'them': 61,
 'can': 62,
 'only': 63,
 'other': 64,
 'time': 65,
 'new': 66,
 'some': 67,
 'could': 68,
 'these': 69,
 'two': 70,
 'may': 71,
 'first': 72,
 'then': 73,
 'do': 74,
 'any': 75,
 'like': 76,
 'my': 77,
 'now': 78,
 'over': 79,
 'such': 80,
 'our': 81,
 'man': 82,
 'me': 83

In [15]:
len(tokenizer.word_index)

44541

In [16]:
# We'll find out all the sentences in the text.
for sentence in final_sentence.split('\n'):
    print(sentence)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
The children are eating , and Miss Blackwell's on her way somewheres '' .
`` To the graveyard .
Who ain't '' ? ?
`` Not me .
I've got a day's work to do .
-- You'll be visiting Miss Doaty , Ma'am '' ? ?
Henrietta nodded .
How much they knew about her ! !
The woman ( she must have been a tiny baby when Hetty and Delia had stood arm in arm , watching great age grow small ) answered the nod with her own .
`` God rest her soul , she was a sweet one .
Come on now '' .
She put a strong hand under the old man's arm and lifted him up , patiently , with the gentle cruelty and necessary tyranny that the young show toward the very old .
He mumbled at her but let himself be led off inside the house , shuffling mightily to make it clear how weak and aged he was and how he was buffeted about by those who still had their wicked strength .
There was a gabble of voices from indoors , young hungry sounds like cats after fish , and a burst 

In [17]:
# Convert text to sequences

input_sequences = []
for sentence in final_sentence.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])


In [18]:
input_sequences

[[1, 5514],
 [1, 5514, 658],
 [1, 5514, 658, 2189],
 [1, 5514, 658, 2189, 1652],
 [1, 5514, 658, 2189, 1652, 55],
 [1, 5514, 658, 2189, 1652, 55, 1901],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30, 2220],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30, 2220, 2],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30, 2220, 2, 14185],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30, 2220, 2, 14185, 569],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30, 2220, 2, 14185, 569, 1131],
 [1, 5514, 658, 2189, 1652, 55, 1901, 30, 2220, 2, 14185, 569, 1131, 1411],
 [1,
  5514,
  658,
  2189,
  1652,
  55,
  1901,
  30,
  2220,
  2,
  14185,
  569,
  1131,
  1411,
  1217],
 [1,
  5514,
  658,
  2189,
  1652,
  55,
  1901,
  30,
  2220,
  2,
  14185,
  569,
  1131,
  1411,
  1217,
  49],
 [1,
  5514,
  658,
  2189,
  1652,
  55,
  1901,
  30,
  2220,
  2,
  14185,
  569,
  1131,
  1411,
  1217,
  49,
  491],
 [1,
  5514,
  658,
  2189,
  1652,
  55,
  1901,
  30,
  2220,
  2,
  14185,
  569,


In [19]:
max_len = max([len(x) for x in input_sequences])

In [20]:
print(max_len)

167


In [21]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [22]:
padded_input_sequences

array([[    0,     0,     0, ...,     0,     1,  5514],
       [    0,     0,     0, ...,     1,  5514,   658],
       [    0,     0,     0, ...,  5514,   658,  2189],
       ...,
       [    0,     0,     0, ...,  2596, 44540,  1681],
       [    0,     0,     0, ..., 44540,  1681,     9],
       [    0,     0,     0, ...,  1681,     9, 44541]], dtype=int32)

In [23]:
X = padded_input_sequences[:,:-1]

In [24]:
y = padded_input_sequences[:,-1]

In [25]:
X.shape

(975933, 166)

In [26]:
X = X[:20000]

My laptop can't handle for data more than this.

In [27]:
print(X.shape)

(20000, 166)


In [28]:
y.shape

(975933,)

In [29]:
y = y[:20000]

In [30]:
print(y.shape)

(20000,)


In [31]:
from tensorflow.keras.utils import to_categorical
# Convert labels to one-hot encoding
num_classes = len(tokenizer.word_index) + 1
y = to_categorical(y, num_classes=num_classes)

In [32]:
print(y.shape)

(20000, 44542)


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [34]:
# Build and compile the model
from tensorflow.keras.layers import Dropout
model = Sequential()
model.add(Embedding(num_classes, 100, input_length=max_len - 1))
model.add(LSTM(150))
model.add(Dense(num_classes, activation='softmax'))

# Build the model by providing a sample input
model.build(input_shape=(None, max_len))



In [35]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [36]:
model.summary()

In [37]:
model.fit(X, y, epochs=50)


Epoch 1/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 456ms/step - accuracy: 0.0612 - loss: 8.2730
Epoch 2/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 449ms/step - accuracy: 0.0756 - loss: 6.7041
Epoch 3/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 450ms/step - accuracy: 0.0934 - loss: 6.3561
Epoch 4/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 447ms/step - accuracy: 0.1084 - loss: 6.0871
Epoch 5/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 446ms/step - accuracy: 0.1187 - loss: 5.8380
Epoch 6/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 445ms/step - accuracy: 0.1267 - loss: 5.5902
Epoch 7/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 442ms/step - accuracy: 0.1402 - loss: 5.3557
Epoch 8/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 443ms/step - accuracy: 0.1551 - loss: 5.0775
Epoch 9/

<keras.src.callbacks.history.History at 0x7dfc6891f070>

In [38]:
# Save the model
model.save('my_model.h5')



In [39]:
# Save the tokenizer
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Load the model for future use
import numpy as np
from tensorflow.keras.models import load_model
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

model = load_model('my_model.h5')
# Load the word prediction tokenizer
with open(r'C:\Users\RohithSai\OneDrive - Indian Institute of Technology Guwahati\documents\PROJECTS\MAIN PROJECTS\OVERALL MACHINE TRANSLATION\tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Predict next word function remains the same
def predict_next_word(text):
    token_text = tokenizer.texts_to_sequences([text])[0]
    # padded_token_text = pad_sequences([token_text], maxlen=max_len - 1, padding='pre')
    padded_token_text = pad_sequences([token_text], maxlen=167 - 1, padding='pre')
    prediction = model.predict(padded_token_text)
    predicted_index = np.argmax(prediction)
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            return word

# Test the prediction
text = "hi i am so"
next_word = predict_next_word(text)
print(f"Next word prediction for '{text}': {next_word}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
Next word prediction for 'hi i am so': that
