In [13]:
path = 'book.txt'
text = open(path).read().lower()
print('corpus length:', len(text))

# Split the text into individual words
words = text.split()
print('Number of words:', len(words))
print('First 10 words:', words[:10])

corpus length: 581888
Number of words: 107603
First 10 words: ['\ufeff', 'project', "gutenberg's", 'the', 'adventures', 'of', 'sherlock', 'holmes,', 'by', 'arthur']


In [14]:
from collections import Counter
from nltk import bigrams
from nltk.probability import FreqDist

# Count the frequency of each word
word_counts = Counter(words)
most_common_words = word_counts.most_common(10)  # Top 10 most frequent words
least_common_words = [word for word, count in word_counts.items() if count == 1]  # Words that appear only once

print('Most common words:', most_common_words)
print('Number of least common words:', len(least_common_words))

# Generate bigrams (pairs of consecutive words)
word_bigrams = list(bigrams(words))
bigram_counts = FreqDist(word_bigrams)
most_common_bigrams = bigram_counts.most_common(10)  # Top 10 most frequent bigrams

print('Most common bigrams:', most_common_bigrams)

Most common words: [('the', 5703), ('and', 2882), ('of', 2758), ('to', 2720), ('a', 2648), ('i', 2533), ('in', 1760), ('that', 1605), ('was', 1371), ('he', 1278)]
Number of least common words: 8576
Most common bigrams: [(('of', 'the'), 740), (('in', 'the'), 511), (('to', 'the'), 313), (('i', 'have'), 247), (('that', 'i'), 245), (('it', 'was'), 244), (('at', 'the'), 238), (('it', 'is'), 235), (('upon', 'the'), 196), (('and', 'the'), 193)]


In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [17]:
sequence_length = 5  # Set the length of input sequences
sequences = []
next_words = []

# Generate sequences of 5 consecutive words and their next word
for i in range(len(words) - sequence_length):
    sequences.append(words[i:i + sequence_length])  # Input: X consecutive words
    next_words.append(words[i + sequence_length])  # Output: The next word

print(f'Number of sequences: {len(sequences)}')
print('First 5 sequences and their next words:')
for seq, next_word in zip(sequences[:5], next_words[:5]):
    print(f'Sequence: {seq} -> Next word: {next_word}')

Number of sequences: 107598
First 5 sequences and their next words:
Sequence: ['\ufeff', 'project', "gutenberg's", 'the', 'adventures'] -> Next word: of
Sequence: ['project', "gutenberg's", 'the', 'adventures', 'of'] -> Next word: sherlock
Sequence: ["gutenberg's", 'the', 'adventures', 'of', 'sherlock'] -> Next word: holmes,
Sequence: ['the', 'adventures', 'of', 'sherlock', 'holmes,'] -> Next word: by
Sequence: ['adventures', 'of', 'sherlock', 'holmes,', 'by'] -> Next word: arthur


In [18]:
# Create a vocabulary of unique words and add <UNK> for unknown words
vocabulary = sorted(set(words))
vocabulary.append('<UNK>')  # Add <UNK> token
vocab_size = len(vocabulary)
word_to_index = {word: i for i, word in enumerate(vocabulary)}
index_to_word = {i: word for i, word in enumerate(vocabulary)}

# Encode sequences and next words as indices
X = []
y = []

for i in range(len(words) - sequence_length):
    sequence = words[i:i + sequence_length]
    next_word = words[i + sequence_length]
    X.append([word_to_index[word] for word in sequence])
    y.append(word_to_index[next_word])

X = np.array(X)
y = np.array(y)

# One-hot encode the output
y = to_categorical(y, num_classes=vocab_size)

# Split the data into training, testing, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [23]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Create a vocabulary of unique words and add <UNK> for unknown words
vocabulary = sorted(set(words))
vocabulary.append('<UNK>')  # Add <UNK> token
vocab_size = len(vocabulary)
word_to_index = {word: i for i, word in enumerate(vocabulary)}
index_to_word = {i: word for i, word in enumerate(vocabulary)}

# Encode sequences and next words as indices
X = []
y = []

for i in range(len(words) - sequence_length):
    sequence = words[i:i + sequence_length]
    next_word = words[i + sequence_length]
    X.append([word_to_index[word] for word in sequence])
    y.append(word_to_index[next_word])

X = np.array(X)
y = np.array(y)

# One-hot encode the output
y = to_categorical(y, num_classes=vocab_size)

# Split the data into training, testing, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Build a simple neural network model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=sequence_length),  # Embedding layer
    LSTM(128, return_sequences=False),  # LSTM layer
    Dense(128, activation='relu'),  # Fully connected layer
    Dense(vocab_size, activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,  # Start with 10 epochs; you can increase this later
    batch_size=64
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

Epoch 1/10
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 32ms/step - accuracy: 0.0522 - loss: 7.4231 - val_accuracy: 0.0529 - val_loss: 6.9141
Epoch 2/10
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 31ms/step - accuracy: 0.0653 - loss: 6.6027 - val_accuracy: 0.0980 - val_loss: 6.6850
Epoch 3/10
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 31ms/step - accuracy: 0.1056 - loss: 6.0480 - val_accuracy: 0.1100 - val_loss: 6.7165
Epoch 4/10
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 61ms/step - accuracy: 0.1293 - loss: 5.6634 - val_accuracy: 0.1160 - val_loss: 6.7891
Epoch 5/10
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 37ms/step - accuracy: 0.1516 - loss: 5.3177 - val_accuracy: 0.1231 - val_loss: 6.9625
Epoch 6/10
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 37ms/step - accuracy: 0.1773 - loss: 4.9957 - val_accuracy: 0.1223 - val_loss: 7.3279
Epoc

In [26]:
def predict_next_word(model, input_text, word_to_index, index_to_word, sequence_length):
    """
    Predict the next word given an input sentence of X words.
    Handles unknown words by replacing them with <UNK>.
    """
    # Preprocess the input text
    input_words = input_text.lower().split()
    if len(input_words) != sequence_length:
        raise ValueError(f"Input must contain exactly {sequence_length} words.")
    
    # Replace unknown words with <UNK>
    input_indices = [
        word_to_index[word] if word in word_to_index else word_to_index['<UNK>']
        for word in input_words
    ]
    
    # Reshape input for prediction
    input_array = np.array(input_indices).reshape(1, -1)
    
    # Predict the next word
    predictions = model.predict(input_array, verbose=0)
    predicted_index = np.argmax(predictions)
    predicted_word = index_to_word[predicted_index]
    
    return predicted_word

# Example usage
custom_input = "the quick brown fox jumps"  # Replace with your own 5-word sentence
try:
    next_word = predict_next_word(model, custom_input, word_to_index, index_to_word, sequence_length)
    print(f"Input: '{custom_input}'")
    print(f"Predicted next word: '{next_word}'")
except ValueError as e:
    print(e)

Input: 'the quick brown fox jumps'
Predicted next word: '“did'
