# **Predicting the Next Word**

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Toy training corpus: a handful of short English sentences
corpus = [
    "I love programming in Python",
    "Python is a great programming language",
    "I enjoy learning new programming languages",
    "Programming is fun and exciting",
    "I love to solve problems using Python",
    "I am enjoying",
    "Can't believe we came so far"
]

# Normalise case so that e.g. "Python" and "python" share one token
corpus = list(map(str.lower, corpus))

# Fit the tokenizer on the corpus; the +1 leaves room for index 0, which the
# Keras Tokenizer never assigns to a word
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = 1 + len(tokenizer.word_index)

# For every sentence, collect each prefix that is at least two tokens long
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Left-pad every prefix with zeros up to the length of the longest one
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Split each padded row: all tokens but the last are the predictors,
# the last token is the label (kept as an integer id, not one-hot encoded)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Build the model: embedding -> LSTM -> dropout -> softmax over the vocabulary
model = Sequential()
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 (it only produces a warning there) and the sequence
# length is inferred from X on the first fit() call.
model.add(Embedding(total_words, 100))  # 100-dimensional word vectors
model.add(LSTM(150))  # 150 recurrent units
model.add(Dropout(0.2))  # Add dropout to reduce overfitting
model.add(Dense(total_words, activation='softmax'))  # one output per token id

# Compile the model with sparse categorical cross-entropy, which consumes the
# integer labels in y directly
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100, verbose=1)

# Function to predict the next word
def predict_next_word(text):
    """Return the word the trained model considers most likely to follow `text`.

    The prompt is lower-cased, converted to token ids with the module-level
    `tokenizer`, and left-padded to `max_sequence_length - 1` ids (the same
    width as the training inputs X) before being passed to `model`.

    Args:
        text: Prompt string to continue.

    Returns:
        The vocabulary word (str) with the highest predicted probability.
    """
    token_list = tokenizer.texts_to_sequences([text.lower()])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # Index 0 is the padding id and has no entry in tokenizer.index_word, so a
    # plain argmax over all outputs can raise KeyError. Skip position 0 and add
    # 1 to map the sliced position back to a real token id.
    best_index = int(np.argmax(predicted[0][1:])) + 1
    return tokenizer.index_word[best_index]

# Example usage: ask the trained model to continue each demo prompt
for input_text in ("I love to",):
    next_word = predict_next_word(input_text)
    print(f"The next word prediction for '{input_text}' is: '{next_word}'")




Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0323 - loss: 3.3662
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.0968 - loss: 3.3607
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.0968 - loss: 3.3528
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.1613 - loss: 3.3446
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.1290 - loss: 3.3359
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.1613 - loss: 3.3275
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.1613 - loss: 3.3192
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - accuracy: 0.1290 - loss: 3.3052
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Function to preprocess text (lowercase, strip surrounding whitespace, drop blank lines)
def preprocess_text(corpus):
    """Normalise an iterable of text lines.

    Each line is lower-cased and stripped of leading/trailing whitespace;
    lines that are empty after stripping are discarded. Characters inside a
    line are otherwise left untouched.

    Args:
        corpus: Iterable of strings, one per line.

    Returns:
        A new list with the normalised, non-empty lines in their original order.
    """
    cleaned = []
    for raw_line in corpus:
        normalised = raw_line.lower().strip()
        if normalised:
            cleaned.append(normalised)
    return cleaned

# Load and preprocess the text file for corpus
CORPUS_PATH = "/content/1661-0.txt"

# 'utf-8-sig' decodes plain UTF-8 as usual but also strips a leading byte-order
# mark if the file has one; with plain 'utf-8' the BOM would stay attached to
# the first word and end up as a bogus vocabulary entry.
with open(CORPUS_PATH, 'r', encoding='utf-8-sig') as corpus_file:
    raw_lines = corpus_file.readlines()

# Lower-case / strip every line and drop the empty ones
corpus = preprocess_text(raw_lines)

# Fit the tokenizer on the corpus; the +1 leaves room for index 0, which the
# Keras Tokenizer never assigns to a word
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = 1 + len(tokenizer.word_index)

# For every line, collect each prefix that is at least two tokens long
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Left-pad every prefix with zeros up to the length of the longest one
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Split each padded row: all tokens but the last are the predictors,
# the last token is the label (kept as an integer id, not one-hot encoded)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Build the model: embedding -> LSTM -> dropout -> softmax over the vocabulary
model = Sequential()
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 (it only produces a warning there) and the sequence
# length is inferred from X on the first fit() call.
model.add(Embedding(total_words, 100))  # 100-dimensional word vectors
model.add(LSTM(150))  # 150 recurrent units
model.add(Dropout(0.2))  # Add dropout to reduce overfitting
model.add(Dense(total_words, activation='softmax'))  # one output per token id

# Compile the model with sparse categorical cross-entropy, which consumes the
# integer labels in y directly
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=5, verbose=1)

# Function to predict the next word
def predict_next_word(text):
    """Return the word the trained model considers most likely to follow `text`.

    The prompt is lower-cased, converted to token ids with the module-level
    `tokenizer`, and left-padded to `max_sequence_length - 1` ids (the same
    width as the training inputs X) before being passed to `model`.

    Args:
        text: Prompt string to continue.

    Returns:
        The vocabulary word (str) with the highest predicted probability.
    """
    token_list = tokenizer.texts_to_sequences([text.lower()])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # Index 0 is the padding id and has no entry in tokenizer.index_word, so a
    # plain argmax over all outputs can raise KeyError. Skip position 0 and add
    # 1 to map the sliced position back to a real token id.
    best_index = int(np.argmax(predicted[0][1:])) + 1
    return tokenizer.index_word[best_index]

print(" ")

# Example usage: ask the trained model to continue each demo prompt
for input_text in ("I can’t", "she is"):
    next_word = predict_next_word(input_text)
    print(f"The next word prediction for '{input_text}' is: '{next_word}'")


Epoch 1/5
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 91ms/step - accuracy: 0.0576 - loss: 6.5849
Epoch 2/5
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 113ms/step - accuracy: 0.1113 - loss: 5.6877
Epoch 3/5
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 105ms/step - accuracy: 0.1441 - loss: 5.2851
Epoch 4/5
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 97ms/step - accuracy: 0.1622 - loss: 4.9807
Epoch 5/5
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 90ms/step - accuracy: 0.1791 - loss: 4.7050
 
The next word prediction for 'I can’t' is: 'have'
The next word prediction for 'she is' is: 'a'


# **Correcting the Wrong Spelling**

In [17]:
from spellchecker import SpellChecker

# Initialize the spell checker once at module level; autocorrect() below
# reads this shared instance instead of building a new one per call
spell = SpellChecker()

# Function to perform autocorrect on user input
def autocorrect(text_input):
    """Return `text_input` with misspelled words replaced by their best correction.

    The text is split on whitespace and each word is checked with the
    module-level `spell` checker. Words the checker does not flag are kept
    unchanged; a flagged word with no available correction is also kept
    as typed.

    Args:
        text_input: The text to correct.

    Returns:
        The corrected words joined by single spaces, with no trailing space.
        An empty or whitespace-only input yields an empty string.
    """
    words = text_input.split()

    # Find misspelled words. The default SpellChecker reports them lower-cased,
    # so membership below is tested on the lower-cased form; otherwise a
    # capitalised typo such as "Wronh" would never match and stay uncorrected.
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        if word.lower() not in misspelled_words:
            corrected_words.append(word)
            continue

        suggestion = spell.correction(word)
        if suggestion is None:
            # The checker found no candidate; keep the word rather than
            # failing on None.
            suggestion = word
        elif word[0].isupper():
            # Restore the leading capital of the original word
            # (suggestions appear to come back lower-cased -- TODO confirm).
            suggestion = suggestion[:1].upper() + suggestion[1:]
        corrected_words.append(suggestion)

    # Join once instead of repeated string concatenation
    return " ".join(corrected_words)

# Read one line of text from the user
text_input = input("Enter text: ")

# Run it through the spell checker
corrected_text = autocorrect(text_input)

# Show a heading (preceded by a blank line) followed by the corrected text
print("\nCorrected text:", corrected_text, sep="\n")


Enter text: wronh speling

Corrected text:
wrong spelling 
