<a href="https://colab.research.google.com/github/Sana-Izumi/Natural_Language_Processing/blob/main/Final%20Project/English_Spell_Checker_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**English Spell Checker chatbot**

---

The input word is correctly spelled, chatbot will return "Nothing Wrong!". The input word is misspelled, chatbot will suggest the similar word which seems original.

(error_rate = 0.1, epochs = 10, batch_size = 64)

In [2]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import words
import random

# Download the dataset if not already done
nltk.download('words')

# Load the dataset
word_list = words.words()

# Function to introduce misspellings
def introduce_misspellings(word, error_rate=0.1):
    if random.random() < error_rate:
        index = random.randint(0, len(word) - 1)
        return word[:index] + random.choice('abcdefghijklmnopqrstuvwxyz') + word[index + 1:]
    return word

# Create a dataset with misspellings
misspelled_word_list = [introduce_misspellings(word) for word in word_list]

# Combine correct and misspelled words
combined_word_list = word_list + misspelled_word_list
labels = [0] * len(word_list) + [1] * len(misspelled_word_list)

# Tokenize the words
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(combined_word_list)
X_seq = tokenizer.texts_to_sequences(combined_word_list)
max_seq_length = max(len(seq) for seq in X_seq)
X_seq = pad_sequences(X_seq, maxlen=max_seq_length)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_seq, labels, test_size=0.2, random_state=42)

# Convert labels to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.5152648091316223
Accuracy: 0.5152647975077882
              precision    recall  f1-score   support

           0       0.51      0.76      0.61     47305
           1       0.53      0.27      0.36     47390

    accuracy                           0.52     94695
   macro avg       0.52      0.52      0.48     94695
weighted avg       0.52      0.52      0.48     94695



In [None]:
#save the model
filename = 'chatbot_errorRate10_epochs10_batch64.h5'
model.save(filename)

# 1. Load the libraries and dataset

In [16]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import words
import random

# Download the dataset if not already done
nltk.download('words')

# Load the dataset
word_list = words.words()


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


# 2. Create a Corpus with Misspelled Words

Making the dataset mixed words correctly spelling and misspelling

In [17]:
# Function to introduce misspellings
def introduce_misspellings(word, error_rate=0.8):
    if random.random() < error_rate:
        index = random.randint(0, len(word) - 1)
        return word[:index] + random.choice('abcdefghijklmnopqrstuvwxyz') + word[index + 1:]
    return word

# Create a dataset with misspellings
misspelled_word_list = [introduce_misspellings(word) for word in word_list]

# Combine correct and misspelled words
combined_word_list = word_list + misspelled_word_list
labels = [0] * len(word_list) + [1] * len(misspelled_word_list)



# 3. Implementing LSTM-based neural network using Keras

1. Preprocessing Data: Tokenize the words and convert them into sequences.
2. Padding Sequences: Neural networks require input sequences to be of the same length.
3. Building the LSTM Model: Build a sequential model with an embedding layer, LSTM layers, and a dense output layer.
4. Training the Model: Fit the model on the training data.
5. Evaluating the Model: Evaluate the model's performance on the test data.


In [18]:
# Tokenize the words
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(combined_word_list)
X_seq = tokenizer.texts_to_sequences(combined_word_list)
max_seq_length = max(len(seq) for seq in X_seq)
X_seq = pad_sequences(X_seq, maxlen=max_seq_length)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_seq, labels, test_size=0.2, random_state=42)

# Convert labels to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7451396584510803
Accuracy: 0.7451396589049052
              precision    recall  f1-score   support

           0       0.69      0.89      0.78     47305
           1       0.84      0.60      0.70     47390

    accuracy                           0.75     94695
   macro avg       0.77      0.75      0.74     94695
weighted avg       0.77      0.75      0.74     94695



In [None]:
#save the model
filename = 'chatbot_errorRate80_epochs10_batch64.h5'
model.save(filename)



---



(error_rate = 0.5, epochs = 10, batch_size = 32)

In [12]:
# Tokenize the words
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(combined_word_list)
X_seq = tokenizer.texts_to_sequences(combined_word_list)
max_seq_length = max(len(seq) for seq in X_seq)
X_seq = pad_sequences(X_seq, maxlen=max_seq_length)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_seq, labels, test_size=0.2, random_state=42)

# Convert labels to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6407941579818726
Accuracy: 0.6407941285178732
              precision    recall  f1-score   support

           0       0.59      0.91      0.72     47305
           1       0.81      0.37      0.51     47390

    accuracy                           0.64     94695
   macro avg       0.70      0.64      0.61     94695
weighted avg       0.70      0.64      0.61     94695



In [None]:
#save the model
filename = 'chatbot_errorRate50_epochs10_batch32.h5'
model.save(filename)



---



**Chatbot function to correct spelling**

Enter a word. Type 'exit' to quit.

In [19]:
# Chatbot function to correct spelling
def correct_spelling(input_word):
    input_seq = tokenizer.texts_to_sequences([input_word])
    input_seq = pad_sequences(input_seq, maxlen=max_seq_length)

    prediction = model.predict(input_seq)

    if prediction < 0.5:
        return "Nothing Wrong!"

    # Find the most similar word
    similarities = []
    for word in word_list:
        if len(word) == len(input_word):
            sim = sum(1 for a, b in zip(word, input_word) if a == b)
            similarities.append((word, sim))
    if not similarities:
        return "No suggestions available."

    best_match = max(similarities, key=lambda x: x[1])[0]
    accuracy = max(similarities, key=lambda x: x[1])[1] / len(input_word) * 100

    return f"Suggested Correction: {best_match}, Matching Accuracy: {accuracy:.2f}%"

# Simple chatbot interface
def chatbot():
    print("Welcome to the English Correcting Chatbot! Type 'exit' to quit.")
    while True:
        user_input = input("Enter a word: ").strip()
        if user_input.lower() == 'exit':
            break
        correction = correct_spelling(user_input)
        print(correction)

# Run the chatbot
chatbot()

Welcome to the English Correcting Chatbot! Type 'exit' to quit.
Enter a word: hallo
Nothing Wrong!
Enter a word: heplo
Nothing Wrong!
Enter a word: herlo
Nothing Wrong!
Enter a word: pythrn
Suggested Correction: python, Matching Accuracy: 83.33%
Enter a word: wordd
Nothing Wrong!
Enter a word: wolrd
Nothing Wrong!
Enter a word: vread
Suggested Correction: aread, Matching Accuracy: 80.00%
Enter a word: ahind
Suggested Correction: ahind, Matching Accuracy: 100.00%
Enter a word: eixt
Suggested Correction: aint, Matching Accuracy: 50.00%
Enter a word: nomm
Suggested Correction: noma, Matching Accuracy: 75.00%
Enter a word: jupan
Nothing Wrong!
Enter a word: wrod
Suggested Correction: brod, Matching Accuracy: 75.00%
Enter a word: vidoo
Suggested Correction: video, Matching Accuracy: 80.00%
Enter a word: exit
