In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from difflib import get_close_matches

# Load dataset from CSV
data = pd.read_csv("dataset.csv")

# Data Cleaning: Remove any rows with missing values
data = data.dropna(subset=["Mispelled", "Correct"])

ModuleNotFoundError: No module named 'pandas'

In [3]:
data

NameError: name 'data' is not defined

In [3]:
data.head()

Unnamed: 0,Mispelled,Correct
0,clanberry,cranberry
1,persimmmmon,persimmon
2,elterberry,elderberry
3,orangt,orange
4,blueqerry,blueberry


In [4]:
# Data Preprocessing: Convert text to lowercase and remove special characters
data["Mispelled"] = data["Mispelled"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s]", "", x.lower()))
data["Correct"] = data["Correct"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s]", "", x.lower()))

# Preprocess dataset and split into features (X) and labels (y)
X = data["Mispelled"]
y = data["Correct"]

# Use LabelEncoder to convert text data to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Fit only once on the entire target data

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a bag-of-words vectorizer
vectorizer = CountVectorizer()

# Transform the training data into numerical features
X_train_vectorized = vectorizer.fit_transform(X_train.astype(str))

# Transform the test data using the same vectorizer
X_test_vectorized = vectorizer.transform(X_test.astype(str))

In [5]:
#Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 77.53%


In [6]:
#SVC
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_vectorized, y_train)
y_pred_svm = svm_model.predict(X_test_vectorized)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Model Accuracy: {accuracy_svm * 100:.2f}%")


SVM Model Accuracy: 77.81%


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test_vectorized)

# Calculate accuracy for Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf * 100:.2f}%")


Random Forest Model Accuracy: 77.40%


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from difflib import get_close_matches
import numpy as np

# Load dataset from CSV (replace 'synthetic_autocorrect_dataset.csv' with your dataset)
data = pd.read_csv("dataset.csv")

# Preprocess dataset and split into features (X) and labels (y)
X = data["Mispelled"]
y = data["Correct"]

# Create a bag-of-words vectorizer
vectorizer = CountVectorizer()

# Transform the data into numerical features
X_vectorized = vectorizer.fit_transform(X.astype(str))

# Train the SVM model
model = SVC()
model.fit(X_vectorized, y)

# Autocorrect function using pre-trained model and vectorizer
def autocorrect(input_word, correct_words, model, vectorizer, max_suggestions=1):
    input_word_vectorized = vectorizer.transform(np.array([input_word]))  # Reshape to 2D array
    if not input_word_vectorized.getnnz():
        return input_word, ["No suggestions available."]

    input_word_str = vectorizer.inverse_transform(input_word_vectorized)[0][0]  # Convert back to string
    input_word_lower = input_word_str.lower()

    if input_word_lower in [word.lower() for word in correct_words]:
        return input_word_str, ["Word is already correct."]

    # Get close matches based on edit distance with the correct words
    close_matches = get_close_matches(input_word_lower, [word.lower() for word in correct_words], n=max_suggestions, cutoff=0.75)

    if not close_matches:
        # If no close matches found, use the SVM model to autocorrect the word
        corrected_word = model.predict(input_word_vectorized)[0]
        return input_word_str, [corrected_word]

    return input_word_str, close_matches[:max_suggestions]

# Test the autocorrect function interactively
while True:
    user_input = input("Enter a word to autocorrect (or 'q' to quit): ")
    if user_input.lower() == 'q':
        break

    result, suggestions = autocorrect(user_input, data["Correct"].tolist(), model, vectorizer)

    print(f"Input Word: {user_input}")
    print(f"Autocorrected Word: {suggestions}")



Enter a word to autocorrect (or 'q' to quit): grap
Input Word: grap
Autocorrected Word: grap
Suggestions: ['grape', 'grape', 'grape']


In [None]:
# import random
# import pandas as pd
# import string

# # Function to introduce random typos in a word
# def introduce_typos(word):
#     if len(word) <= 1:
#         return word

#     typo_options = ["insert", "delete", "replace"]
#     typo_choice = random.choice(typo_options)

#     if typo_choice == "insert":
#         return word[:random.randint(0, len(word))] + random.choice(string.ascii_lowercase) + word[random.randint(0, len(word)):]

#     elif typo_choice == "delete":
#         idx = random.randint(0, len(word) - 1)
#         return word[:idx] + word[idx + 1:]

#     else:  # typo_choice == "replace"
#         idx = random.randint(0, len(word) - 1)
#         return word[:idx] + random.choice(string.ascii_lowercase) + word[idx + 1:]

# # Generate synthetic autocorrect dataset
# num_entries = 100000
# correct_words = ["apple", "banana", "orange", "grape", "cherry", "peach", "pear", "watermelon", "strawberry", "blueberry",
#                  "mango", "pineapple", "kiwi", "plum", "apricot", "pomegranate", "raspberry", "blackberry", "melon",
#                  "lemon", "lime", "coconut", "avocado", "guava", "fig", "nectarine", "papaya", "cantaloupe", "cranberry",
#                  "date", "dragonfruit", "elderberry", "gooseberry", "kiwifruit", "mangosteen", "passionfruit", "persimmon",
#                  "rhubarb", "tangerine", "boysenberry", "currant", "mulberry", "loganberry", "starfruit", "grapefruit",]

# misspelled_words = []
# corrected_words = []

# for _ in range(num_entries):
#     correct_word = random.choice(correct_words)
#     misspelled_word = introduce_typos(correct_word)
#     misspelled_words.append(misspelled_word)
#     corrected_words.append(correct_word)

# # Create a DataFrame for the dataset
# synthetic_dataset = pd.DataFrame({"Mispelled": misspelled_words, "Correct": corrected_words})

# # Save the dataset to a CSV file
# synthetic_dataset.to_csv("dataset.csv", index=False)


In [5]:
from joblib import dump
dump(model,'model.joblib')

NameError: name 'model' is not defined