In [None]:
pip install textdistance

Collecting textdistance
  Downloading textdistance-4.5.0-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.5.0
[0mNote: you may need to restart the kernel to use updated packages.


# **Natural Language Processing Project**
Raghad Bahashwan

Wesal Alkhateeb

Nojood Alnahdi

Reema Talal

# **Import Libraries**

In [None]:
import os
import re
from collections import Counter
from itertools import product

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from textdistance import DamerauLevenshtein
# word_tokenize relies on NLTK's 'punkt' tokenizer models; fetch them if they are not already present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Preprocessing the Dataset**
This section loads and cleans a dataset of Arabic text. The text is preprocessed by removing punctuation and converting it to lowercase (lowercasing only affects embedded Latin-script words, since Arabic script has no letter case), which ensures consistency and prepares the data for further analysis. The preprocessed text is then saved to a CSV file for easy access.

In [None]:
# Clean the raw corpus text, then store it in a one-row pandas DataFrame and save it to a CSV file
def preprocess(text):
    """Return ``text`` with punctuation stripped and letters lowercased.

    Every character that is neither a regex word character nor whitespace
    is deleted; the remaining text is then lowercased.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
# Directory holding the raw .txt articles that make up the corpus.
dir_path = '/kaggle/input/sanad-dataset/Culture'

# Read every .txt file once. Sorting makes the corpus order (and therefore the
# saved CSV) reproducible, since os.listdir returns entries in arbitrary order.
documents = []
for filename in sorted(os.listdir(dir_path)):
    if filename.endswith('.txt'):
        with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as f:
            documents.append(f.read())

# Join with a newline so the last word of one file and the first word of the
# next are not fused into a single bogus token when the text is tokenized later.
all_text = '\n'.join(documents)

cleaned_text = preprocess(all_text)

# The whole cleaned corpus is stored as a single row in a 'text' column.
df = pd.DataFrame({'text': [cleaned_text]})
df.to_csv('preprocessed_text.csv', index=False)
df

Unnamed: 0,text
0,أبوظبي آلاء عبد الغني تناولت الجلسة الثانية في...


# **Spell Checking and Suggestions**

1. **Building the Dictionary and Tokenization:**
   - A frequency dictionary is generated from the preprocessed dataset; it contains every unique word along with its number of occurrences. Before counting, the dataset text is tokenized into individual words with NLTK's `word_tokenize`, the same tokenizer that is later applied to each input sentence. This ensures that the dictionary accurately represents the frequency of each word, which is what ranks the potential corrections for misspelled words.


2. **Error Detection and Correction:**

 - For each word in the input sentence, the Damerau-Levenshtein distance to every dictionary word is computed to detect potential spelling errors, and dictionary words within a maximum distance (1 in this notebook) are kept as candidate corrections. The distance is the minimum number of single-character edits (insertions, deletions, substitutions, or transpositions) needed to change the misspelled word into a valid word from the dictionary.

In [None]:

# Load the dataset
def load_dataset(file_path):
    """Read the UTF-8 encoded CSV at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path, encoding='utf-8')

# Create a dictionary from the dataset (word -> frequency)
def create_dictionary(dataframe):
    """Build a word-frequency table from the first column of ``dataframe``.

    Each non-missing cell of the first column is tokenized with
    ``word_tokenize`` and the resulting tokens are counted.

    Parameters:
        dataframe: pandas DataFrame whose first column holds the text.

    Returns:
        collections.Counter mapping each token to its number of occurrences;
        empty when the frame has no columns or no rows.
    """
    word_counts = Counter()
    if dataframe.shape[1] == 0:
        return word_counts
    # Select the first column by position with .iloc: indexing a label-indexed
    # row with `row[0]` relies on a deprecated positional fallback in pandas.
    for cell in dataframe.iloc[:, 0].dropna():
        # Missing cells were dropped above; str() guards against non-text values.
        word_counts.update(word_tokenize(str(cell)))
    return word_counts

# Calculate the Damerau-Levenshtein distance between two words
def dl_distance(word1, word2):
    """Return the Damerau-Levenshtein distance between ``word1`` and ``word2``.

    Only the distance value is returned; the individual edit operations are
    not reported.
    """
    return DamerauLevenshtein().distance(word1, word2)

# Find the closest words in the dictionary to the misspelled word
def find_closest_words(misspelled_word, dictionary, max_distance):
    """Return dictionary words within ``max_distance`` edits of ``misspelled_word``.

    Parameters:
        misspelled_word: the word to find corrections for.
        dictionary: mapping of word -> frequency count.
        max_distance: largest Damerau-Levenshtein distance still accepted.

    Returns:
        List of ``(word, count, distance)`` tuples ordered by ascending
        distance, then by descending frequency.
    """
    target_length = len(misspelled_word)
    closest_words = []
    for word, count in dictionary.items():
        # Each edit changes the length by at most one character, so the length
        # gap is a lower bound on the distance; skip the costly computation
        # for words that cannot possibly qualify.
        if abs(len(word) - target_length) > max_distance:
            continue
        distance = dl_distance(misspelled_word, word)
        if distance <= max_distance:
            closest_words.append((word, count, distance))
    closest_words.sort(key=lambda candidate: (candidate[2], -candidate[1]))
    return closest_words



3. **Suggestion Generation and Providing Correction Suggestions:**

After detecting potential misspellings, the system generates correction suggestions for each misspelled word. The suggestions are ranked based on:

Distance: The similarity between the misspelled word and the suggested word, measured by the number of character edits.

Frequency: The occurrence frequency of the suggested word in the dataset, with higher priority given to more commonly used words.


- The final output is a list of ranked suggestions for each misspelled word, which can be presented to users for selection or automatic correction.

In [None]:
if __name__ == "__main__":

    # Prefer the published Kaggle copy of the preprocessed corpus; when it is
    # not mounted, fall back to the CSV written by the preprocessing cell so
    # the notebook still runs top to bottom.
    file_path = "/kaggle/input/culture-nlp/preprocessed_text.csv"
    if not os.path.exists(file_path):
        file_path = "preprocessed_text.csv"
    df = load_dataset(file_path)
    dictionary = create_dictionary(df)
    # Set the maximum Damerau-Levenshtein distance to consider a word as a suggestion
    max_distance = 1

# **EXAMPLE 1**

In [None]:

sentence = "السللام علبكم مرخبا اهلاا كسف الهال"
# Use the `word_tokenize` name imported at the top of the notebook (the same
# function as `nltk.word_tokenize`), matching how the dictionary was tokenized.
words = word_tokenize(sentence)
for misspelled_word in words:
    closest_words = find_closest_words(misspelled_word, dictionary, max_distance)
    print(f"Suggestions for '{misspelled_word}':")
    # Candidates arrive sorted by distance, then by descending frequency
    for word, count, distance in closest_words:
        print(f"  - {word}: frequency={count}, distance={distance}")
    print()

Suggestions for 'السللام':
  - السلام: frequency=174, distance=1
  - السلالم: frequency=2, distance=1

Suggestions for 'علبكم':
  - عليكم: frequency=11, distance=1
  - البكم: frequency=1, distance=1

Suggestions for 'مرخبا':
  - مرحبا: frequency=17, distance=1
  - مركبا: frequency=5, distance=1
  - مرعبا: frequency=1, distance=1
  - مرتبا: frequency=1, distance=1
  - مخربا: frequency=1, distance=1

Suggestions for 'اهلاا':
  - اهلا: frequency=1, distance=1
  - اهلال: frequency=1, distance=1

Suggestions for 'كسف':
  - كيف: frequency=715, distance=1
  - كشف: frequency=75, distance=1
  - كسر: frequency=56, distance=1
  - كاف: frequency=29, distance=1
  - كسب: frequency=22, distance=1
  - كنف: frequency=19, distance=1
  - كلف: frequency=13, distance=1
  - كف: frequency=10, distance=1
  - كتف: frequency=9, distance=1
  - نسف: frequency=4, distance=1

Suggestions for 'الهال':
  - الحال: frequency=192, distance=1
  - المال: frequency=93, distance=1
  - الهول: frequency=65, distance=1
  - اله

# **EXAMPLE 2**

In [None]:
sentence = "هاذا مسروع مادةة نعالجة لعات كبيعية"
# Use the `word_tokenize` name imported at the top of the notebook (the same
# function as `nltk.word_tokenize`), matching how the dictionary was tokenized.
words = word_tokenize(sentence)

for misspelled_word in words:

    closest_words = find_closest_words(misspelled_word, dictionary, max_distance)
    print(f"Suggestions for '{misspelled_word}':")

    # Candidates arrive sorted by distance, then by descending frequency
    for word, count, distance in closest_words:
        print(f"  - {word}: frequency={count}, distance={distance}")
    print()

Suggestions for 'هاذا':
  - هذا: frequency=7787, distance=1
  - ماذا: frequency=202, distance=1
  - هكذا: frequency=130, distance=1
  - اذا: frequency=21, distance=1
  - واذا: frequency=9, distance=1
  - هانا: frequency=5, distance=1
  - هاما: frequency=4, distance=1
  - هوذا: frequency=1, distance=1
  - هازا: frequency=1, distance=1
  - فاذا: frequency=1, distance=1

Suggestions for 'مسروع':
  - مشروع: frequency=775, distance=1
  - مسموع: frequency=4, distance=1
  - مروع: frequency=3, distance=1
  - مسرور: frequency=2, distance=1
  - مزروع: frequency=2, distance=1

Suggestions for 'مادةة':
  - مادة: frequency=202, distance=1
  - مادية: frequency=51, distance=1

Suggestions for 'نعالجة':
  - معالجة: frequency=85, distance=1
  - نعالج: frequency=1, distance=1

Suggestions for 'لعات':
  - لغات: frequency=175, distance=1
  - لعام: frequency=159, distance=1
  - لعبت: frequency=56, distance=1
  - لذات: frequency=7, distance=1
  - لعزت: frequency=1, distance=1
  - عات: frequency=1, distance=

# **EXAMPLE 3**

In [None]:
sentence = "الصصباح لدبنا مناقسة مثروع التهرج"
# Use the `word_tokenize` name imported at the top of the notebook (the same
# function as `nltk.word_tokenize`), matching how the dictionary was tokenized.
words = word_tokenize(sentence)

for misspelled_word in words:

    closest_words = find_closest_words(misspelled_word, dictionary, max_distance)
    print(f"Suggestions for '{misspelled_word}':")

    # Candidates arrive sorted by distance, then by descending frequency
    for word, count, distance in closest_words:
        print(f"  - {word}: frequency={count}, distance={distance}")
    print()

Suggestions for 'الصصباح':
  - الصباح: frequency=84, distance=1
  - المصباح: frequency=4, distance=1

Suggestions for 'لدبنا':
  - لدينا: frequency=170, distance=1
  - أدبنا: frequency=12, distance=1
  - لأدبنا: frequency=3, distance=1
  - لعبنا: frequency=1, distance=1

Suggestions for 'مناقسة':
  - مناقشة: frequency=107, distance=1
  - منافسة: frequency=35, distance=1
  - مناقضة: frequency=3, distance=1
  - مناقلة: frequency=1, distance=1

Suggestions for 'مثروع':
  - مشروع: frequency=775, distance=1
  - مروع: frequency=3, distance=1
  - مزروع: frequency=2, distance=1

Suggestions for 'التهرج':
  - المهرج: frequency=28, distance=1
  - التخرج: frequency=15, distance=1
  - التدرج: frequency=6, distance=1
  - التهريج: frequency=4, distance=1
  - التهرب: frequency=2, distance=1
  - التحرج: frequency=1, distance=1
  - الهرج: frequency=1, distance=1



# **EXAMPLE 4**

In [None]:
sentence = "إته اثبوع يختوي علىى الكقير نن الفغاليات"
# Use the `word_tokenize` name imported at the top of the notebook (the same
# function as `nltk.word_tokenize`), matching how the dictionary was tokenized.
words = word_tokenize(sentence)

for misspelled_word in words:

    closest_words = find_closest_words(misspelled_word, dictionary, max_distance)
    print(f"Suggestions for '{misspelled_word}':")

    # Candidates arrive sorted by distance, then by descending frequency
    for word, count, distance in closest_words:
        print(f"  - {word}: frequency={count}, distance={distance}")
    print()

Suggestions for 'إته':
  - إنه: frequency=596, distance=1
  - إله: frequency=6, distance=1
  - إيه: frequency=5, distance=1
  - إتش: frequency=2, distance=1
  - إت: frequency=1, distance=1
  - مته: frequency=1, distance=1
  - إنته: frequency=1, distance=1

Suggestions for 'اثبوع':
  - اسبوع: frequency=4, distance=1

Suggestions for 'يختوي':
  - يحتوي: frequency=73, distance=1
  - يختفي: frequency=11, distance=1
  - يستوي: frequency=7, distance=1
  - يرتوي: frequency=1, distance=1
  - ينتوي: frequency=1, distance=1

Suggestions for 'علىى':
  - على: frequency=29764, distance=1

Suggestions for 'الكقير':
  - الكثير: frequency=1440, distance=1
  - الكبير: frequency=1013, distance=1
  - الفقير: frequency=30, distance=1

Suggestions for 'نن':
  - من: frequency=60152, distance=1
  - أن: frequency=20854, distance=1
  - عن: frequency=14172, distance=1
  - بن: frequency=5308, distance=1
  - إن: frequency=3619, distance=1
  - ان: frequency=1103, distance=1
  - فن: frequency=754, distance=1
  - نح