### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Spelling Corrections

**Steps**:
1. Data Set: Import a dataset containing text reviews with spelling errors.
2. Apply Corrections: Use a spell-checker from an NLP library to correct spelling mistakes.
3. Verify Improvements: Review the corrections to ensure data quality improvement.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from spellchecker import SpellChecker # Import the SpellChecker class
import re

# Set random seed for reproducibility.
# This seeds NumPy's global generator, which is what the np.random.choice /
# np.random.rand calls in this script draw from (both when building the
# synthetic reviews and when picking rows to display during verification).
np.random.seed(42)

# --- 1. Data Set: Generate Synthetic Text Reviews with Spelling Errors ---
def generate_reviews_with_errors(num_reviews=100):
    """Build a DataFrame of synthetic, deliberately misspelled reviews.

    Each review is one of the hard-coded templates below, picked with
    ``np.random.choice``. Two extra corruptions are then applied
    independently of each other:

    * with probability 0.2 every ``"the"`` substring becomes ``"teh"``;
    * with probability 0.1 every ``"is"`` substring becomes ``"iz"``.

    Both use plain ``str.replace``, so they are case-sensitive and also hit
    the substring inside longer words (e.g. the ``"is"`` in ``"This"``).

    Randomness comes from NumPy's global generator; seed it beforehand for
    repeatable output.

    Args:
        num_reviews: Number of rows to generate.

    Returns:
        A ``pd.DataFrame`` with columns ``review_id`` (0 .. num_reviews - 1)
        and ``review_text_original``.
    """
    base_reviews = [
        "This prodact is amzing! I absolutly love it.",
        "The delivery was super fast and the item was as describd. Highly recomendd.",
        "It's okey, not gret, not terribble. Could be bettr.",
        "Disappointed with the qality. It brok after a week.",
        "Excelent costumer service and a grat user experience.",
        "Worst purchas ever. Complet wast of mony.",
        "Prety good for the priece. Would buy agian.",
        "Item recived damaged. Very unhappy.",
        "The softwear is bugy and crashes frequntly.",
        "A relible choise. Nothing fency, but gets the job don.",
        "Could use sum improvemnts. Too many adds.",
        "I expecd more. The battery life is horible.",
        "Fantastik! I'm so glad I bought this.",
        "Mediocre experienc. I've seen beter.",
        "Perfct fit and confortable.",
        "Lagging performence. Do not recomend.",
        "Grat value for mony. Will tell my frends.",
        "The instrucions were unclar. Hard to assamble.",
        "Solid bild quality. A bit hevy though.",
        "An absulute game-changer! Don't hesitete to buy.",
    ]

    def _draw_review():
        # All three random draws happen up front, in a fixed order
        # (template, "the" flag, "is" flag), so the sequence of values taken
        # from the generator does not depend on which corruptions fire.
        text = np.random.choice(base_reviews)
        swap_the = np.random.rand() < 0.2
        swap_is = np.random.rand() < 0.1
        if swap_the:
            text = text.replace("the", "teh")  # Common typo
        if swap_is:
            text = text.replace("is", "iz")  # Slang/misspelling
        return text

    generated = [_draw_review() for _ in range(num_reviews)]
    return pd.DataFrame({
        'review_id': range(num_reviews),
        'review_text_original': generated,
    })

# Build the 200-row working dataset and preview its first rows.
customer_reviews_df = generate_reviews_with_errors(200)
print(
    "--- Sample of Original Reviews with Spelling Errors ---",
    customer_reviews_df.head(),
    sep="\n",
)

# --- 2. Apply Corrections: Use a Spell-Checker ---

# One shared checker instance, reused for every review.
# Constructed with no arguments, so it loads its default English dictionary.
spell = SpellChecker()

# A run of letters, optionally continued across internal apostrophes so that
# contractions ("don't", "i'm") stay in one piece. The \b anchors stop the
# pattern from matching a letter run embedded in an alphanumeric token such as
# "mp3" or "3d". \u2019 is the typographic apostrophe.
_WORD_RE = re.compile(r"\b[^\W\d_]+(?:['\u2019][^\W\d_]+)*\b")
_APOSTROPHES = ("'", "\u2019")


def correct_spelling(text):
    """
    Corrects spelling mistakes in a given text.

    The text is lower-cased, then every alphabetic word is replaced in place
    by ``spell.correction(word)``. Everything that is not a word (spacing,
    punctuation, hyphens, digits) is copied through untouched, so the
    sentence structure of the input is preserved.

    Two kinds of token are deliberately left as they are:

    * contractions (words containing an apostrophe) -- splitting them at the
      apostrophe yields fragments like "don" / "t" that would be corrected
      independently and glued back together without the apostrophe;
    * tokens that mix letters with digits or underscores.

    Args:
        text: The review text. Non-string values (e.g. None or NaN from a
            DataFrame column) are returned unchanged instead of raising.

    Returns:
        The lower-cased, spell-corrected text, or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text

    def _fix(match):
        word = match.group(0)
        if any(mark in word for mark in _APOSTROPHES):
            return word
        suggestion = spell.correction(word)
        # correction() may yield None when it has no candidate; keep the
        # original word in that case.
        return word if suggestion is None else suggestion

    return _WORD_RE.sub(_fix, text.lower())

# Run every original review through correct_spelling and store the result in
# a new column next to the source text.
original_column = customer_reviews_df['review_text_original']
customer_reviews_df['review_text_corrected'] = original_column.apply(correct_spelling)

# Preview the first rows, original and corrected side by side.
comparison_columns = ['review_text_original', 'review_text_corrected']
print("\n--- Sample of Corrected Reviews ---")
print(customer_reviews_df[comparison_columns].head())

# --- 3. Verify Improvements: Review Corrections ---
print("\n--- Verification: Original vs. Corrected (Random Samples) ---")
word_pattern = re.compile(r'\b\w+\b')
corrections_made_count = 0
total_words_processed = 0

# Inspect five distinct, randomly chosen rows.
for row_label in np.random.choice(customer_reviews_df.index, 5, replace=False):
    sample_row = customer_reviews_df.loc[row_label]
    before = sample_row['review_text_original']
    after = sample_row['review_text_corrected']

    before_tokens = word_pattern.findall(before.lower())
    after_tokens = word_pattern.findall(after.lower())

    print(f"\nReview ID: {sample_row['review_id']}")
    print(f"Original: {before}")
    print(f"Corrected: {after}")

    # Compare tokens position by position. zip() stops at the shorter list,
    # so an original word with no counterpart is counted as processed but
    # never reported as a correction.
    changed = {}
    for old_word, new_word in zip(before_tokens, after_tokens):
        if old_word != new_word:
            changed[old_word] = new_word
            corrections_made_count += 1
    total_words_processed += len(before_tokens)

    if not changed:
        print("  No significant spelling changes in this sample.")
    else:
        print(f"  Specific corrections: {changed}")
    print("-" * 70)

# Totals over the five sampled rows only.
print(f"\nTotal approximate words processed across samples: {total_words_processed}")
print(f"Total approximate corrections made across samples: {corrections_made_count}")

# Character-length comparison over the whole dataset.
original_lengths = customer_reviews_df['review_text_original'].apply(len)
corrected_lengths = customer_reviews_df['review_text_corrected'].apply(len)
print(f"\nAverage length of original reviews: {original_lengths.mean():.2f} characters")
print(f"Average length of corrected reviews: {corrected_lengths.mean():.2f} characters")


--- Sample of Original Reviews with Spelling Errors ---
   review_id                               review_text_original
0          0        Prety good for the priece. Would buy agian.
1          1                Item recived damaged. Very unhappy.
2          2             Solid bild quality. A bit hevy though.
3          3  Disappointed with teh qality. It brok after a ...
4          4  The delivery was super fast and the item was a...

--- Sample of Corrected Reviews ---
                                review_text_original  \
0        Prety good for the priece. Would buy agian.   
1                Item recived damaged. Very unhappy.   
2             Solid bild quality. A bit hevy though.   
3  Disappointed with teh qality. It brok after a ...   
4  The delivery was super fast and the item was a...   

                               review_text_corrected  
0        pretty good for the piece. would buy again.  
1               item received damaged. very unhappy.  
2           solid bui