### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Data Set: Use a dataset of text product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Examine the effectiveness by analyzing word frequency before and after
removal.

In [7]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re

# --- STEP 1: CRITICAL - NLTK RESOURCE DOWNLOADS ---
# **Please run this section first, and verify the output.**
# **If you still get LookupError AFTER running this, RESTART YOUR PYTHON KERNEL/ENVIRONMENT and run the entire script again.**

print("--- NLTK Resource Download Check ---")
try:
    nltk.data.find('corpora/stopwords')
    print("'stopwords' corpus already downloaded.")
except LookupError:
    print("Downloading 'stopwords' corpus...")
    nltk.download('stopwords')
    print("'stopwords' download complete.")

try:
    nltk.data.find('tokenizers/punkt')
    print("'punkt' tokenizer data already downloaded.")
except LookupError:
    print("Downloading 'punkt' tokenizer data...")
    nltk.download('punkt') # This includes 'punkt_tab'
    print("'punkt' download complete.")

print("--- NLTK Resources Ready. Proceeding with script execution. ---")
print("If you had to download, please restart your kernel/environment and run again.")


np.random.seed(42)

# --- 1. Data Set: Generate Text Product Descriptions ---
def generate_product_descriptions(num_descriptions=100):
    templates = [
        "This is a high-quality product that offers excellent performance and durability. It is perfect for everyday use.",
        "A truly innovative device with a sleek design and powerful features. You will love using it.",
        "Our new item provides great value for your money. It is easy to use and very efficient.",
        "An essential tool for all your needs. It comes with a full warranty and dedicated customer support.",
        "The best solution for your home or office. It has many advanced functionalities and is very reliable.",
        "We have developed this especially for you. It simplifies tasks and improves productivity significantly.",
        "This wonderful gadget is designed to make your life easier and more enjoyable. It is available now.",
        "Experience the difference with our superior product. It delivers consistent results every time.",
        "A versatile and robust piece of equipment. It is built to last and withstand tough conditions.",
        "Ideal for professionals and enthusiasts alike. It combines precision with ease of operation."
    ]
    product_names = [
        "Smart Speaker", "Wireless Earbuds", "Portable Charger", "Ergonomic Mouse",
        "HD Camera", "Fitness Tracker", "Gaming Keyboard", "Robot Vacuum",
        "Electric Kettle", "Smartwatch"
    ]

    descriptions = []
    for i in range(num_descriptions):
        template = np.random.choice(templates)
        product = np.random.choice(product_names)
        desc = f"Introducing the {product}. {template}"
        if np.random.rand() < 0.2:
            desc = desc.replace("it is", "it's").replace("it has", "it's got")
        if np.random.rand() < 0.1:
            desc = desc.replace("This is", "This a")
        descriptions.append(desc)

    return pd.DataFrame({'product_id': range(num_descriptions), 'description_original': descriptions})

product_df = generate_product_descriptions(num_descriptions=150)
print("\n--- Sample of Original Product Descriptions ---")
print(product_df.head())

# --- 2. Stopword Removal: Utilize NLTK ---
# Load English stopwords AFTER ensuring they are downloaded
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(text):
    """
    Cleans text by lowercasing, removing non-alphabetic characters,
    and then tokenizing using NLTK's word tokenizer.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove punctuation and numbers
    # This line below requires 'punkt' data to be loaded.
    # If this line fails, it means the 'punkt' data is still not found.
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_stopwords_from_tokens(tokens):
    """
    Removes stopwords from a list of tokens.
    """
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# Apply cleaning and tokenization
# This is the line where the error occurs if punkt data is missing.
product_df['description_tokens_original'] = product_df['description_original'].apply(clean_and_tokenize)
# Apply stopword removal
product_df['description_tokens_no_stopwords'] = product_df['description_tokens_original'].apply(remove_stopwords_from_tokens)
# Join tokens back into a string for display
product_df['description_cleaned'] = product_df['description_tokens_no_stopwords'].apply(lambda x: ' '.join(x))

print("\n--- Sample of Descriptions After Stopword Removal ---")
print(product_df[['description_original', 'description_cleaned']].head())

# --- 3. Assess Impact: Examine Word Frequency ---
# Collect all words before stopword removal
all_words_original = []
for tokens in product_df['description_tokens_original']:
    all_words_original.extend(tokens)

# Collect all words after stopword removal
all_words_no_stopwords = []
for tokens in product_df['description_tokens_no_stopwords']:
    all_words_no_stopwords.extend(tokens)

# Calculate word frequencies
word_freq_original = Counter(all_words_original)
word_freq_no_stopwords = Counter(all_words_no_stopwords)

print("\n--- Evaluation: Word Frequency Before and After Stopword Removal ---")
print("\n**Top 20 most frequent words (BEFORE stopword removal):**")
for word, count in word_freq_original.most_common(20):
    print(f"'{word}': {count}")

print("\n**Top 20 most frequent words (AFTER stopword removal):**")
for word, count in word_freq_no_stopwords.most_common(20):
    print(f"'{word}': {count}")

print("\n--- Example Comparison of a Single Description ---")
example_index = np.random.randint(0, len(product_df))
print(f"Original Description (ID: {product_df.loc[example_index, 'product_id']}):")
print(product_df.loc[example_index, 'description_original'])
print("\nCleaned Description (No Stopwords):")
print(product_df.loc[example_index, 'description_cleaned'])

--- NLTK Resource Download Check ---
'stopwords' corpus already downloaded.
'punkt' tokenizer data already downloaded.
--- NLTK Resources Ready. Proceeding with script execution. ---
If you had to download, please restart your kernel/environment and run again.

--- Sample of Original Product Descriptions ---
   product_id                               description_original
0           0  Introducing the Ergonomic Mouse. This wonderfu...
1           1  Introducing the Gaming Keyboard. The best solu...
2           2  Introducing the HD Camera. Experience the diff...
3           3  Introducing the HD Camera. We have developed t...
4           4  Introducing the Wireless Earbuds. We have deve...


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/vscode/nltk_data'
    - '/usr/local/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk

print("Attempting to download 'stopwords'...")
# This will download the stopwords corpus if it's not found
nltk.download('stopwords')
print("Stopwords download complete.")

print("\nAttempting to download 'punkt' tokenizer data...")
# This will download the punkt tokenizer data (including punkt_tab)
nltk.download('punkt')
print("Punkt tokenizer data download complete.")

print("\nNLTK resources check/download process finished. Check the output above for success messages (e.g., 'True').")