### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Data Set: Use a dataset of text product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Examine the effectiveness of stopword removal by comparing word
frequencies before and after removal.

In [3]:
# write your code from here
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords

# Step 1: Build a small in-memory sample of product descriptions
product_ids = [101, 102, 103]
descriptions = [
    "This is a high-quality leather wallet with multiple compartments.",
    "Elegant stainless steel wristwatch suitable for formal occasions.",
    "Comfortable cotton t-shirt available in various sizes and colors.",
]

# Column order follows keyword order: ProductID first, then Description
data = dict(ProductID=product_ids, Description=descriptions)

df = pd.DataFrame(data)

# Load the English stopword list. The corpus is downloaded only when it is
# missing locally (NLTK signals a missing resource with LookupError), so
# repeated runs do not contact the download server again.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Tokenize and count word frequencies function
def get_word_freq(text):
    """Count how often each word occurs in *text*, ignoring case.

    The text is lower-cased and split on whitespace. Leading and trailing
    punctuation is stripped from every token so that "wallet" and "wallet."
    are counted as the same word; punctuation inside a token (for example
    the hyphen in "t-shirt") is kept. Tokens made up solely of punctuation
    are dropped.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.
    """
    tokens = (raw.strip(string.punctuation) for raw in text.lower().split())
    # Skip tokens that were nothing but punctuation (now empty strings).
    return Counter(token for token in tokens if token)

# Step 2: Remove stopwords
def remove_stopwords(text, stopword_set=None):
    """Return *text* lower-cased with stopwords removed.

    The text is lower-cased and split on whitespace. A token is dropped when
    it matches a stopword after its leading and trailing punctuation has been
    stripped, so a stopword followed by a comma or full stop ("it.", "is,")
    is removed as well. Tokens that are kept retain their punctuation.

    Args:
        text: The string to clean.
        stopword_set: Optional collection of lower-case stopwords to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    kept = [
        token
        for token in text.lower().split()
        # Compare without edge punctuation; keep the token itself unchanged.
        if token.strip(string.punctuation) not in stopword_set
    ]
    return ' '.join(kept)

# Clean every description first; the original column is left untouched
df['CleanedDescription'] = [remove_stopwords(text) for text in df['Description']]

# Word frequency before stopword removal (all raw descriptions as one string)
all_text_before = ' '.join(df['Description'].tolist())
freq_before = get_word_freq(all_text_before)

# Word frequency after stopword removal (all cleaned descriptions as one string)
all_text_after = ' '.join(df['CleanedDescription'].tolist())
freq_after = get_word_freq(all_text_after)

# Step 3: Compare top 10 words before and after
print(
    "Top 10 words BEFORE stopword removal:",
    freq_before.most_common(10),
    sep="\n",
)
print(
    "\nTop 10 words AFTER stopword removal:",
    freq_after.most_common(10),
    sep="\n",
)

# Optional: Plot word frequency comparison
def plot_word_freq(freq_before, freq_after, top_n=10):
    """Draw side-by-side bar charts of the most frequent words.

    The left chart shows the ``top_n`` most common words of *freq_before*,
    the right chart those of *freq_after*. An empty counter produces an
    empty chart instead of raising an error.

    Args:
        freq_before: ``Counter`` of word frequencies before stopword removal.
        freq_after: ``Counter`` of word frequencies after stopword removal.
        top_n: Number of most common words shown in each chart.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    panels = (
        (freq_before, f'Top {top_n} Words Before Stopword Removal', 'skyblue'),
        (freq_after, f'Top {top_n} Words After Stopword Removal', 'salmon'),
    )

    _, axes = plt.subplots(1, 2, figsize=(14, 5))

    for ax, (freq, title, color) in zip(axes, panels):
        top = freq.most_common(top_n)
        # Build the two sequences separately; unlike zip(*top) this also
        # works when the counter is empty.
        words = [word for word, _ in top]
        counts = [count for _, count in top]

        ax.bar(words, counts, color=color)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Show the before/after frequency charts for the sample descriptions
plot_word_freq(freq_before=freq_before, freq_after=freq_after)



ModuleNotFoundError: No module named 'nltk'