<a href="https://colab.research.google.com/github/R786P/data-science-roadmap-2025_2026/blob/main/15_nlp_basics/01_text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NOTE(review): this cell clones and launches the Fooocus repository; it looks
# unrelated to the text-preprocessing code in the next cell -- confirm whether
# it belongs in this notebook.
# Install a pinned pygit2 release.
!pip install pygit2==1.15.1
# Switch to /content before cloning.
%cd /content
# Clone the Fooocus repository into /content/Fooocus.
!git clone https://github.com/lllyasviel/Fooocus.git
%cd /content/Fooocus
# Run the repository's entry script with the --share and --always-high-vram flags.
!python entry_with_update.py --share --always-high-vram

In [24]:

# 15_nlp_basics/01_text_preprocessing.ipynb
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Download NLTK data (only once)
# Every resource is probed on its own: checking only 'punkt' would skip the
# download of 'stopwords' / 'wordnet' whenever punkt happens to be installed
# already, and the later stopwords lookup would then raise LookupError.
# NOTE(review): 'punkt_tab' is included because newer NLTK releases load the
# word tokenizer from it -- confirm against the NLTK version in use.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
}
for _package, _resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Resource not found locally: fetch just this package.
        nltk.download(_package, quiet=True)

# Five short customer reviews used as the demo corpus
reviews = [
    "I love this product! It's amazing and works perfectly.",
    "Terrible service. I hate it and will never buy again.",
    "Good quality but delivery was very slow.",
    "Excellent value for money. Highly recommended!",
    "Not bad, but could be better.",
]
# Single-column frame: one row per review, column name 'review'
df = pd.DataFrame(reviews, columns=['review'])

# Show the untouched input, numbered from 1
print("üìù Original Reviews:")
for number, raw_review in enumerate(df['review'], start=1):
    print(f"{number}. {raw_review}")

# Text Preprocessing
# Translation table that deletes every character in string.punctuation.
# It does not depend on the input, so it is built once instead of per call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the NLTK objects shared by all calls.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before storing them so that a failed stop-word
        # lookup leaves the cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _PREPROCESS_RESOURCES['stop_words'] = stop_words
        _PREPROCESS_RESOURCES['stemmer'] = stemmer
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete punctuation, tokenize with
    ``word_tokenize``, drop English stop words, stem with ``PorterStemmer``.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    # The stop-word list and the stemmer are reused across calls rather than
    # rebuilt for every row.
    stop_words, stemmer = _get_preprocess_resources()
    # Lowercase first: tokens are compared as-is against the stop-word set.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(cleaned)
    return ' '.join(stemmer.stem(w) for w in tokens if w not in stop_words)

# Clean every review and keep the result next to the original text
processed_reviews = df['review'].apply(preprocess_text)
df['processed'] = processed_reviews

# Show the cleaned text, numbered from 1
print("\nüîß Processed Reviews:")
for number, cleaned_review in enumerate(df['processed'], start=1):
    print(f"{number}. {cleaned_review}")

# TF-IDF Vectorization
# Learn the vocabulary from the processed reviews and build the
# document-term matrix in one step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed'])

print(f"\nüìä TF-IDF Matrix Shape: {tfidf_matrix.shape}")
# NOTE(review): this slice is the first five entries of the feature list
# (the recorded output shows them in alphabetical order), not the five
# highest-weighted terms -- confirm whether the "Top 5" label is intended.
feature_names = vectorizer.get_feature_names_out()
print("Top 5 features:", feature_names[:5])

üìù Original Reviews:
1. I love this product! It's amazing and works perfectly.
2. Terrible service. I hate it and will never buy again.
3. Good quality but delivery was very slow.
4. Excellent value for money. Highly recommended!
5. Not bad, but could be better.

üîß Processed Reviews:
1. love product amaz work perfectli
2. terribl servic hate never buy
3. good qualiti deliveri slow
4. excel valu money highli recommend
5. bad could better

üìä TF-IDF Matrix Shape: (5, 22)
Top 5 features: ['amaz' 'bad' 'better' 'buy' 'could']
