# **Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

In [None]:
!pip install nltk pandas



In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on. Each call returns a
# status flag; the value of the last call is what the cell displays.
# Tokenizer data (unzipped under tokenizers/); presumably what word_tokenize loads.
nltk.download('punkt_tab')
# Stopword lists (unzipped under corpora/); read later via stopwords.words("english").
nltk.download('stopwords')
# WordNet data; presumably the dictionary behind WordNetLemmatizer.
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Read the complete product CSV (decoded as Latin-1), discard rows containing
# any missing value, and keep only the first 3000 remaining rows for speed.
df = (
    pd.read_csv("/content/flipkart_product.csv", encoding='latin1')
    .dropna()
    .head(3000)
)
# Preview the first few rows.
df.head()

Unnamed: 0,ProductName,Price,Rate,Review,Summary
0,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Super!,Great cooler.. excellent air flow and for this...
1,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Awesome,Best budget 2 fit cooler. Nice cooling
2,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,The quality is good but the power of air is de...
3,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",1,Useless product,Very bad product it's a only a fan
4,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,Ok ok product


In [None]:
# Reload just the rating and review-text columns. The Python parsing engine is
# used with malformed lines skipped, then rows with missing values are dropped
# and the first 3000 remaining rows are kept for speed.
df = (
    pd.read_csv(
        "/content/flipkart_product.csv",
        usecols=['Rate', 'Review'],
        engine='python',
        on_bad_lines='skip',
        encoding='latin1',
    )
    .dropna()
    .head(3000)
)
# Preview the first few rows.
df.head()

Unnamed: 0,Rate,Review
0,5,Super!
1,5,Awesome
2,3,Fair
3,1,Useless product
4,3,Fair


# Data Preprocessing

In [None]:
# Module-level NLP helpers, built once and shared by every preprocess() call.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Held as a set so the per-token stopword membership test is a hash lookup.
stop_words = {*stopwords.words("english")}

def preprocess(text):
    """Run one review through every preprocessing stage.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    tuple of four lists of str, in this order:
        1. all tokens of the lower-cased text,
        2. the tokens that are purely alphabetic and not in ``stop_words``,
        3. the Porter stems of list 2,
        4. the WordNet lemmas of list 2 (lemmatizer's default settings).

    Stemming and lemmatization are both applied to the filtered tokens
    (list 2), not chained one after the other, so their outputs can be
    compared side by side.
    """
    # Stage 1: tokenize the lower-cased text.
    raw_tokens = word_tokenize(text.lower())

    # Stage 2: drop punctuation/number tokens and stopwords.
    content_words = []
    for token in raw_tokens:
        if token.isalpha() and token not in stop_words:
            content_words.append(token)

    # Stage 3: stem each remaining word.
    stems = list(map(stemmer.stem, content_words))

    # Stage 4: lemmatize each remaining word.
    lemmas = list(map(lemmatizer.lemmatize, content_words))

    return raw_tokens, content_words, stems, lemmas


In [None]:
# Run preprocess() exactly once per review and fan its four stage outputs out
# into separate columns. Calling preprocess() separately for each column would
# tokenize, stem and lemmatize every review four times over.
# (nltk is imported and 'punkt_tab' is downloaded in the setup cell at the top
# of the notebook, so neither is repeated here.)
stage_columns = ["tokens", "no_stopwords", "stemmed", "lemmatized"]
processed = df["Review"].apply(preprocess)  # Series of 4-tuples of token lists
for position, column in enumerate(stage_columns):
    # .str[i] extracts element i of each tuple; positions follow the order
    # preprocess() returns: (tokens, filtered, stemmed, lemmatized).
    df[column] = processed.str[position]

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Rate,Review,tokens,no_stopwords,stemmed,lemmatized
0,5,Super!,"[super, !]",[super],[super],[super]
1,5,Awesome,[awesome],[awesome],[awesom],[awesome]
2,3,Fair,[fair],[fair],[fair],[fair]
3,1,Useless product,"[useless, product]","[useless, product]","[useless, product]","[useless, product]"
4,3,Fair,[fair],[fair],[fair],[fair]


In [None]:
def get_vocab_size(list_of_docs):
    """Count the distinct tokens across a collection of tokenized documents.

    Parameters
    ----------
    list_of_docs : iterable of iterables
        Each element is one document given as an iterable of tokens.

    Returns
    -------
    int
        Number of unique tokens over all documents (0 if there are none).
    """
    return len({token for doc in list_of_docs for token in doc})

# Ensure the preprocessing columns are present before calculating vocabulary
# sizes, so this cell still works if the earlier column-building cell was not
# run or its state was lost. preprocess() is run once per review and its four
# outputs are split into columns, rather than re-running it once per column.
stage_columns = ["tokens", "no_stopwords", "stemmed", "lemmatized"]
processed = df["Review"].apply(preprocess)  # Series of 4-tuples of token lists
for position, column in enumerate(stage_columns):
    # .str[i] extracts element i of each tuple; positions follow the order
    # preprocess() returns: (tokens, filtered, stemmed, lemmatized).
    df[column] = processed.str[position]

# Vocabulary size (number of distinct tokens) after each preprocessing stage.
results = {
    "Original Tokens": get_vocab_size(df["tokens"]),
    "After Stopword Removal": get_vocab_size(df["no_stopwords"]),
    "After Stemming": get_vocab_size(df["stemmed"]),
    "After Lemmatization": get_vocab_size(df["lemmatized"])
}

# One-row summary table: one column per stage.
pd.DataFrame(results, index=["Vocabulary Size"])

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,149,111,108,110
