In [None]:
# Step 1: Import necessary libraries
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the NLTK resources used below (already-present packages are reported
# as up-to-date by NLTK rather than downloaded again).
for resource in (
    'punkt',      # sentence tokenizer
    'stopwords',  # stop-word lists
    'wordnet',    # data for the WordNet lemmatizer
    'omw-1.4',
    'punkt_tab',  # Punkt tokenizer models
):
    nltk.download(resource)

# Step 2: Sample corpus
# Five short sentences; each string is treated as one document by the
# preprocessing and feature-extraction steps that follow.
documents = [
    "Natural Language Processing is fun!",
    "I love studying NLP. It's exciting and challenging.",
    "NLP includes tasks like tokenization, stemming, and lemmatization.",
    "Feature extraction is important in machine learning.",
    "Text preprocessing improves model accuracy.",
]

# Step 3: Text Preprocessing Function
# Input-independent resources, created once instead of on every call.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_STEMMER = PorterStemmer()
_STOP_WORDS = None  # filled in lazily by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess(text):
    """Normalize one document into a space-separated string of stemmed tokens.

    Steps: lowercase, replace punctuation with spaces, tokenize, drop English
    stop words, then Porter-stem each remaining token.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        nothing survives filtering).
    """
    # a. Convert to lowercase
    text = text.lower()

    # b. Replace punctuation and special characters with a space.
    #    Deleting them outright would fuse neighbouring words
    #    ("state-of-the-art" -> "stateoftheart") and turn contractions into
    #    non-words ("don't" -> "dont") that escape the stop-word filter.
    #    With a space, the pieces ("don", "t") are filtered as stop words.
    text = _PUNCTUATION_RE.sub(' ', text)

    # c. Tokenize
    tokens = nltk.word_tokenize(text)

    # d. Remove stop words
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # e. Apply stemming. WordNetLemmatizer().lemmatize(word) could be used
    #    here instead if dictionary forms are preferred over stems.
    tokens = [_STEMMER.stem(word) for word in tokens]

    return " ".join(tokens)

# Run every sample document through the preprocessing pipeline
processed_docs = list(map(preprocess, documents))

# Print each cleaned document with a 1-based label
print("Processed Documents:")
for number, doc in enumerate(processed_docs, start=1):
    print(f"Doc {number}: {doc}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Processed Documents:
Doc 1: natur languag process fun
Doc 2: love studi nlp excit challeng
Doc 3: nlp includ task like token stem lemmat
Doc 4: featur extract import machin learn
Doc 5: text preprocess improv model accuraci


In [None]:
def _as_dataframe(vectorizer, features):
    """Return the feature matrix as a dense DataFrame with one column per vocabulary term."""
    return pd.DataFrame(
        features.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )


# a. Bag-of-Words: fit on the preprocessed documents and tabulate the result
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(processed_docs)
bow_df = _as_dataframe(bow_vectorizer, bow_features)

print("\nBag-of-Words Representation:")
display(bow_df)

# b. TF-IDF: same documents, TF-IDF weights instead of raw counts
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(processed_docs)
tfidf_df = _as_dataframe(tfidf_vectorizer, tfidf_features)

print("\nTF-IDF Representation:")
display(tfidf_df)



Bag-of-Words Representation:


Unnamed: 0,accuraci,challeng,excit,extract,featur,fun,import,improv,includ,languag,...,model,natur,nlp,preprocess,process,stem,studi,task,text,token
0,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,1,0,1
3,0,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0



TF-IDF Representation:


Unnamed: 0,accuraci,challeng,excit,extract,featur,fun,import,improv,includ,languag,...,model,natur,nlp,preprocess,process,stem,studi,task,text,token
0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,...,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
1,0.0,0.463693,0.463693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.374105,0.0,0.0,0.0,0.463693,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.387757,0.0,...,0.0,0.0,0.31284,0.0,0.0,0.387757,0.0,0.387757,0.0,0.387757
3,0.0,0.0,0.0,0.447214,0.447214,0.0,0.447214,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,...,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0


In [None]:
# Short textual comparison of the two representations, one line per statement
print(
    "Comparison:",
    "BoW counts how many times a word appears in a document.",
    "TF-IDF gives more weight to rare and important words across the corpus.",
    sep="\n",
)

# Closing note on how to read the TF-IDF table (text only; no weights are
# computed or displayed here)
print("\nWords with high TF-IDF values indicate importance in specific documents.")


Comparison:
BoW counts how many times a word appears in a document.
TF-IDF gives more weight to rare and important words across the corpus.

Words with high TF-IDF values indicate importance in specific documents.
