In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK resources used by the cells below.
# Each call fetches one package into the local nltk_data directory.
# 'punkt'     - tokenizer data (unpacked under tokenizers/)
# 'stopwords' - stopword corpus read via nltk.corpus.stopwords
# 'wordnet'   - WordNet corpus, downloaded for the WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# 'punkt_tab' was added because tokenization raised a LookupError without it
# (presumably nltk.word_tokenize on a newer NLTK release - confirm).
nltk.download('punkt_tab')  # resolves LookupError


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Toy corpus: five short sentences stored under a single 'Text' column
data = dict(
    Text=[
        "Cats are playing in the garden.",
        "Dogs bark loudly at strangers.",
        "Birds are flying in the sky.",
        "Cats and dogs are friendly pets.",
        "The garden has many beautiful flowers.",
    ],
)

# Wrap the corpus in a DataFrame (one row per sentence) and show it
df = pd.DataFrame(data=data)
print(df)


                                     Text
0         Cats are playing in the garden.
1          Dogs bark loudly at strangers.
2            Birds are flying in the sky.
3        Cats and dogs are friendly pets.
4  The garden has many beautiful flowers.


In [3]:
# Shared text-normalisation helpers used by the preprocessing step
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Stored as a set so token filtering uses hash-based membership checks
stop_words = {*stopwords.words('english')}

# Define a text preprocessing function
def preprocess(text):
    """Normalise a sentence into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text, replace every character that is not
    ``a-z`` or whitespace with a space (this drops punctuation, digits and
    non-ASCII letters), tokenize with ``nltk.word_tokenize``, discard tokens
    present in ``stop_words``, and lemmatize the remainder with ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filtering.
    """
    text = text.lower()
    # Substitute a space instead of deleting the character: plain deletion
    # glues together words that are separated only by punctuation
    # ("cats,dogs" -> "catsdogs", "well-known" -> "wellknown") and turns
    # contractions into tokens ("dont") that slip past the stopword filter.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    # lemmatize() is called without a part-of-speech argument, so the
    # lemmatizer's default part of speech applies to every token.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Run every raw sentence through preprocess() and keep the result in a new column
df['Clean_Text'] = [preprocess(sentence) for sentence in df['Text']]
print(df)


                                     Text                    Clean_Text
0         Cats are playing in the garden.            cat playing garden
1          Dogs bark loudly at strangers.      dog bark loudly stranger
2            Birds are flying in the sky.               bird flying sky
3        Cats and dogs are friendly pets.          cat dog friendly pet
4  The garden has many beautiful flowers.  garden many beautiful flower


In [4]:
# Bag of Words: a CountVectorizer with default settings learns the vocabulary
# from the cleaned sentences and encodes them in a single pass
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(df['Clean_Text'])

# Dense, labelled copy of the matrix (one column per vocabulary term) for display
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)
print("Bag of Words Feature Matrix:", bow_df, sep="\n")


Bag of Words Feature Matrix:
   bark  beautiful  bird  cat  dog  flower  flying  friendly  garden  loudly  \
0     0          0     0    1    0       0       0         0       1       0   
1     1          0     0    0    1       0       0         0       0       1   
2     0          0     1    0    0       0       1         0       0       0   
3     0          0     0    1    1       0       0         1       0       0   
4     0          1     0    0    0       1       0         0       1       0   

   many  pet  playing  sky  stranger  
0     0    0        1    0         0  
1     0    0        0    0         1  
2     0    0        0    1         0  
3     0    1        0    0         0  
4     1    0        0    0         0  


In [5]:
# TF-IDF: a TfidfVectorizer with default settings learns the vocabulary
# from the cleaned sentences and encodes them in a single pass
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Clean_Text'])

# Dense, labelled copy of the matrix (one column per vocabulary term) for display
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("TF-IDF Feature Matrix:", tfidf_df, sep="\n")


TF-IDF Feature Matrix:
       bark  beautiful     bird       cat       dog    flower   flying  \
0  0.000000   0.000000  0.00000  0.531772  0.000000  0.000000  0.00000   
1  0.523358   0.000000  0.00000  0.000000  0.422242  0.000000  0.00000   
2  0.000000   0.000000  0.57735  0.000000  0.000000  0.000000  0.57735   
3  0.000000   0.000000  0.00000  0.444002  0.444002  0.000000  0.00000   
4  0.000000   0.523358  0.00000  0.000000  0.000000  0.523358  0.00000   

   friendly    garden    loudly      many       pet   playing      sky  \
0  0.000000  0.531772  0.000000  0.000000  0.000000  0.659118  0.00000   
1  0.000000  0.000000  0.523358  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.57735   
3  0.550329  0.000000  0.000000  0.000000  0.550329  0.000000  0.00000   
4  0.000000  0.422242  0.000000  0.523358  0.000000  0.000000  0.00000   

   stranger  
0  0.000000  
1  0.523358  
2  0.000000  
3  0.000000  
4  0.000000  


In [6]:
import pandas as pd

# Side-by-side comparison of the two representations built above:
# raw term counts (BoW) versus TF-IDF weights, followed by the
# highest-weighted terms of each document.

# Display the BoW matrix
print("\nBag of Words Matrix:")
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())
print(bow_df)

# Display the TF-IDF matrix
print("\nTF-IDF Matrix:")
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(tfidf_df)

# Compare word presence vs importance
print("\nTop words based on TF-IDF importance in each document:")
for i, row in tfidf_df.iterrows():
    # Rank only the terms with a positive weight. Without the filter, a
    # document with fewer than three terms would be padded with zero-weight
    # vocabulary words that do not occur in it.
    top_words = row[row > 0].sort_values(ascending=False).head(3)
    # Documents are reported 1-based while the DataFrame index is 0-based
    print(f"Doc {i+1}:")
    for word, score in top_words.items():
        print(f"   {word}: {score:.4f}")



Bag of Words Matrix:
   bark  beautiful  bird  cat  dog  flower  flying  friendly  garden  loudly  \
0     0          0     0    1    0       0       0         0       1       0   
1     1          0     0    0    1       0       0         0       0       1   
2     0          0     1    0    0       0       1         0       0       0   
3     0          0     0    1    1       0       0         1       0       0   
4     0          1     0    0    0       1       0         0       1       0   

   many  pet  playing  sky  stranger  
0     0    0        1    0         0  
1     0    0        0    0         1  
2     0    0        0    1         0  
3     0    1        0    0         0  
4     1    0        0    0         0  

TF-IDF Matrix:
       bark  beautiful     bird       cat       dog    flower   flying  \
0  0.000000   0.000000  0.00000  0.531772  0.000000  0.000000  0.00000   
1  0.523358   0.000000  0.00000  0.000000  0.422242  0.000000  0.00000   
2  0.000000   0.000000  0