In [1]:
# Install with the %pip magic so the package lands in the environment of the
# running kernel; a bare `!pip` shells out to whichever pip is first on PATH.
# The lower bound is needed because later cells call get_feature_names_out(),
# which is only available from scikit-learn 1.0 onwards.
%pip install -q "scikit-learn>=1.0"




In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd


In [3]:
# Toy corpus: three short documents, one per row of the matrices built below.
sentences = [
    "Natural Language Processing is amazing",         # document 0
    "I love learning NLP and machine learning",       # document 1
    "NLP helps computers understand human language",  # document 2
]


In [4]:
# Bag-of-Words: learn the vocabulary from the corpus and count, per sentence,
# how often each vocabulary term occurs. The result is a sparse matrix with
# one row per sentence and one column per term.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(sentences)

# Densify the sparse counts and label the columns with the learned terms so
# the table is readable. As the printed columns show, terms are lower-cased
# and the one-letter token "I" does not make it into the vocabulary.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)

# Heading, blank line, then the table.
print("Bag-of-Words Matrix:\n", bow_df, sep="\n")


Bag-of-Words Matrix:

   amazing  and  computers  helps  human  is  language  learning  love  \
0        1    0          0      0      0   1         1         0     0   
1        0    1          0      0      0   0         0         2     1   
2        0    0          1      1      1   0         1         0     0   

   machine  natural  nlp  processing  understand  
0        0        1    0           1           0  
1        1        0    1           0           0  
2        0        0    1           0           1  


In [5]:
# TF-IDF: same vocabulary as the Bag-of-Words step, but each count is
# re-weighted so that terms shared by several sentences (e.g. "nlp",
# "language") score lower than terms unique to a single sentence.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Dense, column-labelled view of the weights; one row per sentence.
# In the printed table each row has unit Euclidean length.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Blank line, heading, blank line, then the table.
print("\nTF-IDF Matrix:\n", tfidf_df, sep="\n")



TF-IDF Matrix:

    amazing       and  computers     helps     human        is  language  \
0  0.467351  0.000000   0.000000  0.000000  0.000000  0.467351  0.355432   
1  0.000000  0.363255   0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.000000   0.440362  0.440362  0.440362  0.000000  0.334907   

   learning      love   machine   natural       nlp  processing  understand  
0  0.000000  0.000000  0.000000  0.467351  0.000000    0.467351    0.000000  
1  0.726509  0.363255  0.363255  0.000000  0.276265    0.000000    0.000000  
2  0.000000  0.000000  0.000000  0.000000  0.334907    0.000000    0.440362  


In [6]:
# Side-by-side look at a single vocabulary term under both weightings.
word = 'nlp'

# The DataFrame column labels are the vocabulary terms, so a membership test
# on .columns tells us whether the vectorizer knows the word. Unknown words
# leave the score variable as None.
bow_scores = None
if word in bow_df.columns:
    bow_scores = bow_df[word]

tfidf_scores = None
if word in tfidf_df.columns:
    tfidf_scores = tfidf_df[word]

# Per-sentence values as plain lists, or a placeholder for unknown words.
bow_report = "Not in vocab" if bow_scores is None else bow_scores.tolist()
tfidf_report = "Not in vocab" if tfidf_scores is None else tfidf_scores.tolist()

print(f"\nWord '{word}' Importance:")
print("BoW counts:", bow_report)
print("TF-IDF scores:", tfidf_report)



Word 'nlp' Importance:
BoW counts: [0, 1, 1]
TF-IDF scores: [0.0, 0.2762645695949752, 0.3349067026613031]
