**Bag-of-words**

In [2]:
!pip install gensim



In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

from sklearn.preprocessing import normalize

# Toy corpus: three short sentences, one document per string
corpus = [
    'the cat sat on the mat',
    'the dog sat on the log',
    'cats and dogs are great',
]

# Step 1 - raw term counts: one row per document, one column per vocabulary term
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

# Step 2 - term frequency: L1-normalising along axis=1 divides every count by
# the sum of the counts in its own row (i.e. per document)
X_normalized = normalize(X_counts, norm='l1', axis=1)

# Wrap both sparse matrices in labelled DataFrames so they print as tables
df_counts = pd.DataFrame(X_counts.toarray(), columns=feature_names)
df_normalized = pd.DataFrame(X_normalized.toarray(), columns=feature_names)

print("Raw Count Occurrence:")
print(df_counts)

print("\nNormalized Count (TF):")
print(df_normalized)

Raw Count Occurrence:
   and  are  cat  cats  dog  dogs  great  log  mat  on  sat  the
0    0    0    1     0    0     0      0    0    1   1    1    2
1    0    0    0     0    1     0      0    1    0   1    1    2
2    1    1    0     1    0     1      1    0    0   0    0    0

Normalized Count (TF):
   and  are       cat  cats       dog  dogs  great       log       mat  \
0  0.0  0.0  0.166667   0.0  0.000000   0.0    0.0  0.000000  0.166667   
1  0.0  0.0  0.000000   0.0  0.166667   0.0    0.0  0.166667  0.000000   
2  0.2  0.2  0.000000   0.2  0.000000   0.2    0.2  0.000000  0.000000   

         on       sat       the  
0  0.166667  0.166667  0.333333  
1  0.166667  0.166667  0.333333  
2  0.000000  0.000000  0.000000  


**TF-IDF (Term Frequency-Inverse Document Frequency)**

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from `corpus`, then transform the same
# documents into a TF-IDF weighted document-term matrix
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Label the columns with the learned vocabulary terms
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

# The bare last expression renders the DataFrame as a rich table
print("TF-IDF Matrix:")
df_tfidf

TF-IDF Matrix:


Unnamed: 0,and,are,cat,cats,dog,dogs,great,log,mat,on,sat,the
0,0.0,0.0,0.427554,0.0,0.0,0.0,0.0,0.0,0.427554,0.325166,0.325166,0.650331
1,0.0,0.0,0.0,0.0,0.427554,0.0,0.0,0.427554,0.0,0.325166,0.325166,0.650331
2,0.447214,0.447214,0.0,0.447214,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0


**Word2Vec Embeddings**

In [6]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download both tokenizer resources: which one word_tokenize needs depends on
# the installed NLTK version, so fetching both keeps the cell portable.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample corpus (repeated here so this cell does not depend on earlier cells)
corpus = [
    'the cat sat on the mat',
    'the dog sat on the log',
    'cats and dogs are great'
]

# Preprocessing: Word2Vec needs a list of tokenized sentences
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

# Train the Word2Vec model reproducibly.
# Training is stochastic, so the seed is fixed explicitly. gensim's docs state
# that a deterministic run also requires a single worker thread (workers=1) to
# remove ordering jitter from OS thread scheduling; they additionally mention
# PYTHONHASHSEED for reproducibility across interpreter launches.
W2V_SEED = 1
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window passed to the model
    min_count=1,      # keep every word; the corpus is tiny
    workers=1,
    seed=W2V_SEED,
)

# Test the output
word = "cat"
print(f"Vector for '{word}':\n", model.wv[word][:10])  # Showing first 10 dimensions

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Vector for 'cat':
 [ 0.00977029  0.00816511  0.00128097  0.00509758  0.00140813 -0.00645516
 -0.00142805  0.00644917 -0.00461731 -0.00399307]
