In [1]:
# Required libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np

In [2]:
# Sample data: a toy corpus of four short sentences, one document per entry.
# Every vectorizer / embedding cell below is fitted on this same list.
corpus = [
    "I love machine learning and data science",  # document 0
    "Data science is fun and exciting",          # document 1
    "Machine learning is a key skill in AI",     # document 2
    "I enjoy learning new things",               # document 3
]

In [3]:
# Bag-of-Words: raw term counts, one row per document and one column per
# vocabulary term learned from `corpus`.
count_occurrence = CountVectorizer()
X_count = count_occurrence.fit_transform(corpus)

# Densify the sparse count matrix so it can be shown with labelled columns.
df_count = pd.DataFrame(
    data=X_count.toarray(),
    columns=count_occurrence.get_feature_names_out(),
)
print("Bag of Words - Count Occurrence:\n", df_count)

Bag of Words - Count Occurrence:
    ai  and  data  enjoy  exciting  fun  in  is  key  learning  love  machine  \
0   0    1     1      0         0    0   0   0    0         1     1        1   
1   0    1     1      0         1    1   0   1    0         0     0        0   
2   1    0     0      0         0    0   1   1    1         1     0        1   
3   0    0     0      1         0    0   0   0    0         1     0        0   

   new  science  skill  things  
0    0        1      0       0  
1    0        1      0       0  
2    0        0      1       0  
3    1        0      0       1  


In [4]:
# Normalized Count Occurrence: rescale every row of the count matrix with the
# L1 norm, so each document's term weights sum to 1.
X_count_norm = normalize(X_count, norm='l1')

# Reuse the vocabulary of the fitted CountVectorizer for the column labels.
df_count_norm = pd.DataFrame(
    data=X_count_norm.toarray(),
    columns=count_occurrence.get_feature_names_out(),
)
print("\nBag of Words - Normalized Count Occurrence:\n", df_count_norm)


Bag of Words - Normalized Count Occurrence:
          ai       and      data  enjoy  exciting       fun        in  \
0  0.000000  0.166667  0.166667   0.00  0.000000  0.000000  0.000000   
1  0.000000  0.166667  0.166667   0.00  0.166667  0.166667  0.000000   
2  0.142857  0.000000  0.000000   0.00  0.000000  0.000000  0.142857   
3  0.000000  0.000000  0.000000   0.25  0.000000  0.000000  0.000000   

         is       key  learning      love   machine   new   science     skill  \
0  0.000000  0.000000  0.166667  0.166667  0.166667  0.00  0.166667  0.000000   
1  0.166667  0.000000  0.000000  0.000000  0.000000  0.00  0.166667  0.000000   
2  0.142857  0.142857  0.142857  0.000000  0.142857  0.00  0.000000  0.142857   
3  0.000000  0.000000  0.250000  0.000000  0.000000  0.25  0.000000  0.000000   

   things  
0    0.00  
1    0.00  
2    0.00  
3    0.25  


In [5]:
# TF-IDF: fit a TfidfVectorizer (default settings) on the same corpus and
# tabulate the resulting document-term weights.
tfidf_occurrence = TfidfVectorizer()
X_tfidf = tfidf_occurrence.fit_transform(corpus)

df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_occurrence.get_feature_names_out(),
)
print("\nTF-IDF:\n", df_tfidf)


TF-IDF:
          ai       and      data     enjoy  exciting      fun        in  \
0  0.000000  0.399546  0.399546  0.000000   0.00000  0.00000  0.000000   
1  0.000000  0.372225  0.372225  0.000000   0.47212  0.47212  0.000000   
2  0.420681  0.000000  0.000000  0.000000   0.00000  0.00000  0.420681   
3  0.000000  0.000000  0.000000  0.541736   0.00000  0.00000  0.000000   

         is       key  learning      love   machine       new   science  \
0  0.000000  0.000000  0.323467  0.506774  0.399546  0.000000  0.399546   
1  0.372225  0.000000  0.000000  0.000000  0.000000  0.000000  0.372225   
2  0.331670  0.420681  0.268515  0.000000  0.331670  0.000000  0.000000   
3  0.000000  0.000000  0.345783  0.000000  0.000000  0.541736  0.000000   

      skill    things  
0  0.000000  0.000000  
1  0.000000  0.000000  
2  0.420681  0.000000  
3  0.000000  0.541736  


In [6]:
# Word2Vec Embeddings
# Tokenize by lower-casing and splitting on whitespace; the corpus contains no
# punctuation, so this simple scheme yields clean word tokens.
tokenized_corpus = [doc.lower().split() for doc in corpus]

# Train on the tokenized sentences. min_count=1 keeps every word of this tiny
# corpus in the vocabulary; sg=1 selects the skip-gram training algorithm
# (gensim Word2Vec parameter); workers=1 keeps training single-threaded.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,
    window=2,
    min_count=1,
    workers=1,
    sg=1,
)

In [7]:
# Show the learned embedding for a single vocabulary word.
print(
    "\nWord2Vec Embeddings for word 'learning':\n",
    w2v_model.wv['learning'],
)


Word2Vec Embeddings for word 'learning':
 [-1.07199990e-03  4.72599320e-04  1.02075171e-02  1.80195551e-02
 -1.86056513e-02 -1.42333433e-02  1.29171927e-02  1.79462414e-02
 -1.00304112e-02 -7.52587756e-03  1.47622460e-02 -3.06692114e-03
 -9.07294545e-03  1.31098200e-02 -9.72019415e-03 -3.63100530e-03
  5.75345429e-03  1.98484841e-03 -1.65706240e-02 -1.88982189e-02
  1.46249318e-02  1.01408092e-02  1.35151129e-02  1.52485142e-03
  1.27026178e-02 -6.81040483e-03 -1.89279346e-03  1.15381200e-02
 -1.50436601e-02 -7.87345599e-03 -1.50244525e-02 -1.85924501e-03
  1.90767795e-02 -1.46377487e-02 -4.66758432e-03 -3.87661578e-03
  1.61548574e-02 -1.18620954e-02  9.02669344e-05 -9.50748939e-03
 -1.92084312e-02  1.00153638e-02 -1.75192691e-02 -8.78333300e-03
 -7.02316029e-05 -5.92096709e-04 -1.53229153e-02  1.92290172e-02
  9.96463653e-03  1.84666868e-02]


In [8]:
# Average Word2Vec vector for each sentence
def average_vector(sentence, model):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Parameters
    ----------
    sentence : str
        Raw text. It is lower-cased and split on whitespace, mirroring how
        ``tokenized_corpus`` was built for training.
    model : object
        Trained model exposing ``wv`` (supporting ``word in wv`` and
        ``wv[word]``) and an integer ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all known words, or all zeros when the sentence is
        empty or contains no known word.
    """
    tokens = sentence.lower().split()
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]

    if not known_vectors:
        # gensim stores word vectors as float32. A float64 zero vector here
        # would silently upcast the whole stacked sentence matrix as soon as
        # one sentence has no known word, so keep the fallback in float32.
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(known_vectors, axis=0)

# Stack one averaged vector per sentence into a 2-D array
# (rows = sentences of `corpus`, columns = embedding dimensions).
sentence_vectors = np.array(
    [average_vector(text, w2v_model) for text in corpus]
)
print(
    "\nAverage Word2Vec Embedding for each sentence:\n",
    pd.DataFrame(sentence_vectors),
)


Average Word2Vec Embedding for each sentence:
          0         1         2         3         4         5         6   \
0 -0.004315  0.000160 -0.004318  0.002177  0.002283 -0.000606  0.004791   
1 -0.002559  0.000874 -0.005111 -0.007262 -0.000969  0.000705  0.004257   
2 -0.001829 -0.003584  0.006683  0.006456 -0.006254 -0.003310  0.004823   
3 -0.003987  0.007239  0.002202  0.001972 -0.000679 -0.006440  0.005232   

         7         8         9   ...        40        41        42        43  \
0  0.004340 -0.008858  0.004825  ... -0.002088 -0.000790 -0.003911 -0.000193   
1  0.005389 -0.003318 -0.003677  ... -0.000372  0.001598  0.004124 -0.003931   
2  0.001387 -0.005990 -0.008739  ...  0.006626  0.000103 -0.006082 -0.002109   
3 -0.000755 -0.004322 -0.000373  ... -0.005865  0.001220 -0.000360 -0.004869   

         44        45        46        47        48        49  
0  0.006258  0.005233  0.001550 -0.000956  0.001853  0.001929  
1  0.008566  0.006844  0.005785  0.002550 -0.00