In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize # Import normalize
from gensim.models import Word2Vec

# Toy corpus shared by every vectorisation example below.
documents = [
    "I love machine learning",
    "Machine learning is fun",
    "I love deep learning",
    "Deep learning is powerful",
]

# Show the corpus as a 1-based numbered list, one document per line.
print("\n===== ORIGINAL DOCUMENTS =====")
print("\n".join(f"{i+1}. {doc}" for i, doc in enumerate(documents)))


# Bag of words: learn the vocabulary from the corpus and count how often
# each vocabulary term occurs in each document.
bow_vectorizer = CountVectorizer()
bow = bow_vectorizer.fit_transform(documents)

# Convert the count matrix to a dense array and label the columns with the
# learned vocabulary terms so the table is readable when printed.
bow_columns = bow_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow.toarray(), columns=bow_columns)

print("\n===== BAG OF WORDS (COUNT) =====")
print(bow_df)


# Normalised bag of words: start from raw term counts, then rescale every
# row (axis=1, i.e. every document) so that its entries sum to 1 (L1 norm).
bow_norm_vectorizer = CountVectorizer()
bow_raw_counts = bow_norm_vectorizer.fit_transform(documents)
bow_norm = normalize(bow_raw_counts, axis=1, norm='l1')

# Dense, column-labelled view of the per-document term proportions.
bow_norm_columns = bow_norm_vectorizer.get_feature_names_out()
bow_norm_df = pd.DataFrame(data=bow_norm.toarray(), columns=bow_norm_columns)

print("\n===== NORMALIZED BAG OF WORDS (L1) =====")
print(bow_norm_df)

# TF-IDF: fit a default TfidfVectorizer on the corpus and transform the same
# documents into their TF-IDF weighted term vectors.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(documents)

# Dense, column-labelled view of the TF-IDF weights for printing.
tfidf_columns = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf.toarray(), columns=tfidf_columns)

print("\n===== TF-IDF =====")
print(tfidf_df)


# Word2Vec expects pre-tokenised sentences: lower-case each document and
# split it on whitespace.
tokenized_docs = [doc.lower().split() for doc in documents]

# Train a small Word2Vec model on the toy corpus.
#   vector_size=50 -> every word is embedded as a 50-dimensional vector
#   window=5       -> context window of up to 5 words on each side
#   min_count=1    -> keep every word, even those that occur only once
#   seed=1         -> gensim's default seed, spelled out so it is visible
#   workers=1      -> single training thread; gensim documents that
#                     multi-threaded training is not reproducible because of
#                     OS thread scheduling, and extra threads bring no
#                     benefit on a four-sentence corpus
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for fully
# reproducible runs across interpreter launches -- confirm whether that is
# needed for the gensim version in use.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,
    window=5,
    min_count=1,
    seed=1,
    workers=1,
)

print("\n===== WORD2VEC EMBEDDING (sample word: 'learning') ====")
print(w2v_model.wv['learning'])


def document_embedding(doc):
    """Return the average Word2Vec vector of the words in ``doc``.

    The document is lower-cased and split on whitespace, matching the
    tokenisation used to train ``w2v_model``. Tokens missing from the
    model's vocabulary are ignored.

    Args:
        doc: The document text as a single string.

    Returns:
        A 1-D NumPy array of length ``w2v_model.wv.vector_size``: the
        element-wise mean of the vectors of all in-vocabulary tokens, or an
        all-zero vector when the document has no in-vocabulary token
        (including the empty string).
    """
    keyed_vectors = w2v_model.wv
    vectors = [
        keyed_vectors[word]
        for word in doc.lower().split()
        if word in keyed_vectors
    ]
    if not vectors:
        # np.mean([]) would return a scalar NaN (with a RuntimeWarning),
        # which cannot be stacked with real embeddings into a 2-D array.
        # Fall back to a zero vector with the same size and dtype instead.
        return np.zeros(
            keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype
        )
    return np.mean(vectors, axis=0)

# Embed every document and collect the resulting vectors into one array,
# one row per document in corpus order.
doc_embeddings = np.array(list(map(document_embedding, documents)))

print("\n===== DOCUMENT EMBEDDING SHAPE =====")
print(doc_embeddings.shape)

# Show the embedding of the first document as a sample.
print("\n===== DOCUMENT EMBEDDING (Doc 1) ====")
print(doc_embeddings[0])


===== ORIGINAL DOCUMENTS =====
1. I love machine learning
2. Machine learning is fun
3. I love deep learning
4. Deep learning is powerful

===== BAG OF WORDS (COUNT) =====
   deep  fun  is  learning  love  machine  powerful
0     0    0   0         1     1        1         0
1     0    1   1         1     0        1         0
2     1    0   0         1     1        0         0
3     1    0   1         1     0        0         1

===== NORMALIZED BAG OF WORDS (L1) =====
       deep   fun    is  learning      love   machine  powerful
0  0.000000  0.00  0.00  0.333333  0.333333  0.333333      0.00
1  0.000000  0.25  0.25  0.250000  0.000000  0.250000      0.00
2  0.333333  0.00  0.00  0.333333  0.333333  0.000000      0.00
3  0.250000  0.00  0.25  0.250000  0.000000  0.000000      0.25

===== TF-IDF =====
       deep       fun        is  learning      love   machine  powerful
0  0.000000  0.000000  0.000000  0.423897  0.640434  0.640434  0.000000
1  0.000000  0.630504  0.497096  0.329023