In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Fetch the tokenizer data packages used by this notebook. Both 'punkt' and
# 'punkt_tab' are requested; word_tokenize (imported above) is called later
# when preparing the Word2Vec corpus. nltk.download is a no-op message when a
# package is already up to date, so re-running this cell is safe.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Toy corpus: four short English sentences. Every representation built in the
# cells below (Bag-of-Words, TF-IDF, Word2Vec) is fitted on this same list.
documents = [
    "I love programming in Python.",
    "Python is a great language for machine learning.",
    "Machine learning and deep learning are exciting fields.",
    "Deep learning is a subset of machine learning.",
]

In [None]:
### 1. Bag-of-Words (BoW)
# Fit the vectorizer on the corpus and transform it in one step; the result
# holds per-document term counts.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(documents)

# Convert the count matrix to a dense array and label each column with the
# vocabulary term it corresponds to, giving one row per document.
bow_df = pd.DataFrame(
    data=X_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Heading and table are emitted on separate lines.
print("Bag-of-Words Representation:", bow_df, sep="\n")

Bag-of-Words Representation:
   and  are  deep  exciting  fields  for  great  in  is  language  learning  \
0    0    0     0         0       0    0      0   1   0         0         0   
1    0    0     0         0       0    1      1   0   1         1         1   
2    1    1     1         1       1    0      0   0   0         0         2   
3    0    0     1         0       0    0      0   0   1         0         2   

   love  machine  of  programming  python  subset  
0     1        0   0            1       1       0  
1     0        1   0            0       1       0  
2     0        1   0            0       0       0  
3     0        1   1            0       0       1  


In [None]:
### 2. TF-IDF
# Fit a TF-IDF vectorizer on the same corpus; each document becomes a row of
# term weights instead of raw counts.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(documents)

# Dense, column-labelled view of the weight matrix (one column per term).
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)

# The heading starts with a blank line to separate it from earlier output.
print("\nTF-IDF Representation:", tfidf_df, sep="\n")


TF-IDF Representation:
        and       are      deep  exciting    fields       for     great  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.444642  0.444642   
2  0.387532  0.387532  0.305534  0.387532  0.387532  0.000000  0.000000   
3  0.000000  0.000000  0.343104  0.000000  0.000000  0.000000  0.000000   

         in        is  language  learning      love   machine        of  \
0  0.525473  0.000000  0.000000  0.000000  0.525473  0.000000  0.000000   
1  0.000000  0.350561  0.444642  0.283809  0.000000  0.283809  0.000000   
2  0.000000  0.000000  0.000000  0.494713  0.000000  0.247356  0.000000   
3  0.000000  0.343104  0.000000  0.555545  0.000000  0.277773  0.435184   

   programming    python    subset  
0     0.525473  0.414289  0.000000  
1     0.000000  0.350561  0.000000  
2     0.000000  0.000000  0.000000  
3     0.000000  0.000000  0.435184  


In [None]:
### 3. Word2Vec
# Build the training corpus: lowercase each document first (so lookups such as
# "python" match regardless of the original casing), then split it into a
# list of tokens. The result is a list of token lists, one per document.
tokenized_docs = list(map(word_tokenize, (doc.lower() for doc in documents)))

In [None]:
# Train Word2Vec model
# Reproducibility: gensim documents that a deterministic run needs a fixed
# seed AND a single worker thread, because multi-threaded training introduces
# ordering jitter from OS thread scheduling. The corpus here is only four
# sentences, so extra workers bring no speed-up anyway. (gensim also notes
# that PYTHONHASHSEED may need to be set for reproducibility across
# interpreter launches.)
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size used during training
    min_count=1,      # keep every token; the corpus is too small to prune
    workers=1,        # single thread -> deterministic training order
    seed=1,           # gensim's default seed value, made explicit
)

In [None]:
# Example: look up the learned embedding for a single word.
word = "python"

# Bind the model's keyed vectors once; membership testing and indexing both
# go through this object.
keyed_vectors = word2vec_model.wv

if word not in keyed_vectors:
    # Word was never seen during training, so there is no vector to show.
    print(f"\nWord '{word}' not in vocabulary.")
else:
    print(f"\nWord2Vec representation for '{word}':")
    print(keyed_vectors[word])



Word2Vec representation for 'python':
[-8.7294905e-03  2.1309664e-03 -8.7221497e-04 -9.3176709e-03
 -9.4262864e-03 -1.4121713e-03  4.4327485e-03  3.7056087e-03
 -6.4993422e-03 -6.8745553e-03 -4.9994662e-03 -2.2889308e-03
 -7.2515626e-03 -9.6015455e-03 -2.7428993e-03 -8.3611486e-03
 -6.0372930e-03 -5.6691379e-03 -2.3450621e-03 -1.7070804e-03
 -8.9564817e-03 -7.3632010e-04  8.1544742e-03  7.6879794e-03
 -7.2045443e-03 -3.6662007e-03  3.1174424e-03 -9.5695946e-03
  1.4760050e-03  6.5260385e-03  5.7487735e-03 -8.7641701e-03
 -4.5173578e-03 -8.1414981e-03  4.6832771e-05  9.2643471e-03
  5.9750592e-03  5.0687077e-03  5.0633666e-03 -3.2406410e-03
  9.5541235e-03 -7.3580840e-03 -7.2726533e-03 -2.2655146e-03
 -7.7928969e-04 -3.2141213e-03 -5.9111224e-04  7.4884510e-03
 -6.9720595e-04 -1.6244808e-03  2.7462984e-03 -8.3614178e-03
  7.8558167e-03  8.5369283e-03 -9.5843021e-03  2.4482766e-03
  9.9071162e-03 -7.6642046e-03 -6.9670668e-03 -7.7347141e-03
  8.3938064e-03 -6.8061694e-04  9.1432426e-03 