**Apply the bag-of-words approach (raw count occurrence and normalized count occurrence) and TF-IDF
to the data. Then create embeddings using Word2Vec.**


In [None]:
#1. Bag-of-Words (BoW)

#In this method, we convert each document into a "bag" of words (ignoring grammar, word order, and even capitalization),
#and we count how many times each word appears in the document.

In [None]:
#2. TF-IDF (Term Frequency - Inverse Document Frequency)

# TF-IDF is a measure that helps identify the importance of a word in a document relative to the entire collection of documents.
# Words that appear often in one document but rarely in others are considered important.

In [None]:
#3. Word2Vec

#Word2Vec is a technique that converts each word into a vector (a list of numbers), so that words with
#similar meanings end up with similar vectors. This is done by training a machine learning model on your data.

In [None]:
# Step 1: Install necessary libraries
!pip install scikit-learn gensim  # Install required libraries in Google Colab



In [None]:
# Import necessary libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [None]:
# Toy corpus: five short sentences, one string per document.
# Every vectorizer / model below is fitted on exactly this list.
documents = [
    "I love programming in Python",
    "Python is a great programming language",
    "I love coding",
    "I prefer Python over other programming languages",
    "Python programming is fun",
]


In [None]:
# 1. **Bag-of-Words (BoW) Approach**:
# CountVectorizer learns the vocabulary of the corpus and counts how often
# each vocabulary word occurs in each document.
vectorizer_bow = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one call
X_bow = vectorizer_bow.fit_transform(raw_documents=documents)


In [None]:
# Print the Bag-of-Words matrix (each row represents a document, each column a word)
print("Bag-of-Words Matrix:")
bow_counts = X_bow.toarray()  # .toarray() converts the sparse matrix to a dense array
print(bow_counts)

# Normalized count occurrence: divide every count by the total number of counted
# words in its document, so each row holds relative frequencies instead of raw counts.
doc_lengths = bow_counts.sum(axis=1, keepdims=True)
# np.maximum(..., 1) avoids a division by zero for a document with no counted words;
# such a row simply stays all zeros.
X_bow_normalized = bow_counts / np.maximum(doc_lengths, 1)
print("\nNormalized Bag-of-Words Matrix:")
print(np.round(X_bow_normalized, 3))


Bag-of-Words Matrix:
[[0 0 0 1 0 0 0 1 0 0 0 1 1]
 [0 0 1 0 1 1 0 0 0 0 0 1 1]
 [1 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 1 1 1 1]
 [0 1 0 0 1 0 0 0 0 0 0 1 1]]


In [None]:
# 2. **TF-IDF Approach**:
# TfidfVectorizer weights each word by how important it is to a document
# relative to the whole corpus.
vectorizer_tfidf = TfidfVectorizer()
# Learn vocabulary and IDF weights, then build the document-term TF-IDF matrix
X_tfidf = vectorizer_tfidf.fit_transform(raw_documents=documents)

In [None]:
# Show the TF-IDF weights: one row per document, one column per vocabulary word.
# The sparse matrix is densified with .toarray() so that every entry is printed.
print("\nTF-IDF Matrix:", X_tfidf.toarray(), sep="\n")


TF-IDF Matrix:
[[0.         0.         0.         0.6614376  0.         0.
  0.         0.53364369 0.         0.         0.         0.3726424
  0.3726424 ]
 [0.         0.         0.55167715 0.         0.44508965 0.55167715
  0.         0.         0.         0.         0.         0.31080528
  0.31080528]
 [0.77828292 0.         0.         0.         0.         0.
  0.         0.62791376 0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.46449871 0.         0.46449871 0.46449871 0.46449871 0.26169047
  0.26169047]
 [0.         0.6614376  0.         0.         0.53364369 0.
  0.         0.         0.         0.         0.         0.3726424
  0.3726424 ]]


In [None]:
# 3. **Word2Vec Approach**:
# Word2Vec learns a numerical vector for every word, so it needs each document
# as a list of word tokens rather than as one string.
# Lowercase every document, then split it on whitespace.
tokenized_docs = list(map(str.split, map(str.lower, documents)))

In [None]:
# Create and train the Word2Vec model using the tokenized documents.
#   vector_size=10 -> each word is embedded as a 10-dimensional vector
#   window=3       -> maximum distance between the current and the predicted word
#   min_count=1    -> keep every word, even those that occur only once (tiny corpus)
#   seed=1         -> gensim's default seed, spelled out so the run is explicitly seeded
#   workers=1      -> single training thread; per the gensim docs, multi-threaded training
#                     adds ordering jitter from OS thread scheduling, so a seed alone
#                     does not make results repeatable
# NOTE: the gensim docs additionally mention setting PYTHONHASHSEED for reproducibility
# between interpreter launches.
model_w2v = Word2Vec(
    sentences=tokenized_docs,
    vector_size=10,
    window=3,
    min_count=1,
    seed=1,
    workers=1,
)

In [None]:
# Show the learned embedding of a few example words from the corpus
print("\nWord2Vec Embeddings:")
example_words = ("python", "programming", "love")
for word in example_words:
    # model_w2v.wv[word] looks up the trained vector of that word
    print(f"Embedding for '{word}': {model_w2v.wv[word]}")



Word2Vec Embeddings:
Embedding for 'python': [-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]
Embedding for 'programming': [ 0.07380123 -0.01533396 -0.04536377  0.06553721 -0.04859911 -0.01815925
  0.02876429  0.00991821 -0.08284786 -0.09448329]
Embedding for 'love': [-0.0960355   0.05007293 -0.08759586 -0.04391825 -0.000351   -0.00296181
 -0.0766124   0.09614743  0.04982058  0.09233143]
