In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus for the first bag-of-words demo: four short sentences that
# share most of their vocabulary, so the count matrix is easy to read by eye.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [3]:
# Bag-of-words vectorizer with library-default settings (no arguments are passed).
vectorizer = CountVectorizer()

In [4]:
# Learn the vocabulary from `corpus` and build the document-term count matrix in one call.
bow_matrix = vectorizer.fit_transform(corpus)


In [5]:
# Vocabulary terms of the fitted vectorizer, as returned by get_feature_names_out().
# NOTE(review): the singular `feature_name` is not read by any cell visible here, and a
# later cell uses the plural `feature_names` — consider settling on one name.
feature_name = vectorizer.get_feature_names_out()

In [6]:
# Materialize the count matrix as a dense array so the whole thing can be printed below.
bow_dense = bow_matrix.toarray()

In [7]:
# Heading and dense count matrix emitted with a single call; sep="\n" puts the
# matrix on the line after the heading, exactly as two separate prints would.
print("\nBag of Words Matrix:", bow_dense, sep="\n")


Bag of Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [8]:
# Show the learned token -> column-index mapping. The dict is not sorted by index
# (see the printed output), so look a token up by key rather than by position.
print("\nWord to Index Mapping:", vectorizer.vocabulary_)



Word to Index Mapping: {'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer


In [10]:
from sklearn.preprocessing import normalize

In [11]:
from gensim.models import Word2Vec
import numpy as np

In [12]:
# Sample corpus: three sentences about AI / ML / NLP.
# NOTE(review): this rebinds `corpus` from the earlier four-sentence example, and the
# following cell starts with this exact same definition — one of the two copies is redundant.
corpus = [
    "Artificial Intelligence is the future of technology.",
    "Machine learning is a branch of Artificial Intelligence.",
    "Natural Language Processing is an application of Machine Learning."
]


In [13]:
# Compare four text representations on one small corpus:
# raw counts (BoW), L1-normalized counts, TF-IDF weights, and Word2Vec embeddings.

# Sample corpus
corpus = [
    "Artificial Intelligence is the future of technology.",
    "Machine learning is a branch of Artificial Intelligence.",
    "Natural Language Processing is an application of Machine Learning.",
]

# Step 1: Bag of Words (BoW) - Count Occurrence
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)
print("\nBag of Words Matrix:")
print(bow_matrix.toarray())

# Step 2: Normalize Count Occurrence
# L1 normalization along axis=1 rescales each document row so its entries sum to 1,
# i.e. raw counts become within-document term frequencies.
normalized_bow = normalize(bow_matrix, norm='l1', axis=1)
print("\nNormalized Bag of Words Matrix:")
print(normalized_bow.toarray())

# Step 3: TF-IDF Transformation
# Re-weights the raw counts from Step 1 (not the normalized matrix from Step 2).
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(bow_matrix)
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Step 4: Word2Vec Embeddings
# Preprocess corpus for Word2Vec (tokenization).
# Reuse the vectorizer's own analyzer instead of `sentence.lower().split()`: a plain
# whitespace split keeps punctuation glued to words ("intelligence." vs "intelligence",
# "technology."), so the same word would get several unrelated embeddings and the
# Word2Vec vocabulary would not line up with `feature_names`.
# Side effect of matching the BoW tokenization: single-character tokens such as "a"
# are dropped here too, because they are not part of the BoW vocabulary either.
analyzer = vectorizer.build_analyzer()
tokenized_corpus = [analyzer(sentence) for sentence in corpus]

# Train Word2Vec model
# min_count=1 keeps every token; the corpus is tiny, so nearly all words occur once or twice.
# seed + workers=1 make the embeddings repeatable between runs; multi-threaded training
# introduces ordering jitter. (Bit-for-bit reproducibility across interpreter launches
# may additionally require setting PYTHONHASHSEED — see the gensim documentation.)
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Get word embeddings
print("\nWord2Vec Embeddings:")
for word in word2vec_model.wv.index_to_key:
    print(f"{word}: {word2vec_model.wv[word]}")

Feature Names: ['an' 'application' 'artificial' 'branch' 'future' 'intelligence' 'is'
 'language' 'learning' 'machine' 'natural' 'of' 'processing' 'technology'
 'the']

Bag of Words Matrix:
[[0 0 1 0 1 1 1 0 0 0 0 1 0 1 1]
 [0 0 1 1 0 1 1 0 1 1 0 1 0 0 0]
 [1 1 0 0 0 0 1 1 1 1 1 1 1 0 0]]

Normalized Bag of Words Matrix:
[[0.         0.         0.14285714 0.         0.14285714 0.14285714
  0.14285714 0.         0.         0.         0.         0.14285714
  0.         0.14285714 0.14285714]
 [0.         0.         0.14285714 0.14285714 0.         0.14285714
  0.14285714 0.         0.14285714 0.14285714 0.         0.14285714
  0.         0.         0.        ]
 [0.11111111 0.11111111 0.         0.         0.         0.
  0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.         0.        ]]

TF-IDF Matrix:
[[0.         0.         0.34517852 0.         0.45386827 0.34517852
  0.26806191 0.         0.         0.         0.         0.26806191
  0.         0.4