## Classical or Traditional Approach

### OHE

In [22]:
import nltk 
# nltk.download('punkt') # Download 'punkt' 
# from nltk if it's not downloaded 
from nltk.tokenize import sent_tokenize 
Text = """Geeks For Geeks. 
		Geeks Learning Together Fast.
        NLP OHE."""

# Split the text into sentences, then lower-case each one and strip its
# full stops so that only whitespace-separated words remain.
sentences = [
    raw_sentence.lower().replace(".", "")
    for raw_sentence in sent_tokenize(Text)
]
print('Our Corpus :', sentences)

# Create the vocabulary: every distinct word gets a 1-based index in order of
# first appearance (dict.fromkeys de-duplicates while keeping that order).
corpus = " ".join(sentences)
vocab = {
    word: index
    for index, word in enumerate(dict.fromkeys(corpus.split()), start=1)
}
count = len(vocab)  # number of distinct words seen in the corpus
print("Our vocabulary : ", vocab)

# One Hot Encoding 
def OneHotEncoder(text, vocabulary=None):
    """One-hot encode every known word of ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated words to encode.
    vocabulary : dict, optional
        Mapping of word -> 1-based index. When omitted, the notebook-level
        ``vocab`` dictionary is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    list of list of int
        One vector of length ``len(vocabulary)`` per word of ``text`` that is
        present in the vocabulary, in order of appearance. Words missing from
        the vocabulary are skipped, so the result may be shorter than the
        number of words in ``text``.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocabulary = vocab
    size = len(vocabulary)

    onehot_encoded = []
    for word in text.split():
        index = vocabulary.get(word)
        if index is None:
            # Out-of-vocabulary word: no vector is emitted for it.
            continue
        vector = [0] * size
        vector[index - 1] = 1  # vocabulary indices are 1-based
        onehot_encoded.append(vector)
    return onehot_encoded


# Worked example: show the encoding of the first sentence of the corpus.
first_sentence = sentences[0]
print('\nOneHotEncoded vector for sentence : "',
      first_sentence, '"is \n', OneHotEncoder(first_sentence))

# Creating dataset: one list of one-hot vectors per sentence.
dataset = [OneHotEncoder(sentence) for sentence in sentences]

print("\nDataset : ", dataset)

Our Corpus : ['geeks for geeks', 'geeks learning together fast', 'nlp ohe']
Our vocabulary :  {'geeks': 1, 'for': 2, 'learning': 3, 'together': 4, 'fast': 5, 'nlp': 6, 'ohe': 7}

OneHotEncoded vector for sentence : " geeks for geeks "is 
 [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0]]

Dataset :  [[[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0]], [[0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]]]


### BOW

In [23]:
import nltk 
#nltk.download('punkt') # Download 'punkt' from nltk if it's not downloaded 
from nltk.tokenize import sent_tokenize 
from sklearn.feature_extraction.text import CountVectorizer 
Text = """Geeks For Geeks. 
		Geeks Learning Together Fast.
        NLP OHE."""

# TOKENIZATION: split into sentences, then lower-case each sentence and drop
# its full stops.
sentences = []
for raw_sentence in sent_tokenize(Text):
    sentences.append(raw_sentence.lower().replace(".", ""))
print('\nOur Corpus:', sentences)

# CountVectorizer converts a collection of text documents to a matrix of
# token counts; fit_transform learns the vocabulary and encodes each sentence
# as its bag-of-words row in one step.
count_vect = CountVectorizer()
BOW = count_vect.fit_transform(sentences)

# Get the vocabulary (token -> column index)
print("\nOur vocabulary: ", count_vect.vocabulary_)

# See the BOW representation of every sentence, preceded by one blank line.
print()
for row_index, sentence in enumerate(sentences):
    print(f"BoW representation for {sentence} {BOW[row_index].toarray()}")

# BOW representation for a new text, encoded with the already-fitted
# vocabulary.
BOW_ = count_vect.transform(["learning dsa from AN"])
print("\nBow representation for 'learning dsa from AN':", BOW_.toarray())



Our Corpus: ['geeks for geeks', 'geeks learning together fast', 'nlp ohe']

Our vocabulary:  {'geeks': 2, 'for': 1, 'learning': 3, 'together': 6, 'fast': 0, 'nlp': 4, 'ohe': 5}

BoW representation for geeks for geeks [[0 1 2 0 0 0 0]]
BoW representation for geeks learning together fast [[1 0 1 1 0 0 1]]
BoW representation for nlp ohe [[0 0 0 0 1 1 0]]

Bow representation for 'learning dsa from AN': [[0 0 0 1 0 0 0]]


### Bag of N-Grams

In [27]:
import nltk 
# nltk.download('punkt') # Download 'punkt' 
# from nltk if it's not downloaded 
from nltk.tokenize import sent_tokenize 
from sklearn.feature_extraction.text import CountVectorizer 

Text = """Geeks For Geeks. 
		Geeks Learning Together Fast.
        NLP OHE."""

# TOKENIZATION: sentence-split, lower-case, and remove the full stops.
sentences = list(
    map(lambda raw_sentence: raw_sentence.lower().replace(".", ""),
        sent_tokenize(Text))
)
print('\nOur Corpus:', sentences)

# N-gram vectorization with a count vectorizer covering uni-, bi- and
# trigrams (ngram_range=(1, 3)).
count_vect = CountVectorizer(ngram_range=(1, 3))

# fit_transform learns the n-gram vocabulary and represents each sentence as
# its bag-of-n-grams row.
BOW_nGram = count_vect.fit_transform(sentences)

# Get the vocabulary (n-gram -> column index)
print("\nOur vocabulary:", count_vect.vocabulary_)

# See the bag-of-n-grams representation of each sentence; only the first
# line is preceded by a blank line.
for position, sentence in enumerate(sentences):
    leading_gap = '\n' if position == 0 else ''
    print(leading_gap + 'Ngram representation for "{}" is {}'
          .format(sentence, BOW_nGram[position].toarray()))

# Bag-of-n-grams representation for a new text, using the fitted vocabulary.
BOW_nGram_ = count_vect.transform(["learning dsa from AN"])
print("\nNgram representation for 'learning dsa from AN' is",
      BOW_nGram_.toarray())



Our Corpus: ['geeks for geeks', 'geeks learning together fast', 'nlp ohe']

Our vocabulary: {'geeks': 3, 'for': 1, 'geeks for': 4, 'for geeks': 2, 'geeks for geeks': 5, 'learning': 8, 'together': 14, 'fast': 0, 'geeks learning': 6, 'learning together': 9, 'together fast': 15, 'geeks learning together': 7, 'learning together fast': 10, 'nlp': 11, 'ohe': 13, 'nlp ohe': 12}

Ngram representation for "geeks for geeks" is [[0 1 1 2 1 1 0 0 0 0 0 0 0 0 0 0]]
Ngram representation for "geeks learning together fast" is [[1 0 0 1 0 0 1 1 1 1 1 0 0 0 1 1]]
Ngram representation for "nlp ohe" is [[0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0]]

Ngram representation for 'learning dsa from AN' is [[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]


### TFIDF

In [None]:
import nltk 
# nltk.download('punkt') # Download 'punkt' 
# from nltk if it's not downloaded 
from nltk.tokenize import sent_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer 

Text = """Geeks For Geeks. 
		Geeks Learning Together Fast.
        NLP OHE."""

# TOKENIZATION: one lower-cased, full-stop-free string per sentence.
tokenized = sent_tokenize(Text)
sentences = [piece.lower().replace(".", "") for piece in tokenized]
print('\nOur Corpus:', sentences)

# TF-IDF: fit on the corpus and encode every sentence in one call.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(sentences)

# All words in the vocabulary.
print("\nvocabulary:", tfidf.get_feature_names_out())
# IDF value for all words in the vocabulary
print("\nIDF for all words in the vocabulary :\n", tfidf.idf_)

# TFIDF representation for all documents in our corpus. Each row is sliced
# from the sparse matrix before densifying so it prints as a 1 x N array.
report_line = 'TFIDF representation for "{}" is \n{}'
print()
for doc_id in range(len(sentences)):
    print(report_line.format(sentences[doc_id],
                             tfidf_matrix[doc_id].toarray()))

# TFIDF representation for a new text, using the fitted vocabulary and IDFs.
matrix = tfidf.transform(["learning dsa from AN"])
print("\nTFIDF representation for 'learning dsa from AN' is",
      matrix.toarray())



Our Corpus: ['geeks for geeks', 'geeks learning together fast', 'nlp ohe']

vocabulary ['fast' 'for' 'geeks' 'learning' 'nlp' 'ohe' 'together']

IDF for all words in the vocabulary :
 [1.69314718 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718]

TFIDF representation for "geeks for geeks" is 
[[0.         0.54935123 0.83559154 0.         0.         0.
  0.        ]]
TFIDF representation for "geeks learning together fast" is 
[[0.52863461 0.         0.40204024 0.52863461 0.         0.
  0.52863461]]
TFIDF representation for "nlp ohe" is 
[[0.         0.         0.         0.         0.70710678 0.70710678
  0.        ]]

TFIDF representation for 'learning dsa from AN' is [[0. 0. 0. 1. 0. 0. 0.]]


## Neural Approach (Word embedding)

### Word2vec by Google

In [31]:
pip install gensim -q

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataprep 0.4.5 requires pydantic<2.0,>=1.6, but you have pydantic 2.10.4 which is incompatible.
dataprep 0.4.5 requires sqlalchemy==1.3.24, but you have sqlalchemy 2.0.36 which is incompatible.


Note: you may need to restart the kernel to use updated packages.Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---- ----------------------------------- 2.6/24.0 MB 15.0 MB/s eta 0:00:02
   --------- ------------------------------ 5.8/24.0 MB 14.7 MB/s eta 0:00:02
   -------------- ------------------------- 8.9/24.0 MB 14.6 MB/s eta 0:00:02
   ------------------- -------------------- 11.5/24.0 MB 14.4 MB/s eta 0:00:01
   -------------------- ------------------- 12.6/24.0 MB 14.1 MB/s eta 0:00:01
   ------------------------ --------------- 14.9/24.0 MB 12.0 MB/s eta 0:00:01
   ------------------------------ --------- 18.1/24.0 MB 12.4 MB/s eta 0:00:01
   ---------------------------------- ----- 21.0/

In [33]:
import gensim.downloader as api 

# Load the pre-trained Word2Vec model.
# This line has to run: with it commented out, `model` is undefined on a
# fresh kernel and the loop below fails with NameError. gensim fetches the
# model on first use and reuses its local copy afterwards.
model = api.load('word2vec-google-news-300')

# define word pairs to compute similarity for
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# compute similarity for each pair of words
for first_word, second_word in word_pairs:
    similarity = model.similarity(first_word, second_word)
    print(f"Similarity between '{first_word}' and '{second_word}' using Word2Vec: {similarity:.3f}")

# # Output:
# Similarity between 'learn' and 'learning' using Word2Vec: 0.637
# Similarity between 'india' and 'indian' using Word2Vec: 0.697
# Similarity between 'fame' and 'famous' using Word2Vec: 0.326

### GloVe by Stanford

In [None]:
import torch
# Import GloVe by name instead of `import torchtext.vocab as vocab`: that
# alias rebinds the notebook-level `vocab` dictionary built in the one-hot
# encoding cell, which OneHotEncoder reads as a global.
from torchtext.vocab import GloVe

# Load the pre-trained GloVe vectors.
# This line has to run: with it commented out, `glove` is undefined on a
# fresh kernel and the loop below fails with NameError.
glove = GloVe(name='840B', dim=300)

# define word pairs to compute similarity for
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# compute similarity for each pair of words: dot product of the two vectors
# divided by the product of their norms
for first_word, second_word in word_pairs:
    vec1, vec2 = glove[first_word], glove[second_word]
    similarity = torch.dot(vec1, vec2) / (torch.norm(vec1) * torch.norm(vec2))
    print(f"Similarity between '{first_word}' and '{second_word}' using GloVe: {similarity:.3f}")


# # Output:
# Similarity between 'learn' and 'learning' using GloVe: 0.768
# Similarity between 'india' and 'indian' using GloVe: 0.764
# Similarity between 'fame' and 'famous' using GloVe: 0.507

### fastText by Facebook

In [None]:
import gensim.downloader as api 

# Load the pre-trained fastText model.
# This line has to run: with it commented out, `fasttext_model` is undefined
# on a fresh kernel and the loop below fails with NameError. gensim fetches
# the model on first use and reuses its local copy afterwards.
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

# define word pairs to compute similarity for
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# compute similarity for each pair of words
for first_word, second_word in word_pairs:
    similarity = fasttext_model.similarity(first_word, second_word)
    # Label corrected: these scores come from fastText, not Word2Vec.
    print(f"Similarity between '{first_word}' and '{second_word}' using fastText: {similarity:.3f}")


# # Output:
# Similarity between 'learn' and 'learning' using fastText: 0.642
# Similarity between 'india' and 'indian' using fastText: 0.708
# Similarity between 'fame' and 'famous' using fastText: 0.519