Why represent text as vectors?

Answer: Machine learning, deep learning, and statistical learning models operate only on numbers, so text has to be converted into a numeric form before it can be used as input.

How can text be represented as vectors?

1. One Hot Encoding
2. Bag of Words Model
3. TF-IDF
4. Word2Vec
5. FastText
6. GloVe
7. BERT

References

https://www.analyticsvidhya.com/blog/2023/07/step-by-step-guide-to-word2vec-with-gensim/

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

https://radimrehurek.com/gensim/models/word2vec.html

https://analyticsindiamag.com/word2vec-vs-glove-a-comparative-guide-to-word-embedding-techniques/

https://medium.com/intelligentmachines/word-embedding-and-one-hot-encoding-ad17b4bbe111



In [1]:
!pip install --upgrade gensim -q

In [2]:
# Toy corpus: four short documents shared by the vectorisation examples in this notebook.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

1. One Hot Encoding

In [15]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Each whole sentence of the corpus is treated as one categorical value here,
# so the encoded matrix has one row per sentence and one column per distinct sentence.
values = array(corpus)
print(values)

# integer encode: map every distinct sentence to an integer label
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# binary encode: OneHotEncoder expects a 2-D column of shape (n_samples, 1).
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed, so no
# keyword is passed and the returned sparse matrix is densified explicitly;
# this works regardless of the installed scikit-learn version.
onehot_encoder = OneHotEncoder()
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

# # invert first example
# inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
# print(inverted)

['This is the first document.' 'This document is the second document.'
 'And this is the third one.' 'Is this the first document?']
[3 2 0 1]
[[0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]




2. Bag Of Words Model

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus, then count how often each
# vocabulary term occurs in every document.
vectorizer = CountVectorizer()
X = vectorizer.fit(corpus).transform(corpus)

# One row per document, one column per vocabulary term.
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [2]:
# Vocabulary terms learned by the fitted vectorizer.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

3. TF-IDF Model

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same four documents as in the previous sections.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Learn vocabulary and IDF weights, then produce the TF-IDF matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit(corpus).transform(corpus)

# One row per document, one column per vocabulary term.
print(X.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [6]:
# Vocabulary terms learned by the fitted TF-IDF vectorizer.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

4. Word2Vec

In [3]:
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy
import string
np.random.seed(42)

In [34]:
def sent_vec(sent, model):
    """Return the average word vector of the tokens in ``sent``.

    Parameters
    ----------
    sent : iterable
        Tokens of one document.
    model : object
        Must expose ``model.wv`` supporting ``vector_size``, membership tests
        (``token in model.wv``) and lookup (``model.wv[token]``).

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens found in ``model.wv``. Tokens that
        are not found are ignored; if none is found the zero vector is returned.
    """
    keyed_vectors = model.wv
    total = np.zeros(keyed_vectors.vector_size)
    hits = 0
    for token in sent:
        if token not in keyed_vectors:
            continue
        total += keyed_vectors[token]
        hits += 1
    # Avoid dividing by zero when no token was known to the model.
    return total / hits if hits else total

In [5]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenise ``sentence`` into lower-cased lemmas without punctuation.

    The sentence is run through the module-level ``nlp`` pipeline; each token's
    ``lemma_`` is lower-cased and stripped of surrounding whitespace.

    A token is dropped when every one of its characters occurs in the
    module-level ``punctuations`` collection. Checking per character (instead of
    testing whether the whole token is a substring of ``punctuations``) also
    removes multi-character punctuation such as ``...`` or ``--``. Tokens that
    are empty after stripping are dropped as well.

    Parameters
    ----------
    sentence : str
        Raw text of one document.

    Returns
    -------
    list of str
        The remaining lemmas, in document order.
    """
    doc = nlp(sentence)

    lemmas = (token.lemma_.lower().strip() for token in doc)

    # all() over an empty string is True, so empty lemmas are filtered out too.
    # Stop-word filtering is intentionally not applied here.
    return [
        lemma for lemma in lemmas
        if not all(char in punctuations for char in lemma)
    ]

In [6]:
# Characters that spacy_tokenizer treats as punctuation.
punctuations = string.punctuation

# Load spaCy's small English pipeline and keep its default stop-word list available.
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words

In [24]:
# One row per document: the raw text plus its list of lemmatised tokens.
df = pd.DataFrame({'text': corpus})
df['tokens'] = df['text'].map(spacy_tokenizer)
df['tokens']

In [17]:
# w2v_model = Word2Vec(min_count=1,
#                      window=2,
#                      vector_size=300,
#                      sample=6e-5,
#                      alpha=0.03,
#                      min_alpha=0.0007,
#                      negative=20,
#                      workers=4)

In [18]:
# w2v_model.build_vocab(df['tokens']) #, progress_per=10000

In [20]:
# w2v_model.train(df['tokens'], total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(15, 660)

In [39]:
import gensim.models

# Train Word2Vec on the token lists. min_count=1 keeps every word of this tiny
# corpus, and compute_loss=True records the training loss so that it can be
# read back with get_latest_training_loss().
model = Word2Vec(
    sentences=df['tokens'],
    compute_loss=True,
    min_count=1,
)

# Round-trip the trained model through disk.
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

In [40]:
# Look up the learned vector for the token 'document'.
model.wv['document']

array([-5.3622725e-04,  2.3643136e-04,  5.1033497e-03,  9.0092728e-03,
       -9.3029495e-03, -7.1168090e-03,  6.4588725e-03,  8.9729885e-03,
       -5.0154282e-03, -3.7633716e-03,  7.3805046e-03, -1.5334714e-03,
       -4.5366134e-03,  6.5540518e-03, -4.8601604e-03, -1.8160177e-03,
        2.8765798e-03,  9.9187379e-04, -8.2852151e-03, -9.4488179e-03,
        7.3117660e-03,  5.0702621e-03,  6.7576934e-03,  7.6286553e-04,
        6.3508903e-03, -3.4053659e-03, -9.4640139e-04,  5.7685734e-03,
       -7.5216377e-03, -3.9361035e-03, -7.5115822e-03, -9.3004224e-04,
        9.5381187e-03, -7.3191668e-03, -2.3337686e-03, -1.9377411e-03,
        8.0774371e-03, -5.9308959e-03,  4.5162440e-05, -4.7537340e-03,
       -9.6035507e-03,  5.0072931e-03, -8.7595852e-03, -4.3918253e-03,
       -3.5099984e-05, -2.9618145e-04, -7.6612402e-03,  9.6147433e-03,
        4.9820580e-03,  9.2331432e-03, -8.1579173e-03,  4.4957981e-03,
       -4.1370760e-03,  8.2453608e-04,  8.4986202e-03, -4.4621765e-03,
      

In [35]:
# One averaged vector per document: every token list is passed to the
# two-argument sent_vec(sent, model) together with the Word2Vec model.
df['vec'] = df['tokens'].apply(sent_vec, args = (model,))

In [37]:
# Length of the first document's averaged vector.
len(df['vec'][0])

100

In [41]:
# Training loss; the model was constructed with compute_loss=True.
model.get_latest_training_loss()

28.920080184936523

5. FastText

In [44]:
from gensim.models import FastText


# Hyper-parameters for the FastText model.
embedding_size = 300
window_size = 5
#min_word = 5
#down_sampling = 1e-2

# min_count=1 keeps every token of the small corpus; a single epoch is run.
model = FastText(
    sentences=list(df['tokens']),
    vector_size=embedding_size,
    window=window_size,
    min_count=1,
    epochs=1,
)

In [None]:
# from gensim.test.utils import get_tmpfile
# model.save(get_tmpfile("fasttext.model"))
# model = FastText.load(get_tmpfile("fasttext.model"))

In [46]:
# Round-trip the FastText model through disk; `model` is rebound to the reloaded instance.
model.save("fasttext.model")
model = FastText.load("fasttext.model")

In [48]:
# Length of the FastText vector for the token 'document'.
len(model.wv['document'])

300

In [49]:
# One averaged vector per document, computed with sent_vec and the FastText model bound to `model`.
df['ftvec'] = df['tokens'].apply(sent_vec, args = (model,))

6. GloVe

In [53]:
!pip install swifter -qqq
!pip install glove-python3 -qqq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/327.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/327.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/327.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m317.4/327.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.0/327.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for glove-python3 (setup.py) ... [?25l[?25hdone


In [54]:
from glove import Corpus, Glove

# Keep the co-occurrence corpus under its own name. Binding it to `corpus`
# would replace the list of raw documents that the DataFrame cell is built
# from, so re-running that cell afterwards would no longer work.
glove_corpus = Corpus()

# Fit the co-occurrence matrix that GloVe is trained on.
glove_corpus.fit(list(df['tokens']), window=10)

# Train the embeddings, attach the word -> row-index dictionary, and persist the model.
glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(glove_corpus.matrix, epochs=5, no_threads=4, verbose=True)
glove.add_dictionary(glove_corpus.dictionary)
glove.save('glove.model')

Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


In [55]:
def sent_vec(sent):
    """Return the average GloVe vector of the tokens in ``sent``.

    Reads the module-level ``glove`` object: ``glove.dictionary`` maps a token
    to a row index of ``glove.word_vectors`` and ``glove.no_components`` gives
    the vector length. Tokens missing from the dictionary are ignored; if none
    is found the zero vector is returned.

    NOTE(review): this definition shares its name with the two-argument
    ``sent_vec(sent, model)`` defined earlier in the notebook and replaces it
    once this cell has run.
    """
    vocab = glove.dictionary
    mean_vec = np.zeros(glove.no_components)

    # Row indices of the tokens the model knows, in document order.
    rows = [vocab[token] for token in sent if token in vocab]
    for row in rows:
        mean_vec += glove.word_vectors[row]

    if rows:
        mean_vec = mean_vec / len(rows)
    return mean_vec

In [56]:
# Reload the saved GloVe model; `load` is called through the existing `glove`
# instance and the result is bound to `model`.
# NOTE(review): the following cells keep reading from `glove`, not from `model` — confirm this is intended.
model = glove.load('glove.model')

In [58]:
# Sanity check: length of the GloVe vector for a word that occurs in the corpus.

len(glove.word_vectors[glove.dictionary['document']])

300

In [60]:
# One averaged vector per document, using the single-argument sent_vec that reads the global `glove`.
df['glovevec'] = df['tokens'].apply(sent_vec)

In [61]:
# Show the text, its tokens, and the three document-vector columns side by side.
df.head()

Unnamed: 0,text,tokens,vec,ftvec,glovevec
0,This is the first document.,"[this, be, the, first, document]","[-0.004888693866087123, 0.0035039977374253795,...","[4.430147091625258e-05, -0.0003144395130220801...","[-0.0006408149211444031, -0.000458841898599850..."
1,This document is the second document.,"[this, document, be, the, second, document]","[-0.004569856445111024, 0.004243081547125864, ...","[0.00010233153564816651, -0.000416408865324532...","[-0.0002199440420241763, -0.000394839864618725..."
2,And this is the third one.,"[and, this, be, the, third, one]","[-0.0015325381294436131, 0.0015453843322272103...","[0.00018911650962157486, -0.000287572678644210...","[-0.0001180830776196306, 6.219242241239634e-05..."
3,Is this the first document?,"[be, this, the, first, document]","[-0.004888693866087123, 0.0035039977374253795,...","[4.430147091625258e-05, -0.0003144395130220801...","[-0.0006408149211444031, -0.000458841898599850..."
