Why represent text as vectors?

Ans: Machine learning, deep learning, and statistical learning models operate only on numbers, so text must first be converted into numeric vectors.

How can text be represented as vectors?

1. One Hot Encoding
2. Bag of Words Model
3. TF-IDF
4. Word2Vec
5. FastText
6. GloVe
7. BERT

References

https://www.analyticsvidhya.com/blog/2023/07/step-by-step-guide-to-word2vec-with-gensim/

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

https://radimrehurek.com/gensim/models/word2vec.html

https://analyticsindiamag.com/word2vec-vs-glove-a-comparative-guide-to-word-embedding-techniques/

https://medium.com/intelligentmachines/word-embedding-and-one-hot-encoding-ad17b4bbe111



# Install Libraries

In [None]:
# Third-party packages used by the cells below (-qqq keeps pip quiet).
# NOTE(review): swifter is installed but not imported anywhere in the visible notebook.
# NOTE(review): spacy.load("en_core_web_sm") further down needs that model package;
# if it is not preinstalled, run `python -m spacy download en_core_web_sm`.
!pip install --upgrade gensim -qqq
!pip install scikit-learn -qqq
!pip install swifter -qqq
!pip install glove-python3 -qqq
!pip install spacy -qqq

# Prepare Data

In [None]:
import pandas as pd

# Load the training split; on_bad_lines='skip' drops malformed CSV rows
# instead of raising.
train = pd.read_csv("train_sample.csv", on_bad_lines='skip')
# `test` is read by the later tokenising, vectorising and embedding cells, so
# it must be loaded here; with this line commented out those cells raise
# NameError. NOTE(review): assumes test.csv sits next to train_sample.csv.
test = pd.read_csv("test.csv", on_bad_lines='skip')

In [None]:
#from sklearn.utils import shuffle
#train = shuffle(train)

# print(train.shape)
# train = train.sample(frac = 0.01)
# print(train.shape)

# print(test.shape)
# test = test.sample(frac = 0.01)
# print(test.shape)

In [None]:
import spacy
import string

# Load spaCy's small English pipeline; the tokenizer functions below call it
# to tokenise and lemmatise each comment.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection taken from the pipeline's language defaults.
stop_words = nlp.Defaults.stop_words
# NOTE: string.punctuation is one string of ASCII punctuation characters, so
# `token in punctuations` is a substring test (an empty token also matches).
punctuations = string.punctuation

In [None]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Lemmatise *sentence* with spaCy and return the kept tokens as a list.

    Every token's lemma is lower-cased and stripped of surrounding
    whitespace. A lemma is dropped when it is in ``stop_words`` or when it
    occurs inside the ``punctuations`` string (substring test, so empty
    lemmas are dropped as well).
    """
    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))
    return [
        lemma
        for lemma in lemmas
        if not (lemma in stop_words or lemma in punctuations)
    ]


In [None]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Lemmatise *sentence* with spaCy and return the kept tokens as one string.

    Every token's lemma is lower-cased and stripped; lemmas that are in
    ``stop_words`` or occur inside the ``punctuations`` string are dropped.
    The survivors are joined with single spaces.

    NOTE: this has the same name as the list-returning tokenizer defined
    earlier in the notebook, so running this cell replaces that definition.
    """
    kept = []
    for token in nlp(sentence):
        lemma = token.lemma_.lower().strip()
        if lemma in stop_words or lemma in punctuations:
            continue
        kept.append(lemma)
    return " ".join(kept)


In [None]:
def _as_token_list(tokens):
    """Return *tokens* as a list of strings, splitting a space-joined string."""
    return tokens.split() if isinstance(tokens, str) else list(tokens)


# spacy_tokenizer is defined twice above, and the later definition returns one
# space-joined string. The cells below (`.str.join(' ')`, Word2Vec, FastText,
# GloVe) need a *list* of tokens per row -- a plain string would be processed
# character by character -- so normalise the result whichever definition is live.
train['tokens'] = train['comment_text'].apply(spacy_tokenizer).apply(_as_token_list)
test['tokens'] = test['comment_text'].apply(spacy_tokenizer).apply(_as_token_list)

In [None]:
# Join each row's tokens back into one space-separated string; the 'sent'
# column is what the scikit-learn vectorizers below are fitted on.
# NOTE: this expects each 'tokens' entry to be a list of strings.
train['sent'] = train['tokens'].str.join(' ')
test['sent'] = test['tokens'].str.join(' ')

# Vectorize Data

2. Bag of Words Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training text only, then apply the same
# vocabulary to the test text so both matrices share their columns.
vectorizer = CountVectorizer()
X_train_vectors= vectorizer.fit_transform(train['sent'])
X_test_vectors= vectorizer.transform(test['sent'])

# Matrix shape for each split.
print(X_train_vectors.shape)
print(X_test_vectors.shape)

In [None]:
# Number of features (vocabulary terms) learned by the CountVectorizer.
len(vectorizer.get_feature_names_out())

3. TF-IDF Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same fit-on-train / transform-on-test flow as the Bag of Words cell, but with
# TF-IDF weighting. NOTE: this rebinds `vectorizer` and the X_* matrices.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(train['sent'])
X_test_vectors = vectorizer.transform(test['sent'])

# Matrix shape for each split.
print(X_train_vectors.shape)
print(X_test_vectors.shape)

In [None]:
# Number of features (vocabulary terms) learned by the TfidfVectorizer.
len(vectorizer.get_feature_names_out())

4. Word2Vec

In [None]:
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random generator.
# NOTE(review): the gensim models below accept their own `seed` argument --
# confirm whether this call alone makes their training reproducible.
np.random.seed(42)

In [None]:
def wv_sent_vec(sent, model):
    """Return the mean embedding of the tokens in *sent*.

    Args:
        sent: Iterable of tokens to look up.
        model: Object exposing keyed vectors as ``model.wv`` (supports
            ``in``, indexing by token, and a ``vector_size`` attribute).

    Returns:
        A NumPy array of length ``model.wv.vector_size`` holding the average
        of the vectors of all tokens found in ``model.wv``. Tokens that are
        not found are ignored; if none are found the result is all zeros.
    """
    keyed_vectors = model.wv
    total = np.zeros(keyed_vectors.vector_size)
    hits = 0
    for token in sent:
        if token not in keyed_vectors:
            continue
        total += keyed_vectors[token]
        hits += 1
    # Avoid dividing by zero when no token was in the vocabulary.
    return total / hits if hits else total

In [None]:
# w2v_model = Word2Vec(min_count=1,
#                      window=2,
#                      vector_size=300,
#                      sample=6e-5,
#                      alpha=0.03,
#                      min_alpha=0.0007,
#                      negative=20,
#                      workers=4)

In [None]:
# w2v_model.build_vocab(df['tokens']) #, progress_per=10000

In [None]:
# w2v_model.train(df['tokens'], total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
import gensim.models

# Train Word2Vec on the tokenised training rows. min_count=5 ignores words
# seen fewer than five times; compute_loss=True records the training loss so
# it can be printed below.
# NOTE: each entry of train['tokens'] must be a list of tokens.
model = gensim.models.Word2Vec(sentences = train['tokens'], min_count = 5, compute_loss = True)
print(model.get_latest_training_loss())

# Save the model to disk and reload it from the saved file.
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

In [None]:
# One averaged Word2Vec vector per row (see wv_sent_vec); `model` is passed
# through as the function's second argument.
train['wv_vec'] = train['tokens'].apply(wv_sent_vec, args = (model,))
test['wv_vec'] = test['tokens'].apply(wv_sent_vec, args = (model,))

In [None]:
# Length of the first row's Word2Vec sentence vector.
len(train['wv_vec'].values[0])

5. FastText

In [None]:
from gensim.models import FastText


# FastText hyper-parameters: vector length and context window.
embedding_size = 300
window_size = 5
#min_word = 5
#down_sampling = 1e-2
# Train on the tokenised training rows for a single epoch; min_count=1 keeps
# every word. NOTE: this rebinds `model`, replacing the Word2Vec model.
model = FastText(vector_size=embedding_size, window=window_size, min_count=1, sentences=train['tokens'], epochs=1)

In [None]:
# from gensim.test.utils import get_tmpfile
# model.save(get_tmpfile("fasttext.model"))
# model = FastText.load(get_tmpfile("fasttext.model"))

In [None]:
# Save the FastText model to disk and reload it from the saved file.
model.save("fasttext.model")
model = FastText.load("fasttext.model")

In [None]:
# One averaged FastText vector per row; wv_sent_vec is reused because it only
# relies on the model's `.wv` keyed vectors.
train['ft_vec'] = train['tokens'].apply(wv_sent_vec, args = (model,))
test['ft_vec'] = test['tokens'].apply(wv_sent_vec, args = (model,))

In [None]:
# Length of the first row's FastText sentence vector.
len(train['ft_vec'].values[0])

6. GloVe

In [None]:
from glove import Corpus, Glove

# Create the corpus object that accumulates word co-occurrence counts.
corpus = Corpus()

# Build the co-occurrence matrix GloVe is trained on, using a window of 10.
# NOTE: each entry of train['tokens'] must be a list of tokens.
corpus.fit(list(train['tokens']), window=10)

# Fit 300-dimensional GloVe vectors on the co-occurrence matrix.
glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=5, no_threads=4, verbose=True)
# Attach the word -> index mapping so vectors can be looked up by word.
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

In [None]:
def gl_sent_vec(sent):
    """Return the mean GloVe embedding of the tokens in *sent*.

    Reads the module-level ``glove`` model. Tokens missing from
    ``glove.dictionary`` are ignored; if no token is found, an all-zero
    vector of length ``glove.no_components`` is returned.
    """
    word_ids = glove.dictionary
    total = np.zeros(glove.no_components)
    hits = 0
    for token in sent:
        if token not in word_ids:
            continue
        total += glove.word_vectors[word_ids[token]]
        hits += 1
    # Avoid dividing by zero when no token was in the dictionary.
    return total / hits if hits else total

In [None]:
# Reload the saved GloVe model into `glove`, the name that gl_sent_vec and the
# cells below actually read. The original bound the reloaded model to `model`
# only, so the reload had no effect on any GloVe lookup.
glove = Glove.load('glove.model')
# Keep the `model` binding the original cell created.
model = glove

In [None]:
# Sanity check: length of the GloVe vector for one word.
# NOTE(review): presumably raises KeyError if 'document' never occurs in the
# training corpus -- pick a word known to be present.

len(glove.word_vectors[glove.dictionary['document']])

In [None]:
# One averaged GloVe vector per row (see gl_sent_vec).
train['glovevec'] = train['tokens'].apply(gl_sent_vec)
test['glovevec'] = test['tokens'].apply(gl_sent_vec)

7. BERT

In [None]:
# Hugging Face libraries used by the BERT cells below.
!pip install transformers[sentencepiece] datasets -qqq

In [None]:
import os

from google.colab import userdata

# Read the Hugging Face token from Colab's secret store and expose it through
# the HF_TOKEN environment variable, which huggingface_hub reads. Assigning
# it, rather than leaving the call as the cell's last expression, also keeps
# the secret from being echoed into the notebook output.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [None]:
from transformers import AutoTokenizer, AutoModel

# Hugging Face Hub id of the pretrained checkpoint used for the embeddings.
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'

# Load the matching tokenizer and model weights.
# NOTE: this rebinds `model`, replacing the model loaded in earlier cells.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [None]:
# Tokenise every raw training comment in one call: pad to a common length,
# truncate over-long inputs, and return PyTorch tensors.
X_train_tokens = tokenizer(list(train['comment_text']),padding=True, truncation=True, return_tensors="pt")
#X_test_tokens = tokenizer(list(test['comment_text']),padding=True, truncation=True, return_tensors="pt")

In [None]:
import torch

# Inference only: disable autograd so the forward pass does not keep
# activations around for a backward pass that never happens.
# NOTE(review): the whole training set still goes through the model as a
# single batch; process it in mini-batches if memory becomes a problem.
with torch.no_grad():
    X_train_embeddings = model(**X_train_tokens)

In [None]:
# model(...) returns a transformers model-output object, which has no `.shape`
# attribute; the per-token embeddings are in its `last_hidden_state` tensor.
X_train_embeddings.last_hidden_state.shape

8. SBERT

In [None]:
# Library providing the SentenceTransformer class used below.
!pip install sentence-transformers -qqq

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model.
# NOTE: this rebinds `model`, replacing the AutoModel loaded earlier.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# NOTE(review): swifter is already installed in the first install cell and is
# not imported anywhere in the visible notebook.
!pip install swifter -qqq

In [None]:
# Clean each raw comment with spacy_tokenizer before encoding.
# NOTE(review): presumably relies on the string-returning definition of
# spacy_tokenizer being the one currently live -- confirm.
train['tokenize'] = train['comment_text'].apply(spacy_tokenizer)

In [None]:
# Encode each cleaned comment with the sentence-embedding model, one row per
# model.encode call.
train['embeddings'] = train['tokenize'].apply(model.encode)

In [None]:
# Length of the first row's embedding.
len(train['embeddings'].values[0])