Word Embedding techniques 

In [1]:
import pandas as pd
import os
import re 
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
import torch
import torchtext
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors
from transformers import BertTokenizer,BertModel
from tqdm import tqdm
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sentence corpus; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("nlp_sentences_50.csv")

In [3]:
# Preview the raw sentences stored in the 'text' column.
df['text'].head()

0        Natural Language Processing is evolving fast.
1       NLP helps computers understand human language.
2            Text classification is a key task in NLP.
3    Tokenization is the first step in most NLP pip...
4           Stemming reduces words to their root form.
Name: text, dtype: object

In [4]:
# Load spaCy's small English pipeline; the tokenisation helpers call it as `nlp`.
nlp = spacy.load("en_core_web_sm")

# Guard: the tokenisation cells read df['text'], so stop early if it is absent.
if 'text' not in df.columns:
    raise ValueError("The DataFrame does not contain a 'text' column.")

def clean_and_tokenize(text):
    """Lower-case ``text``, blank out punctuation and return its content tokens.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space before the string is run through the spaCy pipeline
    ``nlp``. Whitespace tokens and tokens listed in ``STOP_WORDS`` are dropped.

    Args:
        text: Value to tokenise; it is coerced with ``str()`` first.

    Returns:
        list[str]: Surviving token texts, in document order.
    """
    lowered = str(text).lower()
    normalised = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)

    kept = []
    for tok in nlp(normalised):
        # Skip pure-whitespace tokens and stop words.
        if tok.is_space or tok.text in STOP_WORDS:
            continue
        kept.append(tok.text)
    return kept

def clean_and_tokenize_pos(text):
    """Return ``(token, part-of-speech)`` pairs for the content words of ``text``.

    The input is lower-cased and every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, matching the normalisation
    used by ``clean_and_tokenize``. The cleaned string is then run through
    the spaCy pipeline ``nlp``; whitespace tokens and tokens listed in
    ``STOP_WORDS`` are dropped.

    Args:
        text: Value to tokenise; it is coerced with ``str()`` first.

    Returns:
        list[tuple[str, str]]: ``(token.text, token.pos_)`` pairs in
        document order.
    """
    # The brackets matter: `[^...]` is a negated character class. Without
    # them the pattern is an anchor followed by the literal text "a-zA-Z",
    # which never matches real sentences and leaves punctuation in place.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', str(text).lower())
    doc = nlp(cleaned)
    return [
        (token.text, token.pos_)
        for token in doc
        if not token.is_space and token.text not in STOP_WORDS
    ]
# Show the frame as it stands before the token columns are added.
df.head()

Unnamed: 0,text
0,Natural Language Processing is evolving fast.
1,NLP helps computers understand human language.
2,Text classification is a key task in NLP.
3,Tokenization is the first step in most NLP pip...
4,Stemming reduces words to their root form.


In [5]:
# Build two derived columns from the raw sentences. Each helper runs the
# spaCy pipeline itself, so every sentence is parsed once per column.
df['tokens'] = df['text'].apply(clean_and_tokenize)
df['POS_Tag'] = df['text'].apply(clean_and_tokenize_pos)
df.head()

Unnamed: 0,text,tokens,POS_Tag
0,Natural Language Processing is evolving fast.,"[natural, language, processing, evolving, fast]","[(natural, ADJ), (language, NOUN), (processing..."
1,NLP helps computers understand human language.,"[nlp, helps, computers, understand, human, lan...","[(nlp, PROPN), (helps, VERB), (computers, NOUN..."
2,Text classification is a key task in NLP.,"[text, classification, key, task, nlp]","[(text, NOUN), (classification, NOUN), (key, A..."
3,Tokenization is the first step in most NLP pip...,"[tokenization, step, nlp, pipelines]","[(tokenization, NOUN), (step, NOUN), (nlp, PRO..."
4,Stemming reduces words to their root form.,"[stemming, reduces, words, root, form]","[(stemming, NOUN), (reduces, VERB), (words, NO..."


In [6]:
## ONE HOT ENCODING

# MultiLabelBinarizer produces one row per sentence and one column per
# distinct token; a 1 marks that the token occurs in that sentence, so a row
# can contain several 1s (one per token present).

mlb = MultiLabelBinarizer()
one_hot = mlb.fit_transform(df['tokens'])

# Label the indicator columns with the token each one stands for.
one_hot_df = pd.DataFrame(one_hot,columns = mlb.classes_)
one_hot_df.head()

Unnamed: 0,accuracy,activation,affects,ai,algorithm,algorithms,analysis,artificial,autoencoders,automate,...,understand,understanding,unsupervised,uses,validation,values,visualization,word,words,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
## BAG OF WORDS

# CountVectorizer consumes strings, so glue each token list back together
# with single spaces.
text = list(map(' '.join, df['tokens']))

vec = CountVectorizer()
bow = vec.fit_transform(text)

# Densify the sparse count matrix and label columns with the learned vocabulary.
bow_df = pd.DataFrame(data=bow.toarray(), columns=vec.get_feature_names_out())
bow_df.head()

Unnamed: 0,accuracy,activation,affects,ai,algorithm,algorithms,analysis,artificial,autoencoders,automate,...,understand,understanding,unsupervised,uses,validation,values,visualization,word,words,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
## TF-IDF

# Fit on the same space-joined token strings (`text`) used for the
# bag-of-words matrix; the values are TF-IDF weights instead of raw counts.
tfidf = TfidfVectorizer()
tfidf_mat = tfidf.fit_transform(text)
# Densify and label the columns with the learned vocabulary.
tfidf_df = pd.DataFrame(tfidf_mat.toarray(),columns = tfidf.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,accuracy,activation,affects,ai,algorithm,algorithms,analysis,artificial,autoencoders,automate,...,understand,understanding,unsupervised,uses,validation,values,visualization,word,words,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.438783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0


In [9]:

# Load the pretrained 100-dimensional GloVe vectors from a local directory.
# The directory can be overridden with the GLOVE_CACHE_DIR environment
# variable so the notebook is not tied to one machine's absolute path; the
# original location remains the default when the variable is unset.
glove = Vectors(
    name='glove.6B.100d.txt',
    cache=os.environ.get('GLOVE_CACHE_DIR', 'C:/Users/Kousimon/Downloads/glove.6B'),
)

In [10]:
# Spot-check the loaded embeddings by printing the vector for one word.
print(glove['processing'])

tensor([-0.0817,  0.7159, -0.2068,  0.0296,  0.2303, -1.1452, -0.2969,  0.7235,
         0.4237,  0.4391, -0.1736, -0.3742, -0.1118, -0.0814,  0.1779,  0.1807,
         0.6416,  0.0766,  0.7338,  0.0568, -0.5870,  0.1979,  0.5395, -0.0841,
        -0.3598,  0.0995,  0.4409,  0.4065, -0.2734,  0.4093, -0.5441,  0.3249,
        -0.3346, -0.4434, -0.0139, -0.4158, -0.0713, -0.1875, -0.0526, -1.3428,
         0.1466, -1.4384, -0.3564,  0.0761, -0.0038,  0.2221,  0.0268, -0.4826,
         0.1124, -0.2780,  0.3645, -0.1440, -0.1794,  0.9890, -0.1420,  0.0271,
        -0.6365, -0.6784,  2.2571, -0.0224,  0.1442,  0.1008,  0.5046, -0.2944,
         0.2433, -0.0502,  0.3158, -0.4580,  0.6398, -0.0378, -0.7193,  0.6936,
         0.4196, -0.1840,  0.6345,  0.4304, -0.3149,  0.1866, -0.4996,  0.6993,
         0.8549,  0.3138, -1.0777,  0.5603, -1.8867,  0.7728,  1.3367, -0.7468,
        -0.0777, -0.2813, -0.0960,  0.0701, -0.1414, -0.0486,  0.5042, -0.2826,
         0.2881, -0.6854,  1.3961, -0.06

In [11]:
# Look up the GloVe vector of every token. Tokens that are not in the GloVe
# vocabulary (glove.stoi) are skipped, so a row may hold fewer vectors than
# it has tokens.
df['glove_vectors'] = df['tokens'].apply(
    lambda tokens: [glove[tok] for tok in tokens if tok in glove.stoi]
)

In [18]:
# Confirm the new 'glove_vectors' column.
df.head()

Unnamed: 0,text,tokens,POS_Tag,glove_vectors
0,Natural Language Processing is evolving fast.,"[natural, language, processing, evolving, fast]","[(natural, ADJ), (language, NOUN), (processing...","[[tensor(0.4399), tensor(1.1951), tensor(0.702..."
1,NLP helps computers understand human language.,"[nlp, helps, computers, understand, human, lan...","[(nlp, PROPN), (helps, VERB), (computers, NOUN...","[[tensor(-0.4242), tensor(1.1379), tensor(-0.5..."
2,Text classification is a key task in NLP.,"[text, classification, key, task, nlp]","[(text, NOUN), (classification, NOUN), (key, A...","[[tensor(-0.4970), tensor(0.7164), tensor(0.40..."
3,Tokenization is the first step in most NLP pip...,"[tokenization, step, nlp, pipelines]","[(tokenization, NOUN), (step, NOUN), (nlp, PRO...","[[tensor(-0.3844), tensor(-0.1395), tensor(-0...."
4,Stemming reduces words to their root form.,"[stemming, reduces, words, root, form]","[(stemming, NOUN), (reduces, VERB), (words, NO...","[[tensor(0.7884), tensor(-0.2981), tensor(0.21..."


In [47]:
# Example dummy binary label: Even index -> class 0, Odd index -> class 1
# These are placeholder targets derived purely from the row index, not from
# the sentence content.
df['label'] = df.index % 2

In [48]:
# Inspect the frame with the new 'label' column.
# NOTE(review): the saved output of this cell also shows a 'glove_avg' column
# that no visible cell creates - confirm the cell computing it still exists,
# otherwise a fresh Restart & Run All will not reproduce this output.
df.head()

Unnamed: 0,text,tokens,POS_Tag,glove_vectors,glove_avg,label
0,Natural Language Processing is evolving fast.,"[natural, language, processing, evolving, fast]","[(natural, ADJ), (language, NOUN), (processing...","[[tensor(0.4399), tensor(1.1951), tensor(0.702...","[-0.003268391, 0.592868, 0.33463138, 0.2203791...",0
1,NLP helps computers understand human language.,"[nlp, helps, computers, understand, human, lan...","[(nlp, PROPN), (helps, VERB), (computers, NOUN...","[[tensor(-0.4242), tensor(1.1379), tensor(-0.5...","[-0.031687822, 0.58213, 0.24518669, 0.05667926...",1
2,Text classification is a key task in NLP.,"[text, classification, key, task, nlp]","[(text, NOUN), (classification, NOUN), (key, A...","[[tensor(-0.4970), tensor(0.7164), tensor(0.40...","[-0.43756, 0.5182328, 0.295148, -0.0667942, 0....",0
3,Tokenization is the first step in most NLP pip...,"[tokenization, step, nlp, pipelines]","[(tokenization, NOUN), (step, NOUN), (nlp, PRO...","[[tensor(-0.3844), tensor(-0.1395), tensor(-0....","[-0.04940425, 0.436995, -0.203637, -0.09001399...",1
4,Stemming reduces words to their root form.,"[stemming, reduces, words, root, form]","[(stemming, NOUN), (reduces, VERB), (words, NO...","[[tensor(0.7884), tensor(-0.2981), tensor(0.21...","[-0.17456861, 0.48956004, 0.07702799, 0.225233...",0


In [60]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder by checkpoint name.
# NOTE(review): `tokenzier` is a misspelling of "tokenizer"; the same name is
# used by the inference cell, so any rename has to change both places together.
tokenzier = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [14]:
# Encode the first sentence. `text` holds the stop-word-filtered,
# space-joined token strings built for the vectorizers, not the raw sentences.
sentence2 = text[0]
inputs = tokenzier(sentence2, return_tensors='pt')

# Inference only: disable autograd so no computation graph is built or kept
# alive by `output`.
with torch.no_grad():
    output = model(**inputs)

In [40]:
# Per-token hidden states returned by the model for the encoded sentence;
# the recorded shape for this single input is (1, 7, 768).
embeddings = output.last_hidden_state
embeddings.shape

torch.Size([1, 7, 768])

In [16]:
## BART: Bidirectional and Auto-Regressive Transformers