## Learning Text Representations

In [1]:
import numpy as np

In [2]:
def softmax(u):
    exp_u = np.exp(u - np.max(u))  # For numerical stability
    return exp_u / np.sum(exp_u)


In [3]:
def Single_context_CBOW(x, label, W1, W2, lr, loss):

    # Forward propagation
    h = np.dot(W1.T, x)
    u = np.dot(W2.T, h)
    y_pred = softmax(u)

    # error
    e = -label + y_pred

    # Backward propagation
    dW2 = np.outer(h, e)
    dW1 = np.outer(x, np.dot(W2, e))

    # Update weights
    W1 = W1 - lr * dW1
    W2 = W2 - lr * dW2

    # Loss function
    loss += -float(u[label == 1]) + np.log(np.sum(np.exp(u)))

    return W1, W2, loss

In [4]:
# Example
# Vocabulary size and embedding dimensions
vocab_size = 6
embedding_dim = 4

# Initialize random one-hot encoded input and label
x = np.zeros((vocab_size, 1))
x[2] = 1  # Example input word index

label = np.zeros((vocab_size, 1))
label[3] = 1  # Example target word index

# Initialize weight matrices and learning rate
W1 = np.random.rand(vocab_size, embedding_dim)
W2 = np.random.rand(embedding_dim, vocab_size)
lr = 0.01
loss = 0

# Training
W1, W2, loss = Single_context_CBOW(x, label, W1, W2, lr, loss)

print("Updated W1:\n", W1)
print("Updated W2:\n", W2)
print("Loss:", loss)

Updated W1:
 [[0.84088415 0.45568162 0.32715687 0.97967392]
 [0.56972814 0.03161001 0.65794655 0.35959826]
 [0.14239547 0.95862406 0.61828065 0.45492024]
 [0.9858003  0.44781334 0.02205796 0.4643884 ]
 [0.1564013  0.33227212 0.81369239 0.43698621]
 [0.28473761 0.87718824 0.32072009 0.32393398]]
Updated W2:
 [[0.44801547 0.25285078 0.24883883 0.1433884  0.25427217 0.12103724]
 [0.63404782 0.42656877 0.27490643 0.87256287 0.21310852 0.46287194]
 [0.12216394 0.21627667 0.32798213 0.08692437 0.76880657 0.50842412]
 [0.69652933 0.29758073 0.62016249 0.05484967 0.73324612 0.43147069]]
Loss: 1.801166559747447


  loss += -float(u[label == 1]) + np.log(np.sum(np.exp(u)))


**Building the word2vec model using gensim**

In [5]:
import warnings
import nltk

warnings.filterwarnings('ignore')

# Data processing
import pandas as pd
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = stopwords.words('english')

# Modeling
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

[nltk_data] Downloading package stopwords to C:\Users\L E G I O
[nltk_data]     N\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Loading the dataset**

In [6]:
data = pd.read_csv('text.csv',header=None)

In [7]:
data.head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


**Preprocessing and preparing the dataset**

In [8]:
def pre_process(text):

    #convert to lowercase
    text = str(text).lower()

    #remove all special characters and keep only alpha numeric characters and spaces
    text = re.sub(r'[^A-Za-z0-9\s.]',r'',text)

    #remove new lines
    text = re.sub(r'\n',r' ',text)

    # remove stop words
    text = " ".join([word for word in text.split() if word not in stopWords])

    return text

In [9]:
pre_process(data[0][50])

'agree fancy. everything needed. breakfast pool hot tub nice shuttle airport later checkout time. noise issue tough sleep through. awhile forget noisy door nearby noisy guests. complained management later email credit compd us amount requested would return.'

In [10]:
data[0] = data[0].map(lambda x: pre_process(x))

In [11]:
data[0][1].split(".")[:5]

['stayed crown plaza april april ',
 ' staff friendly attentive',
 ' elevators tiny ',
 ' food restaurant delicious priced little high side',
 ' course washington dc']

In [12]:
corpus = []
for line in data[0][1].split('.'):
    words = [x for x in line.split()]
    corpus.append(words)

In [13]:
corpus[:2]

[['stayed', 'crown', 'plaza', 'april', 'april'],
 ['staff', 'friendly', 'attentive']]

In [14]:
data = data[0].map(lambda x: x.split('.'))

corpus = []
for i in (range(len(data))):
    for line in data[i]:
        words = [x for x in line.split()]
        corpus.append(words)

corpus[:2]

[['room', 'kind', 'clean', 'strong', 'smell', 'dogs'],
 ['generally', 'average', 'ok', 'overnight', 'stay', 'youre', 'fussy']]

In [15]:
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)

In [16]:
corpus[100]

['wonderful',
 'staff',
 'great',
 'location',
 'definately',
 'price',
 'high',
 'standard',
 'hotel']

In [17]:
corpus[9]

['course', 'washington', 'dc']

## Building the model

In [18]:
size = 100
window_size = 2
epochs = 100
min_count = 2
workers = 4
sg = 1

In [19]:
model = Word2Vec(corpus, sg=1, window=window_size, vector_size=size, min_count=min_count, workers=workers, epochs=epochs)

In [20]:
model.save('word2vec.model')

In [22]:
model = Word2Vec.load('word2vec.model')

## Evaluating the embeddings

In [23]:
print('san_diego' in model.wv.key_to_index)
print('san diego' in model.wv.key_to_index)
print('San_Diego' in model.wv.key_to_index)

False
False
False


In [27]:
model.wv.most_similar('sandiego')

[('diego', 0.6002644896507263),
 ('san', 0.5842916369438171),
 ('coastline', 0.5682160258293152),
 ('carless', 0.5434173941612244),
 ('field', 0.5417526364326477),
 ('locationeasy', 0.5396423935890198),
 ('sausilito', 0.5394237041473389),
 ('mustsee', 0.5366553068161011),
 ('dallas', 0.5347569584846497),
 ('haightashbury', 0.5316208600997925)]

In [28]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.6996593475341797)]

In [29]:
text = ['los_angeles','indianapolis', 'holiday', 'san_antonio','new_york']

model.wv.doesnt_match(text)

'holiday'

## Visualizing word embeddings in TensorBoard

In [30]:
import warnings
warnings.filterwarnings(action='ignore')


import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.compat.v1.disable_eager_execution()

tf.compat.v1.reset_default_graph()
from tensorboard.plugins import projector
import numpy as np
import gensim
import os




In [31]:
max_size = len(model.wv.index_to_key) - 1

In [32]:
w2v = np.zeros((max_size, model.vector_size))

In [34]:
if not os.path.exists('projections'):
    os.makedirs('projections')

with open("projections/metadata.tsv", 'w+') as file_metadata:

    for i, word in enumerate(model.wv.index_to_key[:max_size]):

        # store the embeddings of the word
        w2v[i] = model.wv[word]

        # write the word to a file
        file_metadata.write(word + '\n')

In [35]:
sess = tf.compat.v1.InteractiveSession()

In [36]:
with tf.device("/cpu:0"):
    embedding = tf.Variable(w2v, trainable=False, name='embedding')

In [37]:
tf.compat.v1.global_variables_initializer().run()

In [38]:
saver = tf.compat.v1.train.Saver()

In [40]:
writer = tf.compat.v1.summary.FileWriter('projections', sess.graph)

In [41]:
config = projector.ProjectorConfig()
embed= config.embeddings.add()

In [42]:
embed.tensor_name = 'embedding'
embed.metadata_path = 'metadata.tsv'

In [43]:
projector.visualize_embeddings(writer, config)

saver.save(sess, 'projections/model.ckpt', global_step=max_size)

'projections/model.ckpt-27330'

## Finding similar documents using doc2vec

In [44]:
import warnings
warnings.filterwarnings('ignore')

import os
import gensim
from gensim.models.doc2vec import TaggedDocument

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('english'))

In [45]:
docLabels = []
docLabels = [f for f in os.listdir('news_dataset') if  f.endswith('.txt')]

data = []
for doc in docLabels:
    data.append(open('news_dataset/'+doc, encoding='utf-8', errors='ignore').read())

In [46]:
docLabels[:5]

['Electronics_0.txt',
 'Electronics_1.txt',
 'Electronics_10.txt',
 'Electronics_100.txt',
 'Electronics_101.txt']

In [47]:
class DocIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])

In [48]:
it = DocIterator(data, docLabels)

In [49]:
size = 100
alpha = 0.025
min_alpha = 0.025
dm = 1
min_count = 1

In [50]:
model = gensim.models.Doc2Vec(vector_size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, dm=dm)
model.build_vocab(it)

In [51]:
for epoch in range(100):
    model.train(it, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

In [52]:
model.save('doc2vec.model')

In [53]:
d2v_model = gensim.models.doc2vec.Doc2Vec.load('doc2vec.model')

In [54]:
model.docvecs.most_similar('Electronics_666.txt')

[('Sports_381.txt', 0.8735464811325073),
 ('Science_344.txt', 0.8635656833648682),
 ('Sports_635.txt', 0.8628700375556946),
 ('Science_722.txt', 0.8572390079498291),
 ('Science_366.txt', 0.8502411842346191),
 ('Politics_872.txt', 0.8497423529624939),
 ('Politics_578.txt', 0.8470869064331055),
 ('Science_725.txt', 0.8466042280197144),
 ('Politics_476.txt', 0.8451502919197083),
 ('Electronics_463.txt', 0.8404396176338196)]