## Learning Text Representations


In [None]:
import numpy as np

## Softmax


In [None]:
def softmax(u):
    """Return the softmax of ``u``, normalised over all of its elements.

    The largest entry is subtracted before exponentiating so the exponentials
    stay bounded; the shift cancels in the final ratio.
    """
    shifted = u - np.max(u)
    weights = np.exp(shifted)
    return weights / weights.sum()

## Single Context CBOW


In [None]:
def Single_context_CBOW(x, label, W1, W2, lr, loss):
    """Run one gradient-descent step of single-context CBOW.

    Parameters
    ----------
    x : array
        One-hot input word. The notebook passes a ``(vocab_size, 1)`` column.
    label : array
        One-hot target word with the same shape as ``x``; exactly one entry
        must equal 1.
    W1 : array
        Input-to-hidden weights (``(vocab_size, embedding_dim)`` in the
        notebook).
    W2 : array
        Hidden-to-output weights (``(embedding_dim, vocab_size)`` in the
        notebook).
    lr : float
        Learning rate.
    loss : float
        Running loss; this step's cross-entropy is added to it.

    Returns
    -------
    tuple
        ``(W1, W2, loss)`` with the updated weight matrices (new arrays, the
        inputs are not modified in place) and the accumulated loss.
    """
    # Forward propagation
    h = np.dot(W1.T, x)
    u = np.dot(W2.T, h)
    y_pred = softmax(u)

    # Prediction error (gradient of the loss with respect to the scores u)
    e = y_pred - label

    # Backward propagation; both gradients use the weights from before the update
    dW2 = np.outer(h, e)
    dW1 = np.outer(x, np.dot(W2, e))

    # Update weights
    W1 = W1 - lr * dW1
    W2 = W2 - lr * dW2

    # Cross-entropy loss: -u[target] + log(sum(exp(u))).
    # The log-sum-exp is shifted by max(u) so large scores cannot overflow, and
    # .item() extracts the single target score explicitly (float() on a
    # one-element array is deprecated in recent NumPy releases).
    u_max = np.max(u)
    log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
    loss += log_norm - u[label == 1].item()

    return W1, W2, loss

## Vocab Size and embedding dimensions


In [None]:
# Vocabulary size and embedding dimensions
vocab_size = 6
embedding_dim = 4

# Seeded generator so the printed weights and loss are the same on every
# Restart & Run All.
rng = np.random.default_rng(42)

# Fixed one-hot column vectors for the example input and target words
x = np.zeros((vocab_size, 1))
x[2] = 1  # Example input word index

label = np.zeros((vocab_size, 1))
label[3] = 1  # Example target word index

# Initialize weight matrices (uniform in [0, 1)) and learning rate
W1 = rng.random((vocab_size, embedding_dim))
W2 = rng.random((embedding_dim, vocab_size))
lr = 0.01
loss = 0

# Training: a single update step
W1, W2, loss = Single_context_CBOW(x, label, W1, W2, lr, loss)

print("Updated W1:\n", W1)
print("Updated W2:\n", W2)
print("Loss:", loss)

Updated W1:
 [[0.33949013 0.23943481 0.7622218  0.17442242]
 [0.63267885 0.77316103 0.70096786 0.8586162 ]
 [0.62332115 0.68794873 0.94913127 0.93295125]
 [0.99465627 0.99016618 0.00751418 0.08912635]
 [0.0136116  0.24361729 0.2937138  0.77876469]
 [0.36391275 0.8738911  0.68047388 0.81120983]]
Updated W2:
 [[0.83039455 0.04042438 0.11784894 0.18535095 0.43211524 0.51162481]
 [0.06214226 0.58996491 0.58299468 0.55541173 0.56423794 0.95482159]
 [0.90086651 0.29231934 0.72136398 0.02665192 0.51166888 0.42455776]
 [0.02313775 0.45507954 0.6852997  0.70280401 0.95928093 0.84953319]]
Loss: 2.3487080242341025


  loss += -float(u[label == 1]) + np.log(np.sum(np.exp(u)))


## Building the word2vec model using gensim


In [None]:
import warnings
# NOTE(review): this silences every warning category from here on, including
# deprecation notices raised by the libraries used below; consider a narrower filter.
warnings.filterwarnings('ignore')

# Data processing
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus if it is not already available locally.
nltk.download('stopwords')
# Keep the stop words in a set: membership is tested once per token during
# preprocessing, and a set lookup avoids a linear scan of the list.
stopWords = set(stopwords.words('english'))

# Modeling
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading the dataset


In [None]:
# Load the review texts. header=None tells pandas there is no header row, so the
# columns get integer labels and the text is addressed as column 0 below.
data = pd.read_csv('/content/text.csv',header=None)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


## Preprocessing and preparing the dataset


In [None]:
def pre_process(text, stop_words=None):
    """Lower-case ``text``, strip punctuation except periods, and drop stop words.

    Periods are kept because later cells split each review into sentences on
    ``'.'``.

    Parameters
    ----------
    text : object
        Raw text; it is converted with ``str()`` first, so non-string values
        are accepted.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stopWords``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopWords
    if not isinstance(stop_words, (set, frozenset)):
        # One conversion per call instead of a linear list scan per token.
        stop_words = frozenset(stop_words)

    text = str(text).lower()
    text = re.sub(r'[^A-Za-z0-9\s.]', r'', text)

    kept = []
    # split() already breaks on every kind of whitespace, newlines included,
    # so no separate newline replacement is needed.
    for token in text.split():
        core = token.strip('.')
        if core and core in stop_words:
            # A stop word glued to a period ("through.") used to slip past the
            # filter. Drop the word but keep one period so the sentence
            # boundary survives the later split on '.'.
            if core != token:
                kept.append('.')
            continue
        kept.append(token)

    # NOTE(review): apostrophes are removed before filtering, so contractions
    # such as "you're" become "youre" and will not match a stop-word list that
    # spells them with an apostrophe.
    return " ".join(kept)

In [None]:
# Spot-check the cleaning on a single review (column 0, row label 50).
pre_process(data[0][50])

'agree fancy. everything needed. breakfast pool hot tub nice shuttle airport later checkout time. noise issue tough sleep through. awhile forget noisy door nearby noisy guests. complained management later email credit compd us amount requested would return.'

In [None]:
# Clean every review in column 0 and write the result back into the same column.
data[0] = data[0].map(pre_process)

In [None]:
# First five '.'-separated fragments of the review at row label 1.
data[0][1].split(".")[:5]

['stayed crown plaza april april ',
 ' staff friendly attentive',
 ' elevators tiny ',
 ' food restaurant delicious priced little high side',
 ' course washington dc']

In [None]:
# Tokenise the review at row label 1: one list of words per '.'-separated sentence.
corpus = [sentence.split() for sentence in data[0][1].split('.')]

In [None]:
# Show the first two tokenised sentences.
corpus[:2]

[['stayed', 'crown', 'plaza', 'april', 'april'],
 ['staff', 'friendly', 'attentive']]

In [None]:
# Split every cleaned review into its '.'-separated sentence strings. The result
# gets its own name instead of rebinding `data`: overwriting the DataFrame with
# a Series made this cell fail when run a second time.
review_sentences = data[0].map(lambda review: review.split('.'))

# Flatten into one list of word tokens per sentence, in review order. Empty
# fragments (e.g. after a trailing period) are kept as empty lists so corpus
# positions stay the same as before.
corpus = [
    sentence.split()
    for sentences in review_sentences
    for sentence in sentences
]

corpus[:2]

[['room', 'kind', 'clean', 'strong', 'smell', 'dogs'],
 ['generally', 'average', 'ok', 'overnight', 'stay', 'youre', 'fussy']]

In [None]:
# Learn frequent two-word collocations from the tokenised sentences.
phrases = Phrases(sentences=corpus, min_count=25, threshold=50)
bigram = Phraser(phrases)

# Apply the phrase model so detected collocations become single underscore-joined
# tokens. Previously `bigram` was built but never used, so the corpus (and the
# vocabulary trained from it) contained no bigram tokens at all.
# Rebinding `corpus` means this cell should be re-run together with the
# corpus-building cell above, not on its own.
corpus = [bigram[sentence] for sentence in corpus]

In [None]:
# Inspect the sentence stored at position 111 of the corpus.
corpus[111]

['connected', 'rivercenter', 'mall', 'downtown', 'san', 'antonio']

In [None]:
# Inspect the sentence stored at position 9 of the corpus.
corpus[9]

['course', 'washington', 'dc']

## Building the model


In [None]:
# Word2Vec hyperparameters. `size` and `min_count` are assigned again later in
# the doc2vec section.
size = 100  # embedding dimensionality (passed as vector_size)
window_size = 2  # context window (passed as window)
epochs = 100  # training passes over the corpus
min_count = 2  # passed as min_count: gensim drops words rarer than this
workers = 4  # worker threads
sg = 1  # gensim training algorithm flag: 1 = skip-gram, 0 = CBOW

In [None]:
# Train the model. `sg` now comes from the hyperparameter cell instead of a
# hard-coded literal, so changing it there actually takes effect.
model = Word2Vec(
    corpus,
    sg=sg,
    window=window_size,
    vector_size=size,
    min_count=min_count,
    workers=workers,
    epochs=epochs,
)

In [None]:
# Persist the trained model to disk.
model.save('/content/word2vec.model')

In [None]:
# Reload the saved model; `model` now refers to the copy read back from disk.
model = Word2Vec.load('/content/word2vec.model')

## Evaluating the embeddings


In [None]:
# Check which spellings of the phrase exist as vocabulary keys.
for candidate in ('san_diego', 'san diego', 'San_Diego'):
    print(candidate in model.wv.key_to_index)

False
False
False


In [None]:
# Nearest neighbours of 'san' in the embedding space, with their similarity scores.
model.wv.most_similar('san')

[('diego', 0.9160587787628174),
 ('francisco', 0.9062904715538025),
 ('antonio', 0.8464164137840271),
 ('fran', 0.8159115314483643),
 ('sf', 0.7527428865432739),
 ('dallas', 0.6760269403457642),
 ('la', 0.6679771542549133),
 ('austin', 0.6636949777603149),
 ('citysightseeing', 0.6597621440887451),
 ('seattle', 0.6534638404846191)]

In [None]:
# Analogy query: woman + king - man; topn=1 returns only the best match.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.6999184489250183)]

In [None]:
# Ask which word fits least with the rest of the list.
# NOTE(review): several entries are underscore-joined bigram tokens; they only
# have vectors if the phrase model was applied to the training corpus.
# TODO confirm they are present in model.wv.key_to_index.
text = ['los_angeles','indianapolis', 'holiday', 'san_antonio','new_york']
model.wv.doesnt_match(text)



'holiday'

## Visualizing word embeddings in TensorBoard


In [None]:
import warnings
# Repeats the blanket warning filter already installed in the word2vec section.
warnings.filterwarnings(action='ignore')


import tensorflow as tf
# Only surface TensorFlow log messages at ERROR level.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# The cells below use the TF1 graph/session API (InteractiveSession, Saver,
# FileWriter), so eager execution is switched off first.
tf.compat.v1.disable_eager_execution()

# Start from an empty default graph so re-running the section does not pile up
# variables from a previous run.
tf.compat.v1.reset_default_graph()
from tensorboard.plugins import projector
import numpy as np
import gensim
import os

In [None]:
# Number of word vectors to export: the whole vocabulary. The previous
# `len(...) - 1` silently left the last vocabulary word out of the projection.
max_size = len(model.wv.index_to_key)

In [None]:
# Pre-allocate the export matrix: one row per exported word, one column per
# embedding dimension. It is filled in when the metadata file is written.
w2v = np.zeros((max_size, model.vector_size))

In [None]:
# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step.
os.makedirs('projections', exist_ok=True)

# Write one word per line (no header row) and copy the matching vector into the
# same row of `w2v`, so line i of the metadata labels row i of the matrix.
# Plain write mode and an explicit encoding replace 'w+' and the platform default.
with open("projections/metadata.tsv", 'w', encoding='utf-8') as file_metadata:
    for i, word in enumerate(model.wv.index_to_key[:max_size]):
        w2v[i] = model.wv[word]
        file_metadata.write(word + '\n')

In [None]:
# An InteractiveSession installs itself as the default session, which the bare
# `.run()` call on the initializer below relies on.
sess = tf.compat.v1.InteractiveSession()

In [None]:
# Wrap the matrix in a non-trainable variable named 'embedding', placed on the CPU.
with tf.device("/cpu:0"):
    embedding = tf.Variable(w2v, trainable=False, name='embedding')

In [None]:
# Initialise all graph variables in the default session.
tf.compat.v1.global_variables_initializer().run()

In [None]:
# Saver used below to write the variable to a checkpoint.
saver = tf.compat.v1.train.Saver()

In [None]:
# Event-file writer targeting the 'projections' directory; it is also handed the
# session graph.
writer = tf.compat.v1.summary.FileWriter('projections', sess.graph)

In [None]:
# Projector configuration holding a single embedding entry.
config = projector.ProjectorConfig()
embed= config.embeddings.add()

In [None]:
# Must match the name given to the tf.Variable that holds the vectors.
embed.tensor_name = 'embedding'
# File name of the labels written to projections/metadata.tsv; presumably
# resolved relative to the log directory -- TODO confirm.
embed.metadata_path = 'metadata.tsv'

In [None]:
# Store the projector configuration alongside the writer's event files.
projector.visualize_embeddings(writer, config)

# Checkpoint the session's variables. global_step is appended to the file name,
# so the checkpoint is suffixed with the value of max_size.
saver.save(sess, 'projections/model.ckpt', global_step=max_size)

'projections/model.ckpt-27330'

## Finding similar documents using doc2vec


In [None]:
import warnings
# Repeats the blanket warning filter already installed in the earlier sections.
warnings.filterwarnings('ignore')

import os
import gensim
from gensim.models.doc2vec import TaggedDocument

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

# Tokenizer that keeps runs of word characters, and the English stop words as a set.
# NOTE(review): neither name is referenced by the doc2vec cells visible below.
tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('english'))

In [None]:
# Directory holding one .txt file per news document.
news_dir = '/content/news_dataset'

# File names double as document labels. os.listdir returns them in arbitrary order.
docLabels = [f for f in os.listdir(news_dir) if f.endswith('.txt')]

# Read every document, closing each file as soon as it has been read (the
# previous bare open(...).read() never closed its handles). Bytes that are not
# valid UTF-8 are dropped because of errors='ignore'.
data = []
for doc in docLabels:
    with open(os.path.join(news_dir, doc), encoding='utf-8', errors='ignore') as handle:
        data.append(handle.read())

In [None]:
# Peek at the first five document labels (file names).
docLabels[:5]

['Electronics_669.txt',
 'Electronics_476.txt',
 'Politics_781.txt',
 'Politics_401.txt',
 'Electronics_750.txt']

In [None]:
class DocIterator:
    """Re-iterable stream of ``TaggedDocument`` objects.

    Each document text is paired with the label at the same position in
    ``labels_list``. Every call to ``iter()`` starts a fresh pass over the
    documents.
    """

    def __init__(self, doc_list, labels_list):
        """Store the document texts and their parallel list of labels."""
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield one tagged document per text, tokenised on whitespace."""
        for position, text in enumerate(self.doc_list):
            tokens = text.split()
            tag = self.labels_list[position]
            yield TaggedDocument(words=tokens, tags=[tag])

In [None]:
# Iterable of tagged documents pairing each text with its file name.
it = DocIterator(data, docLabels)

In [None]:
# Doc2Vec hyperparameters (these rebind `size` and `min_count` from the word2vec section).
size = 100  # document-vector dimensionality (passed as vector_size)
alpha = 0.025  # initial learning rate
min_alpha = 0.025  # final learning rate; set equal to alpha here
dm = 1  # gensim training algorithm flag: 1 = distributed memory (PV-DM)
min_count = 1  # passed as min_count; with 1, every word is kept

In [None]:
# Create the untrained model, then scan the tagged documents once to build its vocabulary.
# This rebinds `model`, which previously held the Word2Vec model.
model = gensim.models.Doc2Vec(vector_size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, dm=dm)
model.build_vocab(it)

In [None]:
# Train in a single call and let gensim decay the learning rate linearly from
# `alpha` down to a small positive floor across all 100 epochs.
# The previous loop subtracted 0.002 from alpha after each of 100 one-epoch
# passes; starting at 0.025 the rate became negative after the 13th pass, so
# most of the training ran with a negative learning rate.
model.train(
    it,
    total_examples=model.corpus_count,
    epochs=100,
    start_alpha=alpha,
    end_alpha=0.0001,
)

In [None]:
# Persist the trained Doc2Vec model to disk.
model.save('/content/doc2vec.model')

In [None]:
# Read the saved model back from disk under a separate name.
d2v_model = gensim.models.doc2vec.Doc2Vec.load('/content/doc2vec.model')

In [None]:
# Query the model reloaded from disk (previously loaded but unused) through
# `dv`, the gensim 4 name for the document vectors; `docvecs` is deprecated.
# The tag must be one of the file names the model was trained on.
d2v_model.dv.most_similar('Electronics_666.txt')

[('Electronics_905.txt', 0.9707800149917603),
 ('Politics_167.txt', 0.9686378836631775),
 ('Politics_989.txt', 0.9581860899925232),
 ('Politics_37.txt', 0.9514307975769043),
 ('Science_31.txt', 0.9499551653862),
 ('Electronics_132.txt', 0.9492486715316772),
 ('Politics_285.txt', 0.9489672780036926),
 ('Electronics_541.txt', 0.9460381269454956),
 ('Electronics_180.txt', 0.9449853301048279),
 ('Politics_320.txt', 0.9384676218032837)]