<a href="https://colab.research.google.com/github/SomdeepAcharyya/NLP/blob/main/Word_Embeddings_Glove_and_Elmo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implementation of Word2Vec, GloVe and ELMo word-embedding models.

## Headers

In [None]:
!pip install nltk
import nltk
nltk.download('punkt')
!pip install gensim
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
  
# Suppress every Python warning from here on (this also hides deprecation notices from the libraries used below).
warnings.filterwarnings(action = 'ignore')

In [None]:
import gensim
from gensim.models import Word2Vec

In [None]:
# Tripadvisor dataset: read the reviews CSV into a DataFrame.
path = r'/content/drive/MyDrive/Per_CD_RS/tripadvisor_reviews.csv'

# Open the file ourselves so decoding is controlled here: UTF-8, with any
# undecodable bytes silently dropped before pandas parses the text.
with open(path, encoding="utf-8", errors='ignore') as csv_handle:
    tripadvisor = pd.read_csv(csv_handle)

In [None]:
# Show the column labels of the loaded reviews DataFrame.
tripadvisor.columns

Index(['id', 'username', 'type', 'date', 'title', 'text', 'rating',
       'helpfulness', 'total_points', 'taObject', 'taObjectUrl',
       'taObjectCity', 'open', 'cons', 'extra', 'agree', 'neuro',
       'processed_text', 'processed_title'],
      dtype='object')

In [None]:
import numpy as np
# Copy the review text column into a NumPy array; the bare name on the last
# line makes the array the cell's displayed output.
lines = np.array(tripadvisor.text)
lines

array(['Decent Hotel next to station so good location for getting round Stockholm - room a little on the small size - due for a refurbishment - but was still comfortable - room was quiet.\n\nBreakfast Buffet good of a morning though not quite as good as the Readisson Waterfront round the corner.\n\nStayed on the 6th floor in a corner room near the lifts and staircase - could smell the chlorine coming from the pool in the basement! in the corrindor though not in the room.\n\nWould recommend it',
       'Excellent Hotel - well situated for getting round stockholm - right by the station so easy access to the Arlanda Express for the airport.\n\nGreat Breakfast Buffet. Rooms overlooking waterfront have great views those at the back look directly into an office block! Free Internet Access!\n\nRooms very modern - beds very comfortable - highly recommended. Being staying here a couple of nights a week for the last 5 months or so.\n\nRestaurant of evening - limited menu but food is good.',
    

In [None]:
# Build the Word2Vec training corpus: `data` is a list with one entry per
# sentence, each entry being that sentence's lower-cased word tokens.
data = []

# dropna() skips missing reviews so sent_tokenize never receives a NaN.
for review_text in tripadvisor.text.dropna():
    # Split the review into sentences, then each sentence into word tokens.
    for sentence in sent_tokenize(review_text):
        # Append inside the sentence loop so every sentence is kept; appending
        # after the loop would keep only the last sentence of each review.
        data.append([token.lower() for token in word_tokenize(sentence)])

In [None]:
# Rebind `lines` to a plain Python list of the raw review texts.
lines = list(tripadvisor.text)
# `word_tokens` is another name for the same list object as `data` (no copy is made).
word_tokens = data

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# English stop-word list from NLTK. The list is kept under its original name;
# the set alongside it gives constant-time membership tests in the filter.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)

# Tokens with stop words removed, one inner list per entry of `word_tokens`.
lines_without_stopwords = [
    [word for word in line if word not in stop_word_set]
    for line in word_tokens
]

# Flat list of every remaining token across the whole corpus, in order.
stop_removed = [word for line in lines_without_stopwords for word in line]

# Preview only: printing the full corpus floods the notebook output.
print(stop_removed[:50])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# Unused placeholders from the original cell, retained (still empty) so any
# later cell that references these names keeps working.
lines_with_lemmas = []
string = ''

# Lemmatise every stop-word-filtered token. lemmatize() is called without a
# `pos` argument, so NLTK's default part of speech (noun) applies.
lem_line = [wordnet_lemmatizer.lemmatize(word) for word in stop_removed]

# All lemmas joined into one comma-separated string.
new_lines = ','.join(str(token) for token in lem_line)

# Preview only: printing the full list and string exceeded the notebook's
# IOPub data-rate limit.
print(lem_line[:50])
print(new_lines[:500])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Word2vec

In [None]:
# Word2Vec
# gensim 4.0 renamed the `size` constructor argument to `vector_size`. Pick the
# keyword the installed version understands so the cell runs on both 3.x and 4.x.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
w2v_params = {_dim_kwarg: 100, 'min_count': 1, 'window': 5}

model1 = gensim.models.Word2Vec(data, **w2v_params)          # CBOW
model2 = gensim.models.Word2Vec(data, sg=1, **w2v_params)    # Skip-gram

In [None]:
# Similarity between the two words under each model.
# Query through `.wv` (the model's KeyedVectors): Word2Vec.similarity was
# deprecated in gensim 3.x and removed in 4.0, while `.wv.similarity` works in both.
print(model1.wv.similarity('fascinating', 'disappointing'))
print(model2.wv.similarity('fascinating', 'disappointing'))

0.84259117
0.8542597


<gensim.models.word2vec.Word2VecVocab at 0x7fa2aefd2750>

# Glove

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
## download the word vectors

In [None]:
import gensim.downloader as api
# Load the 'glove-wiki-gigaword-300' vectors through gensim's downloader API.
# NOTE(review): presumably downloaded on first use and cached locally — confirm.
glove_model = api.load('glove-wiki-gigaword-300')



In [None]:
## vector representation of a word

In [None]:
# Index the model by word to display that word's embedding vector.
glove_model["beautiful"]

In [None]:
## word vectors understand the meanings of words

In [None]:
# Display the words the model ranks as most similar to "girl".
glove_model.most_similar("girl")

In [None]:
## queen - girl + boy = king

In [None]:
# Analogy query: 'boy' and 'queen' are the positive terms, 'girl' the negative; topn=1 keeps only the best match.
glove_model.most_similar(positive=['boy', 'queen'], negative=['girl'], topn=1)

[('king', 0.6770139336585999)]

In [None]:
vocab = ["boy", "girl", "man", "woman", "king", "queen", "banana", "apple", "mango", "fruit", "coconut", "orange"]


def tsne_plot(model, words=None):
    """Project word vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Parameters
    ----------
    model
        Object supporting ``model[word]`` lookup that returns the word's
        vector (for example the GloVe vectors loaded earlier).
    words : sequence of str, optional
        Words to plot. Defaults to the module-level ``vocab`` list, which
        preserves the original single-argument behaviour.

    Notes
    -----
    ``model[word]`` is called for every word, so whatever error the model
    raises for an unknown word propagates to the caller. t-SNE runs with a
    fixed perplexity of 3 and ``random_state=42``.
    """
    if words is None:
        words = vocab
    labels = list(words)

    # Stack the vectors into one 2-D array: one row per word.
    wordvecs = np.array([model[word] for word in labels])

    tsne_model = TSNE(perplexity=3, n_components=2, init='pca', random_state=42)
    coordinates = tsne_model.fit_transform(wordvecs)

    plt.figure(figsize=(8, 8))
    # One scatter call per word so each point gets its own colour from the cycle.
    for label, (x_coord, y_coord) in zip(labels, coordinates):
        plt.scatter(x_coord, y_coord)
        plt.annotate(label, xy=(x_coord, y_coord), xytext=(2, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.show()


tsne_plot(glove_model)

# Elmo

In [None]:
!pip install "tensorflow>=2.0.0"
!pip install --upgrade tensorflow-hub

In [None]:
# import necessary libraries
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
# The code below uses the TF1 graph/session API, so eager execution is turned off.
tf.disable_eager_execution()

# Load pre trained ELMo model
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

# Build the graph tensor of ELMo embeddings for the two example sentences.
# It is symbolic until evaluated with sess.run(); indexing it as
# embeddings[sentence][token] selects a single 1024-dimensional vector.
embeddings = elmo(
    [
        "I love to watch TV",
        "I am wearing a wrist watch"
    ],
    signature="default",
    as_dict=True)["elmo"]

# tf.initialize_all_variables() is deprecated; global_variables_initializer()
# is its documented replacement.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
# Compare the vectors ELMo assigns to the word "watch" in the two sentences.
# Indices are [sentence][token]: "watch" is token 3 of the first sentence and
# token 5 of the second.
watch_in_first = embeddings[0][3]
watch_in_second = embeddings[1][5]

print('Word embeddings for word WATCH in first sentence')
print(sess.run(watch_in_first))
print('Word embeddings for word WATCH in second sentence')
print(sess.run(watch_in_second))

Word embeddings for word WATCH in first sentence
[ 0.14079645 -0.15788531 -0.00950466 ...  0.4300597  -0.52887094
  0.06327899]
Word embeddings for word WATCH in second sentence
[-0.08213335  0.01050366 -0.01454147 ...  0.48705393 -0.54457957
  0.5262399 ]


In [None]:
# Not wrapped in sess.run(), so this displays the symbolic graph tensor rather than its numeric values.
embeddings[0][3]

<tf.Tensor 'strided_slice_5:0' shape=(1024,) dtype=float32>