In [38]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
import pandas as pd
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import backend as K

# Word2vec

In [14]:
Train = pd.read_csv('SentimentAnalysis/train.csv')

In [15]:
Train.dropna(inplace=True)

In [16]:
X_train = Train.text

In [17]:
def clean(data):
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)

    # Remove new line characters
    data = re.sub('\s+', ' ', data)

    # Remove distracting single quotes
    data = re.sub("\'", "", data)
        
    return data

In [22]:
data = []
data_to_list = X_train.values.tolist()
for i in range(len(data_to_list)):
    for i in sent_tokenize(clean(data_to_list[i])):
        temp = []

        # tokenize the sentence into words
        for j in word_tokenize(i):
            temp.append(j.lower())
    data.append(temp)

In [23]:
data

[['i', '`', 'd', 'have', 'responded', ',', 'if', 'i', 'were', 'going'],
 ['!'],
 ['my', 'boss', 'is', 'bullying', 'me', '...'],
 ['leave', 'me', 'alone'],
 ['sons',
  'of',
  '****',
  ',',
  'why',
  'couldn',
  '`',
  't',
  'they',
  'put',
  'them',
  'on',
  'the',
  'releases',
  'we',
  'already',
  'bought'],
 ['-',
  'some',
  'shameless',
  'plugging',
  'for',
  'the',
  'best',
  'rangers',
  'forum',
  'on',
  'earth'],
 ['2am',
  'feedings',
  'for',
  'the',
  'baby',
  'are',
  'fun',
  'when',
  'he',
  'is',
  'all',
  'smiles',
  'and',
  'coos'],
 ['soooo', 'high'],
 ['both', 'of', 'you'],
 ['hehe', '...', '(', 'is', 'that', 'possible', '!', '?', ')'],
 ['as',
  'much',
  'as',
  'i',
  'love',
  'to',
  'be',
  'hopeful',
  ',',
  'i',
  'reckon',
  'the',
  'chances',
  'are',
  'minimal',
  '=p',
  'i',
  '`',
  'm',
  'never',
  'gon',
  'na',
  'get',
  'my',
  'cake',
  'and',
  'stuff'],
 ['i',
  'really',
  'really',
  'like',
  'the',
  'song',
  'love',
  

In [24]:
model_cbow = gensim.models.Word2Vec(
    data,
    min_count = 1,
    size = 100,
    window = 5
)

In [32]:
print("Cosine similarity between 'funny' " + 
               "and 'awesome' - CBOW : ",
    model_cbow.similarity('funny', 'wonderland'))
      
print("Cosine similarity between 'sad' " +
                 "and 'bad' - CBOW : ",
      model_cbow.similarity('sad', 'bad'))

Cosine similarity between 'funny' and 'awesome' - CBOW :  0.5749547
Cosine similarity between 'sad' and 'bad' - CBOW :  0.9739759


  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [27]:
model_sg = gensim.models.Word2Vec(
    data,
    min_count = 1,
    size = 100,
    window = 5,
    sg = 1
)

In [30]:
print("Cosine similarity between 'funny' " + 
               "and 'awesome' - Skip gram : ",
    model_sg.similarity('funny', 'wonderland'))
      
print("Cosine similarity between 'sad' " +
                 "and 'bad' - Skip gram : ",
      model_sg.similarity('sad', 'bad'))

Cosine similarity between 'funny' and 'awesome' - Skip gram :  0.75168175
Cosine similarity between 'sad' and 'bad' - Skip gram :  0.89356714


  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [37]:
model_sg.wv.vectors

array([[-0.09886577,  0.40992126, -0.2907177 , ...,  0.20723571,
        -0.6223622 ,  0.14726076],
       [-0.37167954,  0.6341782 , -0.14609143, ...,  0.06368072,
        -0.47070983,  0.33830035],
       [-0.3543362 ,  0.298599  , -0.09627043, ...,  0.0652901 ,
        -0.5178383 ,  0.20937568],
       ...,
       [-0.02685356,  0.05414843,  0.04435802, ..., -0.00264727,
        -0.05130376,  0.02970337],
       [-0.02750463,  0.05693062,  0.0659005 , ..., -0.02529236,
        -0.02657408,  0.02523087],
       [-0.01193203,  0.03166804,  0.0427084 , ..., -0.01946358,
        -0.01451666,  0.0212598 ]], dtype=float32)

In [43]:
model_sg.wv.vector_size

100

In [44]:
max_words = 5000
model = Sequential()
model.add(layers.Embedding(
    len(model_sg.wv.vocab),
    model_sg.wv.vector_size,
    weights=[model_sg.wv.vectors],
    trainable=False
))
model.add(layers.Bidirectional(layers.LSTM(50,return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(50)))
model.add(layers.Dense(3,activation='softmax'))

In [45]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         2117500   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 100)         60400     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               60400     
_________________________________________________________________
dense (Dense)                (None, 3)                 303       
Total params: 2,238,603
Trainable params: 121,103
Non-trainable params: 2,117,500
_________________________________________________________________


# Utilizare embedding-uri GloVe

https://nlp.stanford.edu/projects/glove/
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html