In [85]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from nltk.corpus import stopwords
import nltk

from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

%matplotlib inline

In [2]:
data =pd.read_csv("C:/Users/mohit/Desktop/NN/sentiment_data_5l.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,content
0,1631,hate,My stomach is killing me do j can't sleep
1,8481,anger,@anz_rocks19 i have to break the twitterparty...
2,763,hate,Taking back the HORRIBLE shoes my mum made me ...
3,396,sadness,my stupid tooooth hurts
4,18851,anger,too sick for rigging tomorrow.


In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

# Load Glove 50dim and create word_to_index, index_to_word, word_to_vec_map

In [86]:
word_embedding =open('D:/word_embedding/pre_trained_gloVe/glove.6B/glove.6B.50d.txt',encoding='utf8')

In [87]:
def read_glove_vecs(glovefile):
    word_list = []
    word_to_index = {}
    index_to_word = {}
    word_to_vec_map={}
    index = 0
    for line in glovefile: 
        word =line.split(" ")
        word_vec_list =np.float_(word[1:51])
        word_list.append(word[0])
        word_to_vec_map[word[0]] =word_vec_list
        word_to_index[word[0]] = index
        index = index + 1
        
    index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
    
    return (word_to_index, index_to_word, word_to_vec_map, word_list)

def pretrained_embedding_layer1(word_to_vec_map, word_to_index):
    
    vocab_len = len(word_to_vec_map)+1
    emb_dim = 50
    
    matrix = np.zeros((vocab_len, emb_dim))
    
    
    for word, index in word_to_index.items():
        matrix[index, : ] = word_to_vec_map[word]
        
    
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
  
    embedding_layer.build((None,))
    embedding_layer.set_weights([matrix])
    return embedding_layer
    
def sentences_to_indices1(X, word_to_index, max_len):
    m = X.shape[0]
    sentences_indices = np.zeros((m, max_len))
    
    print(sentences_indices.shape)
    
    for i in range(m):
        words = [word.lower() for word in X[i].split() if not word in stop_words]
        #words = [word.lower() for word in X[i].split()]
        
        
        j = 0
        for word in words:
            sentences_indices[i, j] = word_to_index.get(word)
            j += 1
            
    
    return sentences_indices 

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = [w.lower() for w in X[i].split()]
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if word_to_index.get(w) is not None:
                X_indices[i, j] = word_to_index[w]
                # Increment j to j + 1
                j += 1
            
    ### END CODE HERE ###
    
    return X_indices

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

def sentence_max_length(train_data):
    max_len = 0
    for sen in  train_data["content"].values:
        if len(sen.split()) > max_len:
            max_len = len(sen.split())
    return max_len

def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    
    sentence_indices = Input(input_shape, dtype='int32')
    
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    embeddings = embedding_layer(sentence_indices) 
    
    X = LSTM(128, return_sequences = True)(embeddings)
    X = Dropout(.5)(X)
    X = LSTM(128, return_sequences = False)(X)
    X = Dropout(.5)(X)
    X = Dense(5)(X)
    X = Activation('softmax')(X)
    
    model = Model(inputs=sentence_indices, outputs=X)
    return model
    

In [88]:
word_to_index, index_to_word, word_to_vec_map, word_list = read_glove_vecs(word_embedding)

In [89]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = 0.40951


In [90]:
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = sentences_to_indices(X1,word_to_index, max_len = 5)
print("X1 =", X1)
print("X1_indices =", X1_indices)

X1 = ['funny lol' 'lets play baseball' 'food is ready for you']
X1_indices = [[5.4660e+03 7.3048e+04 0.0000e+00 0.0000e+00 0.0000e+00]
 [8.2350e+03 2.8200e+02 1.4440e+03 0.0000e+00 0.0000e+00]
 [5.6500e+02 1.4000e+01 1.1880e+03 1.0000e+01 8.1000e+01]]


In [91]:
maxLen = sentence_max_length(train_data)
maxLen

32

In [92]:
X = train_data["content"]
X[0]

"My stomach is killing me do j can't sleep"

In [93]:
model = Emojify_V2((maxLen,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 32, 50)            20000050  
_________________________________________________________________
lstm_9 (LSTM)                (None, 32, 128)           91648     
_________________________________________________________________
dropout_9 (Dropout)          (None, 32, 128)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 645       
__________

In [94]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [95]:
X_train_indices = sentences_to_indices(train_data["content"].values, word_to_index, maxLen)
X_test_indices = sentences_to_indices(test_data["content"].values, word_to_index, maxLen)

In [97]:
le = LabelEncoder()
train_data_le_y = le.fit_transform(train_data[["sentiment"]])
test_data_le_y = le.transform(test_data[["sentiment"]])
le_classes = le.classes_
print(le_classes)

  y = column_or_1d(y, warn=True)


['anger' 'happiness' 'hate' 'love' 'sadness']


  y = column_or_1d(y, warn=True)


In [98]:
train_data_Y_hot_array = pd.get_dummies(pd.Series(train_data_le_y)).values
test_data_Y_hot_array = pd.get_dummies(pd.Series(test_data_le_y)).values

print(train_data_Y_hot_array.shape)
print(test_data_Y_hot_array.shape)

(396, 5)
(99, 5)


In [102]:

model.fit(X_train_indices, train_data_Y_hot_array, epochs = 50, batch_size = 32, shuffle=True)
#model.fit(X_train_indices_seq, train_data_Y_hot_array, epochs = 50, batch_size = 32, validation_data =(X_test_indices_seq, test_data_Y_hot_array))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x5a944c1550>

In [103]:
loss, acc = model.evaluate(X_test_indices, test_data_Y_hot_array)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.2929292932303265


In [128]:
predict_data = np.array(["I do not love zero miles"])
predict_data_indices = sentences_to_indices(predict_data, word_to_index, max_len = maxLen)
#print("predict_data =", predict_data)
#print("predict_data_indices =", predict_data_indices)

pred = model.predict(predict_data_indices)
index_of_sftmax = list(pred[0]).index(max(pred[0]))
predicted_le_class = le_classes[index_of_sftmax]


print("Content : ", predict_data[0], "<==> Prediction :", predicted_le_class)

Content :  I do not love zero miles <==> Prediction : sadness
