In [64]:
import pandas as pd
import csv
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
review1 = pd.read_csv('sentiment-analysis-on-movie-reviews/train.csv')
review = review1.iloc[0:4000]
test = review1.iloc[4001:4200] 

### storing test set

y_test=np.asarray(test.Sentiment, dtype=int)
x_test=np.asarray(test.Phrase)

### storing training set

y_train=np.asarray(review.Sentiment, dtype=int)
x_train=review.Phrase

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/utkarsh.verma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
### processing the glove dataset to get embedding vector

def read_glove_vecs(glove_file):
    with open(glove_file, 'r',encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [21]:
### getting/storing the embedding vector

words_to_index, index_to_words, word_to_vec_map = read_glove_vecs("sentiment-analysis-on-movie-reviews/glove.6B.50d.txt")

In [22]:
### softmax function

def softmax(vector):
	e = np.exp(vector)
	return e / e.sum()

In [23]:
### RNN Model to learn

def model(x_train,y_train,word_to_vec_map,learning_rate=0.001,ite=800):
    #forward propagation
    C=5
    n_y=5
    b = np.zeros((5,1))
    W = np.random.randn(5, 50) / np.sqrt(50)
    Y_oh = convert_to_one_hot(y_train,C)
    for t in range(ite):                       
        for i in range(x_train.shape[0]):                                
            avg = avgvec(x_train[i], word_to_vec_map)
            z = np.dot(W,avg.reshape(50,1))+ b 
            a = softmax(z)
            # Compute cost using the i'th training label's one hot representation and "A" (the output of the softmax)
            cost = -Y_oh[i]*np.log(a)
            # Compute gradients 
            dz = a.reshape(5,1) - Y_oh[i].reshape(5,1)
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, 50))
            db = dz

            # Update parameters with Stochastic Gradient Descent
            W = W - learning_rate * dW
            b = b - learning_rate * db
        
        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(x_train, y_train, W, b, word_to_vec_map)
    return W, b, pred         

In [24]:
### onehot encoding for y

def convert_to_one_hot(Y, C):
    #Y= np.asarray(Y, dtype=int)
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [98]:
def predict(x_test,y_test,W,b,word_to_vec_map):
    #x_test=x_test.apply(lambda x: x.lower().split())
    pred=np.zeros((y_test.shape[0],1))
    for i in range(y_test.shape[0]):
        avg = avgvec(x_test[i], word_to_vec_map)
        z = np.dot(W,avg.reshape(50,1)) + b
        a = softmax(z)
        pred[i] = np.argmax(a)
        
    print("Accuracy: "  + str(np.mean((pred[:] == y_test.reshape(y_test.shape[0],1)[:]))))
    
    return pred   

In [33]:
### average of the sentence

def avgvec(sent,word_to_vec_map):
    lemmatizer = WordNetLemmatizer()
    sent = sent.lower()
    #sent = re.sub('[^a-zA-Z]','',)
    words = nltk.word_tokenize(sent)
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    avg = np.zeros(word_to_vec_map["no"].shape)
    xword=['substitutable','self-glorification','shiver-inducing','nerve-rattling','hell-jaunt','-lrb-','-rrb-','oscar-size','right-thinking','ham-fisted','middle-agers','#name?','through-line','non-techies','writer\\/director','second-guess','monsterous','fillm','over-indulgent','self-glorified','forgettably','candy-coat','screwed-up','pulpiness','ultra-cheesy','all-enveloping','tryingly','windtalker','tatter']
    for w in words:
        if w not in xword:
            avg += word_to_vec_map[w]
    if len(words)>0:       
        avg = avg/float(len(words))
    return avg

In [34]:
W,b,pred=model(x_train,y_train,word_to_vec_map)

Epoch: 0 --- cost = [[0.         0.         2.07507291 0.         0.        ]
 [0.         0.         1.6720141  0.         0.        ]
 [0.         0.         1.02432776 0.         0.        ]
 [0.         0.         1.65092769 0.         0.        ]
 [0.         0.         1.99749402 0.         0.        ]]
Accuracy: 0.5775
Epoch: 100 --- cost = [[0.         0.         4.14784182 0.         0.        ]
 [0.         0.         2.69339109 0.         0.        ]
 [0.         0.         0.1853502  0.         0.        ]
 [0.         0.         2.57654327 0.         0.        ]
 [0.         0.         4.63555548 0.         0.        ]]
Accuracy: 0.60925
Epoch: 200 --- cost = [[0.         0.         4.35957541 0.         0.        ]
 [0.         0.         2.66638795 0.         0.        ]
 [0.         0.         0.18129103 0.         0.        ]
 [0.         0.         2.55295792 0.         0.        ]
 [0.         0.         5.17260638 0.         0.        ]]
Accuracy: 0.6105
Epoch: 300 

In [36]:
###   LSTM model

import kerastuner as kt
def model_lstm(x_train,words_to_index,word_to_vec_map):
    
    input_shape = embedding_vector(x_train,words_to_index).shape
    input_tensor = Input(shape=(input_shape[1],),dtype='int32')
    
    vocab_len = len(words_to_index) + 1                  
    emb_dim = word_to_vec_map["no"].shape[0]
    emb_matrix = np.zeros((vocab_len,emb_dim))
    for word, index in words_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    embeddings = embedding_layer(input_tensor)
    #hp_units = hp.Int('units', min_value = 32, max_value = 512, step = 32)
    X=LSTM(128, return_sequences=True)(embeddings)
    X=Dropout(0.5)(X)
    X=LSTM(128, return_sequences=False)(X)
    X=Dropout(0.5)(X)
    X=Dense(5, activation='softmax')(X)
    
    
    model = Model(inputs=input_tensor, outputs=X)
    
    return model

In [35]:
###  Embedding Vector 

def embedding_vector(x_train,words_to_index):
    xword=['substitutable','self-glorification','shiver-inducing','nerve-rattling','hell-jaunt','-lrb-','-rrb-','oscar-size','right-thinking','ham-fisted','middle-agers','#name?','through-line','non-techies','writer\\/director','second-guess','monsterous','fillm','over-indulgent','self-glorified','forgettably','candy-coat','screwed-up','pulpiness','ultra-cheesy','all-enveloping','tryingly','windtalker','tatter']
    maxlen = len(max(x_train, key=len).split())
    embedding_vec=np.zeros((x_train.shape[0],maxlen))
    for i in range(x_train.shape[0]):
        sent = x_train[i]
        lemmatizer = WordNetLemmatizer()
        sent = sent.lower()
    #sent = re.sub('[^a-zA-Z]','',)
        words = nltk.word_tokenize(sent)
        words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
        for j in range(len(words)):
                if words[j] not in xword:
                    embedding_vec[i][j] = words_to_index[words[j]]
    
    return embedding_vec    
        
    

In [23]:
embedding_vector(x_train,words_to_index).shape

(4000, 42)

In [37]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
model = model_lstm(x_train,words_to_index,word_to_vec_map)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 42)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 42, 50)            20000050  
_________________________________________________________________
lstm (LSTM)                  (None, 42, 128)           91648     
_________________________________________________________________
dropout (Dropout)            (None, 42, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                

In [38]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
x_train_indices = embedding_vector(x_train,words_to_index)
Y_train_oh = convert_to_one_hot(y_train, C = 5)

In [40]:
fit_data=model.fit(x_train_indices, Y_train_oh, epochs = 50, batch_size = 64, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [41]:
x_test_indices = embedding_vector(x_test,words_to_index)
y_test_oh = convert_to_one_hot(y_test, C = 5)
loss, acc = model.evaluate(x_test_indices, y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.5477386713027954


In [208]:
input_shape = embedding_vector(x_train,words_to_index).shape
vocab_len = len(words_to_index) + 1                  
emb_dim = word_to_vec_map["no"].shape[0]
emb_matrix = np.zeros((vocab_len,emb_dim))
for word, index in words_to_index.items():
    emb_matrix[index, :] = word_to_vec_map[word]

In [214]:
from kerastuner import HyperModel
from tensorflow import keras

class CNNHyperModel(HyperModel):

    def __init__(self,input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes

    def build(self, hp):
        model = keras.Sequential()
        input_tensor=Input(shape=(input_shape[1],),dtype='int32')
        #model.add(Input(shape=(input_shape[1],),dtype='int32'))
        embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
        embedding_layer.build((None,))
        embedding_layer.set_weights([emb_matrix])
        embeddings = embedding_layer(input_tensor)
        model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
        model.add(LSTM(128, return_sequences=True))
        model.add(Dropout(rate=hp.Float(
                'dropout_1',
                min_value=0.0,
                max_value=0.5,
                default=0.25,
                step=0.05,
            ))
                 )
        model.add(LSTM(128, return_sequences=False))
        model.add(Dropout(rate=hp.Float(
                'dropout_2',
                min_value=0.0,
                max_value=0.5,
                default=0.25,
                step=0.05,
            ))
                 )
        model.add(Dense(
                units=hp.Int(
                    'units',
                    min_value=32,
                    max_value=512,
                    step=32,
                    default=128
                ),
                activation=hp.Choice(
                    'dense_activation',
                    values=['relu', 'tanh', 'sigmoid'],
                    default='relu'
                )
            )
                 )
        model.compile(
            optimizer=keras.optimizers.Adam(
                hp.Choice('learning_rate',
                          values=[1e-2, 1e-3, 1e-4])),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])
        return model
    
hypermodel = CNNHyperModel(input_shape=42, num_classes=5)

In [215]:
from kerastuner.tuners import RandomSearch
SEED = 1
MAX_TRIALS=20
EXECUTION_PER_TRIAL=2
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    seed=SEED,
    max_trials=MAX_TRIALS,
    executions_per_trial=EXECUTION_PER_TRIAL,
    directory='random_search',
    project_name='cifar10'
)

Traceback (most recent call last):
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/kerastuner/engine/hypermodel.py", line 104, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-214-c477df2ef153>", line 18, in build
    model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1806, in set_weights
    raise ValueError(
ValueError: You called `set_weights(weights)` on layer "embedding_1" with a weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[ 0.      ,  0.      ,  0.      , ...,  0....


Invalid model 0/5


Traceback (most recent call last):
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/kerastuner/engine/hypermodel.py", line 104, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-214-c477df2ef153>", line 18, in build
    model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1806, in set_weights
    raise ValueError(
ValueError: You called `set_weights(weights)` on layer "embedding_1" with a weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[ 0.      ,  0.      ,  0.      , ...,  0....


Invalid model 1/5


Traceback (most recent call last):
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/kerastuner/engine/hypermodel.py", line 104, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-214-c477df2ef153>", line 18, in build
    model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1806, in set_weights
    raise ValueError(
ValueError: You called `set_weights(weights)` on layer "embedding_1" with a weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[ 0.      ,  0.      ,  0.      , ...,  0....


Invalid model 2/5


Traceback (most recent call last):
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/kerastuner/engine/hypermodel.py", line 104, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-214-c477df2ef153>", line 18, in build
    model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1806, in set_weights
    raise ValueError(
ValueError: You called `set_weights(weights)` on layer "embedding_1" with a weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[ 0.      ,  0.      ,  0.      , ...,  0....


Invalid model 3/5


Traceback (most recent call last):
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/kerastuner/engine/hypermodel.py", line 104, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-214-c477df2ef153>", line 18, in build
    model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1806, in set_weights
    raise ValueError(
ValueError: You called `set_weights(weights)` on layer "embedding_1" with a weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[ 0.      ,  0.      ,  0.      , ...,  0....


Invalid model 4/5
Invalid model 5/5


Traceback (most recent call last):
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/kerastuner/engine/hypermodel.py", line 104, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-214-c477df2ef153>", line 18, in build
    model.add(Embedding(input_dim = vocab_len, output_dim = emb_dim, input_length = 42, trainable = False).set_weights([emb_matrix]))
  File "/Users/utkarsh.verma/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1806, in set_weights
    raise ValueError(
ValueError: You called `set_weights(weights)` on layer "embedding_1" with a weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[ 0.      ,  0.      ,  0.      , ...,  0....


RuntimeError: Too many failed attempts to build model.

In [80]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
corpus = []
for i in range(x_train.shape[0]):
    review_text = re.sub('[^a-zA-Z]', ' ', x_train[i])
    review_text = review_text.lower()
    review_text = review_text.split()
    review_text = [ps.stem(word) for word in review_text if not word in set(stopwords.words('english'))]
    review_text = ' '.join(review_text)
    corpus.append(review_text)
    
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer(max_features = 1500)
cv_tf = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()
X_tf = cv_tf.fit_transform(corpus).toarray()

In [99]:
# RNN Model to learn(For input vector of BOW & TFIDF)




def model_tf_bw(x_train,y_train,word_to_vec_map,learning_rate=0.001,ite=800):
    #forward propagation
    C=5
    n_y=5
    b = np.zeros((5,1))
    W = np.random.randn(5, 1017) / np.sqrt(1017)
    Y_oh = convert_to_one_hot(y_train,C)
    for t in range(ite):                       
        for i in range(x_train.shape[0]):                                
            #avg = avgvec(x_train[i], word_to_vec_map)
            z = np.dot(W,X[i].reshape(1017,1))+ b 
            a = softmax(z)
            # Compute cost using the i'th training label's one hot representation and "A" (the output of the softmax)
            cost = -np.log(a)*Y_oh[i]
            # Compute gradients 
            dz = a.reshape(5,1) - Y_oh[i].reshape(5,1)
            dW = np.dot(dz.reshape(n_y,1), X[i].reshape(1, 1017))
            db = dz

            # Update parameters with Stochastic Gradient Descent
            W = W - learning_rate * dW
            b = b - learning_rate * db
        
        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(x_train, y_train, W, b, word_to_vec_map)
    return W, b, pred      
    