# Sentiment Classification


### Loading the dataset (5 points)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os

import warnings
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline

In [None]:
from keras.datasets import imdb

# Vocabulary cap: how many words from the dataset to consider, ordered by frequency.
vocab_size = 10000

(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=vocab_size,
)

In [None]:
# Shape of the training inputs as returned by imdb.load_data.
x_train.shape

In [None]:
# Shape of the test inputs as returned by imdb.load_data.
x_test.shape

In [None]:
# Shape of the test labels as returned by imdb.load_data.
y_test.shape

# Classes of Sentiments

In [None]:
# Distinct label values present in the training labels.
print(np.unique(y_train))

# Number of unique words in training set

In [None]:
# Flatten every review into one long array, then count the distinct word indices.
print(np.unique(np.hstack(x_train)).size)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# vocab_size: vocabulary cap passed to imdb.load_data.
# maxlen: number of words used from each review (length passed to pad_sequences).
vocab_size, maxlen = 10000, 300

## Train test split (5 points)

In [None]:
# Reload the reviews; each one is a list of integer word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Bring every review in both splits to the same length (`maxlen`).
x_train, x_test = (
    pad_sequences(reviews, maxlen=maxlen) for reviews in (x_train, x_test)
)

In [None]:
# First training review after pad_sequences has been applied.
x_train[0]

In [None]:
# The data is already in vectorized format

In [None]:
# Label of the first training review.
y_train[0]

## Build Keras Embedding Layer Model (30 points)
We can think of the Embedding layer as a dictionary that maps an index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-trained word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The Keras embedding layer doesn't require us to one-hot encode our words; instead, we have to give each word a unique integer number as an id. For the IMDB dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, MaxPool1D, GRU, LSTM, Dense,TimeDistributed, SpatialDropout1D

# Model compiling

In [None]:
# Training settings.
epochs = 6
batch_size = 64
validation_split = 0.1  # fraction reserved for validation
verbose = 2

# Layer sizes.
embed_dim = 128  # width of the Embedding layer's output vectors
lstm_out = 196   # number of units in the LSTM layer

In [None]:
# Binary sentiment classifier:
#   Embedding (vocab_size -> embed_dim vectors) -> SpatialDropout1D -> LSTM -> Dense(1, sigmoid).
model = Sequential([
    Embedding(vocab_size, embed_dim, input_length=maxlen),
    SpatialDropout1D(0.2),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

In [None]:
# Train on the padded reviews. Every setting comes from the hyperparameter cell,
# including validation_split, so editing that cell actually changes the run.
model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    verbose=verbose,
    validation_split=validation_split,
)

## Accuracy of the model & Retrieve the output of each layer in keras for a given single test sample from the trained model you built (10 Points)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Evaluate on the held-out test split. The model was compiled with a
# binary cross-entropy loss and an accuracy metric; `score` receives the first
# returned value and `acc` the second.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=2)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

In [None]:
# Padded word-index sequence of test review 3245.
x_test[3245]

In [None]:
# Label of test review 3245.
y_test[3245]

In [None]:
# Shape of a single padded test review.
x_test[3245].shape

In [None]:
from keras import backend as K

In [None]:
# Keep test review 3245 under a short name.
sample = x_test[3245]

In [None]:
# NOTE(review): notebook magic, not Python. TensorFlow is already imported (and the
# model trained) in earlier cells, so this presumably has no effect here — confirm
# and consider moving it to the top of the notebook or removing it.
%tensorflow_version 2.x

In [None]:
from keras import backend as K
from keras import layers
from tensorflow.keras.models import Model

In [None]:
# Auxiliary model that reuses the trained layers and returns the output of every
# layer instead of only the final prediction.
outputLayer = Model(
    inputs=model.layers[0].input,
    outputs=[layer.output for layer in model.layers],
)

# Run the review selected above (`sample`, i.e. x_test[3245]) rather than an
# unrelated x_test[0]; expand_dims adds the leading batch axis for predict().
output = outputLayer.predict(np.expand_dims(sample, axis=0))
print(output)