<a href="https://colab.research.google.com/github/SouvikakaPuka/Data-Science/blob/master/MovieReviewsKerasSent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the pandas library to read our dataset
import pandas as pd
from pandas import DataFrame, Series
# Get the train/test split package from sklearn for preparing our dataset to
# train and test the model with
from sklearn.model_selection import train_test_split
#For PCA
from sklearn.decomposition import PCA, TruncatedSVD
# Import the numpy library to work with and manipulate the data
import numpy as np
import nltk
import random
nltk.download('punkt')
nltk.download('stopwords')
# nltk.download('movie_reviews')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, Dropout
from keras.optimizers import SGD, Adam, Nadam, RMSprop, Adamax
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv', sep='\t')
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
X = data['Phrase']
Y = data['Sentiment']
print(X.shape, Y.shape)

(156060,) (156060,)


In [5]:
# Get number of unique sentences
numSentences = data['SentenceId'].max()

# extract full sentences only from the dataset
fullSentences = []
curSentence = 0
for i in range(data.shape[0]):
  if data['SentenceId'][i]> curSentence:
    fullSentences.append((data['Phrase'][i], data['Sentiment'][i]))
    curSentence = curSentence +1

len(fullSentences)

8544

In [0]:
# put data into a df
fullSentDf = pd.DataFrame(fullSentences, columns=['Phrase', 'Sentiment'])

In [7]:
# Check class imbalance in tokenized sentences
data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [8]:
# Check class imbalance in full sentences
fullSentDf['Sentiment'].value_counts()

3    2325
1    2203
2    1659
4    1282
0    1075
Name: Sentiment, dtype: int64

In [0]:
documents = []
#convert data into list format

# Use only complete sentences
for i in range(fullSentDf.shape[0]):
  tmpWords = word_tokenize(fullSentDf['Phrase'][i])
  documents.append((tmpWords, fullSentDf['Sentiment'][i]))


In [10]:
random.seed(9001)
random.shuffle(documents)
print(documents[:][0])

(['Full', 'Frontal', 'had', 'no', 'effect', 'and', 'elicited', 'no', 'sympathies', 'for', 'any', 'of', 'the', 'characters', '.'], 1)


In [11]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
porter = PorterStemmer()
lancaster=LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_en = stopwords.words("english")
punctuations="?:!.,;'\"-()"

#parameters to adjust to see the impact on outcome
remove_stopwords = True
useStemming = False
useLemma = True
removePuncs = True

for l in range(len(documents)):
  label = documents[l][1]
  tmpReview = []
  for w in documents[l][0]:
    newWord = w
    if remove_stopwords and (w in stopwords_en):
      continue
    if removePuncs and (w in punctuations):
      continue
    if useStemming:
      #newWord = porter.stem(newWord)
      newWord = lancaster.stem(newWord)
    if useLemma:
      newWord = wordnet_lemmatizer.lemmatize(newWord)
    tmpReview.append(newWord)
  documents[l] = (' '.join(tmpReview), label)
print(documents[2])

('Ian Holm conquers France earthy Napoleon', 3)


In [0]:
# all_data = pd.DataFrame(documents, columns=['text', 'sentiment'])
# Splits the dataset so 70% is used for training and 30% for testing
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 2003)

In [0]:
all_data = pd.DataFrame(documents,
                                columns=['text', 'sentiment'])
# Splits the dataset so 70% is used for training and 30% for testing
X_train, X_test, Y_train, Y_test = train_test_split(all_data['text'], all_data['sentiment'], test_size=0.3)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#Initialize number of features to be considered
vec_length = 2500
# Transform each text into a vector of word counts
# vectorizer = CountVectorizer(stop_words="english",
#                              ngram_range=(1, 1))
vectorizer = TfidfVectorizer(stop_words="english",
                            ngram_range=(1, 1))
# vectorizer = CountVectorizer(stop_words="english",
#                              ngram_range=(1, 1), max_features = vec_length)
# vectorizer = TfidfVectorizer(stop_words="english",
#                             ngram_range=(1, 1), max_features = vec_length)
#X = vectorizer.fit_transform(all_data["text"])
#Y = all_data['sentiment']
x_train = vectorizer.fit_transform(X_train)
y_train = Y_train
x_test = vectorizer.transform(X_test)
y_test = Y_test

In [14]:
# Converts the datasets to numpy arrays to work with our PyTorch model

np.random.seed(9001)

#Convert the training data
# x_train_np = x_train
x_train_np = x_train.toarray()
y_train_np = to_categorical(y_train)
# y_train_np = np.array(y_train)

# Convert the testing data
# x_test_np = x_test
x_test_np = x_test.toarray()
y_test_np = to_categorical(y_test)
# y_test_np = np.array(y_test)

print(x_train_np.shape)
print(y_train_np.shape)
print(x_test_np.shape)
print(y_test_np.shape)

(109242, 2500)
(109242, 5)
(46818, 2500)
(46818, 5)


In [0]:
# Changing input variable distribution to format to feed the model

x_train_np = np.expand_dims(x_train_np, axis=2)
x_test_np = np.expand_dims(x_test_np, axis=2)

In [16]:
#create model

model = Sequential()
model.add(BatchNormalization())
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu', input_shape=(x_train_np.shape[1],1)))
# model.add(MaxPooling1D(pool_size =2))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
# model.add(MaxPooling1D(pool_size =2))
# model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size =2))
# model.add(GlobalAveragePooling1D())
model.add(Flatten())
model.add(Dense(100, activation='relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))




In [0]:
# Create metrics for Precision, Recall, and F-1 Score

from keras import backend as K

def recall(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  recall = true_positives / (possible_positives + K.epsilon())
  return recall

def precision(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
  precision = true_positives / (predicted_positives + K.epsilon())
  return precision

def f1score(y_true, y_pred):

  prec = precision(y_true, y_pred)
  rec = recall(y_true, y_pred)
  f1score = (2 * (prec * rec)/(rec + prec))
  return f1score


In [18]:
# Optimizers
adam = keras.optimizers.Adam(lr = 0.0001)
sgd = keras.optimizers.SGD(lr = 0.0001)
nadam = keras.optimizers.Nadam(lr = 0.0001)
rms = keras.optimizers.RMSprop(lr = 0.0001)
adamax = keras.optimizers.Adamax(lr = 0.0001)
#Compiling the model
model.compile(optimizer=adamax, loss='categorical_crossentropy', metrics=['acc', recall, precision, f1score])




In [19]:
batch_size = 128  # Batch size
epochs = 10 # Number of eopchs

#Training the model
history = model.fit(x_train_np, y_train_np, validation_data=( x_test_np, y_test_np), epochs= epochs, batch_size = batch_size)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 109242 samples, validate on 46818 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
#Evaluating the model on training data

train_loss, train_accuracy, train_recall, train_precision, train_f1score = model.evaluate(x_train_np, y_train_np)
print("Train Loss: ", train_loss)
print("Train Accuracy: ", train_accuracy)
print("Train Recall: ", train_recall)
print("Train Precision: ", train_precision)
print("Train F1 Score: ", train_f1score)

Train Loss:  0.7927420104840458
Train Accuracy:  0.7050401860068617
Train Recall:  0.6286593068654551
Train Precision:  0.7517089961854955
Train F1 Score:  0.6836895944586747


In [21]:
#Evaluating the model on test data

test_loss, test_accuracy, test_recall, test_precision, test_f1score = model.evaluate(x_test_np, y_test_np)
print("Test Loss: ", test_loss)
print("Test Accuracy: ", test_accuracy)
print("Test Recall: ", test_recall)
print("Test Precision: ", test_precision)
print("Test F1 Score: ", test_f1score)

Test Loss:  1.0302822533806333
Test Accuracy:  0.6120936391985988
Test Recall:  0.5372933487120338
Test Precision:  0.6529824115815587
Test F1 Score:  nan


In [0]:
model.save('1116613_1dconv_sent.h5')