In [0]:
# Import the pandas library to read our dataset
import pandas as pd
#To split the data into training and test data
from sklearn.model_selection import train_test_split

#To manipulate data
import numpy as np
from keras.utils import to_categorical

In [6]:
# import data from the link
data = pd.read_csv('https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv', sep='\t')
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
#libraries used for data preprocessing
import nltk
import random
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#splitting each review into documents
documents = []

for i in range(data.shape[0]):
  tmpWords = word_tokenize(data['Phrase'][i])
  documents.append((tmpWords, data['Sentiment'][i]))

print(documents[1])

(['A', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose'], 2)


In [9]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer

porter = PorterStemmer()
lancaster=LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_en = stopwords.words("english")
punctuations="?:!.,;'\"-()"

#parameters to adjust to see the impact on outcome
remove_stopwords = True
useStemming = True
useLemma = False
removePuncs = True

for l in range(len(documents)):
  label = documents[l][1]
  tmpReview = []
  for w in documents[l][0]:
    newWord = w
    if remove_stopwords and (w in stopwords_en):
      continue
    if removePuncs and (w in punctuations):
      continue
    if useStemming:
      
      newWord = lancaster.stem(newWord)
    if useLemma:
      newWord = wordnet_lemmatizer.lemmatize(newWord)
    tmpReview.append(newWord)
  documents[l] = (' '.join(tmpReview), label)
print(documents[1])

('a sery escapad demonst ad good goos', 2)


In [0]:
all_data = pd.DataFrame(documents, columns=['Phrase', 'Sentiment'])
x_train, x_test, y_train, y_test = train_test_split(all_data['Phrase'], all_data['Sentiment'], train_size = 0.7, shuffle = True, random_state = 2003)

In [0]:
#vectorize the data 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range= (1,1), max_features=1500)
X = vectorizer.fit_transform(all_data['Phrase'])
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)

In [0]:
x_train_np = x_train.toarray()
y_train_np = to_categorical(y_train)
x_test_np = x_test.toarray()
y_test_np = to_categorical(y_test)

In [0]:
x_train = np.expand_dims(x_train_np, axis=2)
x_test = np.expand_dims(x_test_np, axis=2)

In [14]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

batch_size = 128

#model creation
model = Sequential()
#defing the layers
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu', input_shape=(x_train_np.shape[1],1)))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size =2))
model.add(Flatten())
keras.layers.Dropout(0.1, noise_shape=None, seed=None)
model.add(Dense(100, activation='relu'))
model.add(Dense(5, activation='softmax'))






In [15]:
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 1500, 128)         256       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1500, 128)         16512     
_________________________________________________________________
dense_1 (Dense)              (None, 1500, 100)         12900     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1500, 128)         12928     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1500, 128)         16512     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 750, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 96000)            

In [16]:
#model compilation and optimizer used is adamax
model.compile(optimizer='adamax', loss='categorical_crossentropy', metrics=['acc'])





In [18]:
#Trainig the model
model.fit(x_train, y_train_np, validation_data=(x_test, y_test_np), epochs = 10, batch_size = 128)

Train on 109242 samples, validate on 46818 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc71e6cae48>

In [19]:


#checking the metrics
train_accu = model.evaluate(x_train, y_train_np, verbose=0) 
test_accu = model.evaluate(x_test, y_test_np, verbose=0)

print("train accuracy: %.2f%%" % (train_accu[1]*100))
print("test_accuracy: %.2f%%" % (test_accu[1]*100))



train accuracy: 73.81%
test_accuracy: 61.35%


In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
y_pred = model.predict_classes(x_test, batch_size=128, verbose=0)
y_pred1=np.argmax(y_test_np, axis=1)

accuracy = accuracy_score(y_pred1, y_pred)
precision = precision_score(y_pred1, y_pred, average='weighted')
f1 = f1_score(y_pred1, y_pred, average='weighted')
recall = recall_score(y_pred1, y_pred, average='weighted')
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 score: %f' % f1)


Accuracy: 0.613546
Precision: 0.591608
Recall: 0.613546
F1 score: 0.591169


In [0]:
model.save('1111385_1dconv_reg.h5')

In [0]:
from keras.models import load_model
model=load_model('1111385_1dconv_reg.h5')