Import necessary classes

In [0]:

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers
from keras.preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd


Define your variables and parameters

In [0]:

# Input shape: every tokenized sentence is padded to this many tokens.
maxlen = 100

# Network shape: embedding width, then the Conv1D filter count and window size.
embedding_dim = 50
num_filters, kernel_size = 64, 5

# Training schedule passed to model.fit().
epochs, batch_size = 20, 32


Import the data. 


In [0]:
# Load yelp_labelled.txt from the UCI "Sentiment Labelled Sentences" dataset.
# Download it first from:
# https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
try:
    # google.colab only exists inside Google Colab; importing it doubles as
    # the environment check.
    from google.colab import files
except ImportError:
    # Not on Colab: read the file from a local directory (adjust the path to
    # wherever the dataset was extracted).
    url = 'data/sentiment labelled sentences/yelp_labelled.txt'
else:
    import io

    # On Colab: open the upload dialog and wrap the uploaded bytes in a
    # file-like object that pandas can read.
    uploaded = files.upload()
    url = io.BytesIO(uploaded['yelp_labelled.txt'])

# The file is tab-separated; column names are supplied explicitly here.
data = pd.read_csv(url, names=['sentence', 'label'], sep='\t')
data.head()


Saving yelp_labelled.txt to yelp_labelled (3).txt


Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Select the ‘sentence’ and ‘label’ columns

In [0]:
# Pull the review text and its 0/1 label out of the frame as NumPy arrays.
sentences, labels = (data[column].values for column in ('sentence', 'label'))

Split your data into training and test sets

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the sentences for testing; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    random_state=1000,
    test_size=0.30,
)

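Tokenize the sentences: fit the tokenizer on the training set, then convert both sets into sequences of integer word indices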

In [0]:
# Learn the word -> index mapping from the training sentences only, so the
# test set does not influence the vocabulary.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Index 0 is reserved, so the vocabulary size is one larger than the number
# of distinct words the tokenizer has seen.
vocab_size = 1 + len(tokenizer.word_index)

# Replace each raw sentence with its list of integer word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

Pad in order to ensure that all sequences have the same length


In [0]:
# Pad every sequence at the end (padding='post') to a common length of
# `maxlen`, using the same settings for both splits.
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (X_train, X_test)
)

Create the model. Because this is a binary classification task, the last layer uses a sigmoid activation function and the loss is binary cross-entropy.

In [0]:
# Embedding -> 1D convolution -> global max pooling -> small dense head.
# The single sigmoid unit at the end outputs the probability of label 1.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    layers.Conv1D(num_filters, kernel_size, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked so
# that model.evaluate() returns it alongside the loss.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           84900     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 64)            16064     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                650       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 101,625
Trainable params: 101,625
Non-trainable params: 0
_________________________________________________________________


Train and test the model

In [0]:
# Train silently; the test split is passed as validation data, which Keras
# evaluates after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    verbose=False,
)

# Report accuracy on the training split first, then on the held-out test split.
for template, (inputs, targets) in (
    ("Training Accuracy: {:.4f}", (X_train, y_train)),
    ("Testing Accuracy:  {:.4f}", (X_test, y_test)),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(template.format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.8167
