# Text preprocessing & Creating the model

In [21]:
# Import libraries

# General
import pandas as pd
import numpy as np 
import seaborn as sns

# Preprocessing data
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler

# Model architecture
import keras
from keras import models, regularizers, layers, optimizers, losses, metrics

In [2]:
# Import the dataset
bj = pd.read_csv('data\\cleaned_bj.csv')
bj.head(3)

Unnamed: 0,key,author,date,stars,helpful_yes,helpful_no,text,name,subhead,description,rating,rating_count,ingredients,text_len,ratio_helpful,sentiment,sentiment_cat
0,0_bj,Ilovebennjerry,2017-04-15,3,10,3,"Super good, don't get me wrong. But I came for...",Salted Caramel Core,Sweet Cream Ice Cream with Blonde Brownies & a...,Find your way to the ultimate ice cream experi...,3.7,208,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),...",603,0.769231,0,negative
1,0_bj,LaTanga71,2018-04-26,3,5,2,My caramel core begins to disappear about half...,Salted Caramel Core,Sweet Cream Ice Cream with Blonde Brownies & a...,Find your way to the ultimate ice cream experi...,3.7,208,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),...",715,0.714286,0,negative
2,0_bj,Kassidyk,2020-07-24,1,1,5,This ice cream is worst ice cream I’ve ever ta...,Salted Caramel Core,Sweet Cream Ice Cream with Blonde Brownies & a...,Find your way to the ultimate ice cream experi...,3.7,208,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),...",623,0.166667,0,negative


## Text preprocessing
To be able to process the text, which is needed for the sentiment analyzer, first all the noise is removed. This means converting all text to lower cases, only remain alphanumerical characters, lemmatize words to their dictionary form and remove stopwords.

In [3]:
#lower case all text
bj["text"]=bj["text"].str.lower()

#tokenization of words
bj["text"] = bj.apply(lambda row: word_tokenize(row['text']), axis=1)

#only alphanumerical values
bj["text"] = bj['text'].apply(lambda x: [item for item in x if item.isalpha()])

#lemmatazing words
bj["text"] = bj['text'].apply(lambda x : [WordNetLemmatizer().lemmatize(y) for y in x])

# Removing useless words
stop = stopwords.words('english')
bj["text"] = bj['text'].apply(lambda x: [item for item in x if item not in stop])

#joining all tokens
bj["text"] = bj["text"].apply(lambda x: str(' '.join(x)))

### Tokenization

In [4]:
max_words = 15000
max_len = bj['text_len'].max()

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(bj["text"])
sequences = tokenizer.texts_to_sequences(bj["text"])

In [5]:
# Add sequences and store it in variable X (feature)
X = pad_sequences(sequences, maxlen=max_len)
print(X)

[[  0   0   0 ... 269  18  60]
 [  0   0   0 ... 369  19 650]
 [  0   0   0 ...  84 193  58]
 ...
 [  0   0   0 ...  42 256 124]
 [  0   0   0 ...  28 144 110]
 [  0   0   0 ... 523  76  76]]


In [6]:
# Set target variable y
y = bj['sentiment']

### Train test split

In [7]:
# Split the dataset into a training (80%) and testing (20%) set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

<IPython.core.display.Javascript object>

In [8]:
# Split training set into training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.20, random_state = 42)

<IPython.core.display.Javascript object>

### Normalisation

In [9]:
scaler = StandardScaler()

# Fit only to the training data
scaler.fit(X_train)

StandardScaler()

In [10]:
#From pdataframe to numpy_array 
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

## Model architecture

### LSTM model

In [16]:
# Set vocabulary size (same as max words for tokenizing)
vocab = bj['text_len'].max()

In [18]:
# Set the model 
model = keras.Sequential()

<IPython.core.display.Javascript object>

In [19]:
# Add embedding layer to create a 3D floating-point tensor that can be processed by the NN
model.add(keras.layers.Embedding(vocab, 64))

<IPython.core.display.Javascript object>

In [22]:
# Add LSTM layer > dropout=0.5?
model.add(layers.LSTM(15, dropout = 0.3))

In [23]:
# Add dense layer: sigmoid function because of binary classification (positive, negative)
model.add(layers.Dense(1,activation='sigmoid'))

In [27]:
# Summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          91264     
_________________________________________________________________
lstm (LSTM)                  (None, 15)                4800      
_________________________________________________________________
dense (Dense)                (None, 1)                 16        
Total params: 96,080
Trainable params: 96,080
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Compile the model: adam optimzer and binary_crossentropy (because of two outcomes: positive and negative)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [29]:
# Fit the model to the training and validation data > want to add batch_size?
history = model.fit(X_train, y_train, epochs=40,validation_data=(X_val, y_val))

Epoch 1/40

InvalidArgumentError:  indices[21,1284] = 2673 is not in [0, 1426)
	 [[node sequential_1/embedding/embedding_lookup (defined at <ipython-input-29-227416541c74>:2) ]] [Op:__inference_test_function_4046]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_1/embedding/embedding_lookup:
 sequential_1/embedding/embedding_lookup/3532 (defined at C:\Users\RianneRendering\anaconda3\lib\contextlib.py:113)

Function call stack:
test_function


In [None]:
# Evualuating the model on the test data
train_results = model.evaluate(X_train, y_train)
test_results = model.evaluate(X_test, y_test)