# Libraries

In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gdown

np.random.seed(32)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.manifold import TSNE

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential

%matplotlib inline

In [106]:
train_file_id = '1s4RmM4rC41MacxmtqiJF50zCF_nnXRuP'
train_output_file = '/content/train.csv'
train_download_url = f'https://drive.google.com/uc?id={train_file_id}'
gdown.download(train_download_url, train_output_file, quiet=False, fuzzy = True)
df = pd.read_csv(train_output_file, encoding='latin1')

Downloading...
From: https://drive.google.com/uc?id=1s4RmM4rC41MacxmtqiJF50zCF_nnXRuP
To: /content/train.csv
100%|██████████| 99.4M/99.4M [00:00<00:00, 229MB/s]


In [107]:
df.head()

Unnamed: 0,id,brand,categories,dateAdded,dateUpdated,ean,keys,manufacturer,manufacturerNumber,name,...,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,upc
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",2017-07-25T00:52:42Z,2018-02-05T08:36:58Z,602537000000.0,"602537205981,universalmusic/14331328,universal...",Universal Music Group / Cash Money,14331328,Pink Friday: Roman Reloaded Re-Up (w/dvd),...,,0.0,5,https://redsky.target.com/groot-domain-api/v1/...,i love this album. it's very good. more to the...,Just Awesome,Los Angeles,,Joshua,602537000000.0
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",2017-07-25T05:16:03Z,2018-02-05T11:27:45Z,73416000391.0,lundbergorganiccinnamontoastricecakes/b000fvzw...,Lundberg,574764,Lundberg Organic Cinnamon Toast Rice Cakes,...,100209113.0,,5,https://www.walmart.com/reviews/product/29775278,Good flavor. This review was collected as part...,Good,,,Dorothy W,73416000391.0
2,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",2017-07-25T05:16:03Z,2018-02-05T11:27:45Z,73416000391.0,lundbergorganiccinnamontoastricecakes/b000fvzw...,Lundberg,574764,Lundberg Organic Cinnamon Toast Rice Cakes,...,100209113.0,,5,https://www.walmart.com/reviews/product/29775278,Good flavor.,Good,,,Dorothy W,73416000391.0
3,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",2017-07-25T16:26:19Z,2018-02-05T11:25:51Z,67981934427.0,"kylovesensualitypleasuregel/b00u2whx8s,0679819...",K-Y,67981934427,K-Y Love Sensuality Pleasure Gel,...,113026909.0,,1,https://www.walmart.com/reviews/product/43383370,I read through the reviews on here before look...,Disappointed,,,Rebecca,67981934427.0
4,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",2017-07-25T16:26:19Z,2018-02-05T11:25:51Z,67981934427.0,"kylovesensualitypleasuregel/b00u2whx8s,0679819...",K-Y,67981934427,K-Y Love Sensuality Pleasure Gel,...,171267657.0,,1,https://www.walmart.com/reviews/product/43383370,My husband bought this gel for us. The gel cau...,Irritation,,,Walker557,67981934427.0


In [108]:
df['reviews.rating'] = df['reviews.rating']<4

## Train Test Split

In [109]:
X_train, X_test, y_train, y_test = train_test_split(df['reviews.text'],df['reviews.rating'],test_size = 0.2)

### Preprocessing text for the (supervised) CBOW model

In [110]:
MAX_NB_WORDS = 20000

X_train = X_train.astype(str)
X_test = X_test.astype(str)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 26057 unique tokens.


In [111]:
sequences[0]

[589,
 37,
 1,
 128,
 80,
 125,
 5,
 690,
 29,
 61,
 3,
 244,
 130,
 341,
 17,
 21,
 1,
 887,
 8,
 1286,
 919,
 58,
 527,
 200,
 130,
 273,
 41,
 1626,
 804,
 94,
 117,
 3,
 19,
 72,
 612,
 110,
 44,
 85,
 180,
 1082,
 108,
 29,
 319,
 112,
 68,
 402,
 88,
 489,
 2,
 44,
 211,
 30,
 6,
 18,
 767]

In [112]:
type(tokenizer.word_index), len(tokenizer.word_index)

(dict, 26057)

In [113]:
index_to_word = dict((i, w) for w, i in tokenizer.word_index.items())

In [114]:
" ".join([index_to_word[i] for i in sequences[0]])

"i'd like the old tide back it cleaned so well and left clothes smelling great but the addition of acti lift has changed everything clothes come out stiff rather than soft and with no pleasant scent just an off odor i'm so disappointed i've been trying other brands i just can't use this product anymore"

In [115]:
seq_lens = [len(s) for s in sequences]
print("average length: %0.1f" % np.mean(seq_lens))
print("max length: %d" % max(seq_lens))

average length: 39.4
max length: 1034


In [116]:
MAX_SEQUENCE_LENGTH = 150

X_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train.shape)
print('Shape of data test tensor:', X_test.shape)

Shape of data tensor: (56835, 150)
Shape of data test tensor: (14209, 150)


In [117]:
y_train = to_categorical(np.asarray(y_train))
print('Shape of label tensor:', y_train.shape)
y_test = to_categorical(np.asarray(y_test))
print('Shape of label tensor:', y_test.shape)

Shape of label tensor: (56835, 2)
Shape of label tensor: (14209, 2)


In [118]:
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Model

EMBEDDING_DIM = 50
N_CLASSES = 2

sequence_input = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = tf.keras.layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM, trainable=True)
embedded_sequences = embedding_layer(sequence_input)
average = tf.keras.layers.GlobalAveragePooling1D()(embedded_sequences)
predictions = tf.keras.layers.Dense(N_CLASSES, activation='softmax')(average)
model = tf.keras.Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [119]:
model.fit(X_train, y_train, validation_split=0.1,
          epochs=10, batch_size=128)

Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - acc: 0.8625 - loss: 0.3903 - val_acc: 0.8608 - val_loss: 0.3598
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - acc: 0.8671 - loss: 0.3350 - val_acc: 0.8783 - val_loss: 0.3014
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - acc: 0.8858 - loss: 0.2797 - val_acc: 0.8876 - val_loss: 0.2728
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - acc: 0.8973 - loss: 0.2533 - val_acc: 0.8964 - val_loss: 0.2590
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - acc: 0.9045 - loss: 0.2366 - val_acc: 0.9008 - val_loss: 0.2510
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - acc: 0.9096 - loss: 0.2267 - val_acc: 0.9069 - val_loss: 0.2474
Epoch 7/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1

<keras.src.callbacks.history.History at 0x7a8e8a533460>

In [121]:
output_test = model.predict(X_test)
print("test auc:", roc_auc_score(y_test,output_test[:,1]))

[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
Test AUC: 0.9124084820025962


In [122]:
equence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, trainable=True)(sequence_input)

x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

predictions = Dense(N_CLASSES, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=predictions)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=128)

In [126]:
model.fit(X_train, y_train, validation_split=0.1,
          epochs=2, batch_size=128)

Epoch 1/2
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 539ms/step - acc: 0.9216 - loss: 0.2108 - val_acc: 0.9069 - val_loss: 0.2892
Epoch 2/2
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 525ms/step - acc: 0.9310 - loss: 0.1858 - val_acc: 0.9133 - val_loss: 0.2297


<keras.src.callbacks.history.History at 0x7a8ea4b3b910>

In [128]:
probs_class_1 = output_test[:, 1]

binary_labels_class_1 = y_test[:, 1]

print("Test AUC:", roc_auc_score(binary_labels_class_1, probs_class_1))

Test AUC: 0.9140721554776312


# CNN - LSTM

In [129]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(64, 5)(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Dropout(0.2)(x)
x = Conv1D(64, 5)(x)
x = MaxPooling1D(5)(x)
x = Dropout(0.2)(x)
x = LSTM(64)(x)
predictions = Dense(2, activation='softmax')(x)

model = Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [131]:
model.fit(X_train, y_train, validation_split=0.1,
          epochs=5, batch_size=128)

Epoch 1/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 111ms/step - acc: 0.9260 - loss: 0.1894 - val_acc: 0.9133 - val_loss: 0.2352
Epoch 2/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 107ms/step - acc: 0.9340 - loss: 0.1747 - val_acc: 0.9110 - val_loss: 0.2377
Epoch 3/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 99ms/step - acc: 0.9411 - loss: 0.1610 - val_acc: 0.9126 - val_loss: 0.2521
Epoch 4/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 98ms/step - acc: 0.9475 - loss: 0.1466 - val_acc: 0.9117 - val_loss: 0.2424
Epoch 5/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 101ms/step - acc: 0.9533 - loss: 0.1337 - val_acc: 0.9141 - val_loss: 0.2609


<keras.src.callbacks.history.History at 0x7a8ea4ac3160>

In [133]:
probs_class_1 = output_test[:, 1]

binary_labels_class_1 = y_test[:, 1]

print("Test AUC:", roc_auc_score(binary_labels_class_1, probs_class_1))

Test AUC: 0.911858391935076
