In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonreviews/test.ft.txt.bz2
/kaggle/input/amazonreviews/train.ft.txt.bz2


In [2]:
! pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2
[0m

In [3]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential

import matplotlib.pyplot as plt
import bz2

In [4]:
data='/kaggle/input/amazonreviews'

train = '/kaggle/input/amazonreviews/train.ft.txt.bz2'
test = '/kaggle/input/amazonreviews/test.ft.txt.bz2'

In [5]:
def get_labels_and_text(file):
    labels=[]
    text=[]
    
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        text.append(x[10:].strip())
    
    labls = labels[:int(len(labels)*0.01)]
    text = text[:int(len(labels)*0.01)]
    
    return np.array(labels), text

train_labels, train_text = get_labels_and_text(train)
test_labels, test_text = get_labels_and_text(test)

In [6]:
train_df=pd.DataFrame(zip(train_text,train_labels),columns=['text','label'])
print(train_df.head())
test_df=pd.DataFrame(zip(test_text,test_labels),columns=['text','label'])
print(test_df.head())

                                                text  label
0  Stuning even for the non-gamer: This sound tra...      1
1  The best soundtrack ever to anything.: I'm rea...      1
2  Amazing!: This soundtrack is my favorite music...      1
3  Excellent Soundtrack: I truly like this soundt...      1
4  Remember, Pull Your Jaw Off The Floor After He...      1
                                                text  label
0  Great CD: My lovely Pat has one of the GREAT v...      1
1  One of the best game music soundtracks - for a...      1
2  Batteries died within a year ...: I bought thi...      0
3  works fine, but Maha Energy is better: Check o...      1
4  Great for the non-audiophile: Reviewed quite a...      1


In [7]:
train_df.head

<bound method NDFrame.head of                                                     text  label
0      Stuning even for the non-gamer: This sound tra...      1
1      The best soundtrack ever to anything.: I'm rea...      1
2      Amazing!: This soundtrack is my favorite music...      1
3      Excellent Soundtrack: I truly like this soundt...      1
4      Remember, Pull Your Jaw Off The Floor After He...      1
...                                                  ...    ...
35995  tiny but mighty and well built: used these alo...      1
35996  Perfect tweezers: I bought this tweezer two ye...      1
35997  didnt last too long: I only had a few uses out...      0
35998  not happy: This tool comes wtih it's own plast...      0
35999  Great Tweezer: Very easy to use, great tweezer...      1

[36000 rows x 2 columns]>

In [8]:
train_df[:5]

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


In [9]:
test_labels

array([1, 1, 0, ..., 0, 1, 0])

In [10]:
### PARAMETERS ### 

vocab_size = 10000
embed_dim = 64
max_length = 120
trunc_type = 'pre'
oov_tok = "<OOV>"

In [11]:
train_df_tl = train_df['text'].tolist()
train_df_tl

['Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^',
 "The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.",
 'Amazing!: This soundtrack is my favorite music of all

In [12]:
test_df_tl = test_df['text'].tolist()
test_df_tl

['Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"',
 "One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those kinds of s

In [13]:
token = Tokenizer(num_words=vocab_size, oov_token=oov_tok, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
token.fit_on_texts(train_df['text'].values)

word_idx = token.word_index

In [14]:
train_seq = token.texts_to_sequences(train_df['text'].values)
train_padded = pad_sequences(train_seq, maxlen=max_length, truncating=trunc_type)

test_seq = token.texts_to_sequences(test_df['text'].values)
test_padded = pad_sequences(test_seq, maxlen=max_length)

# Modelling

In [15]:
# USING A CONVOLUTION LAYER

model_conv = Sequential([
    layers.Embedding(vocab_size, embed_dim, input_length=max_length),
    layers.Conv1D(128, 6, activation='relu'),
    layers.GlobalMaxPooling1D(),
    
    layers.Dense(48, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model_conv.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 64)           640000    
                                                                 
 conv1d (Conv1D)             (None, 115, 128)          49280     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 48)                6192      
                                                                 
 dense_1 (Dense)             (None, 1)                 49        
                                                                 
Total params: 695,521
Trainable params: 695,521
Non-trainable params: 0
__________________________________________________

In [30]:
# Single Layer LSTM model

model_lstm = Sequential([
    layers.Embedding(vocab_size, embed_dim, input_length=max_length),
    layers.Bidirectional(layers.LSTM(64)),
#     layers.GlobalMaxPooling1D(),
    layers.Flatten(),
    
    layers.Dense(28, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 120, 64)           640000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 28)                3612      
                                                                 
 dense_7 (Dense)             (None, 1)                 29        
                                                                 
Total params: 709,689
Trainable params: 709,689
Non-trainable params: 0
________________________________________________

In [34]:
# Single Layer LSTM model

model_multi_lstm = Sequential([
    layers.Embedding(vocab_size, embed_dim, input_length=max_length),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
#     layers.GlobalMaxPooling1D(),
    layers.Flatten(),
    
    layers.Dense(28, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model_multi_lstm.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 120, 64)           640000    
                                                                 
 bidirectional_3 (Bidirectio  (None, 120, 128)         66048     
 nal)                                                            
                                                                 
 bidirectional_4 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 28)                1820      
                                                                 
 dense_9 (Dense)             (None, 1)                

In [36]:
model_gru = keras.Sequential([
    layers.Embedding(vocab_size, embed_dim, input_length=max_length),
    layers.Bidirectional(layers.GRU(32)),
    layers.Dense(28, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model_gru.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 120, 64)           640000    
                                                                 
 bidirectional_5 (Bidirectio  (None, 64)               18816     
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 28)                1820      
                                                                 
 dense_11 (Dense)            (None, 1)                 29        
                                                                 
Total params: 660,665
Trainable params: 660,665
Non-trainable params: 0
_________________________________________________________________


In [37]:
# opt = keras.optimizers.SGD(learning_rate=1e-3, weight_decay=1e-2, momentum=0.9)
opt = keras.optimizers.Adam(learning_rate=1e-3)

model_gru.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])

In [None]:
y1 = test_df['text'].tolist()
y1

In [None]:
type(train_padded)

In [None]:
y_train = pd.get_dummies(train_df['label']).values
y_test = pd.get_dummies(test_df['label']).values

In [39]:
history = model_gru.fit(train_padded, train_df['label'].values, #y_train
                   epochs=5, validation_data=(test_padded, test_df['label']. values))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [40]:
model = model_gru

In [45]:
text1 = "If you need quality then I would suggest that you buy these pliers, they will last for Decade or more. It's sturdy pliers, which weighted nearly (371 grams) net weight, which is bit heavy and (8 inch) long.I am not particularly fond of the extra guard near the metal to insulator union, because they do not slide down as far into my tool pouch; however, the rated insulating capacity is outstanding (1000 V). They are expensive then Chinese pliers, but I believe 'You get what you pay for' I highly recommend these pliers for the above average work load."
# text = "I was really disappointed with this and was really unhelpful."
# preprocess the text data
# text = preprocess_text(text)
text_sequence = token.texts_to_sequences([text1])
padded_sequence = pad_sequences(text_sequence, maxlen=max_length)
prediction = model_gru.predict(padded_sequence)
predicted_class = np.argmax(prediction)
sentiment = "positive" if predicted_class == 1 else "negative"
print("The sentiment of the text is:", sentiment)
predicted_class

The sentiment of the text is: negative


0

In [44]:
text = "I was really disappointed with this and was really unhelpful."
# preprocess the text data
# text = preprocess_text(text)
text_sequence = token.texts_to_sequences([text])
padded_sequence = pad_sequences(text_sequence, maxlen=max_length)
prediction = model_gru.predict(padded_sequence)
predicted_class = np.argmax(prediction)
sentiment = "positive" if predicted_class == 1 else "negative"
print("The sentiment of the text is:", sentiment)
predicted_class

The sentiment of the text is: negative


0

# 