In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import SGD
from sklearn.metrics import confusion_matrix, accuracy_score




In [2]:
train_data = pd.read_csv('Amazon review/train.ft.txt.bz2', compression='bz2', delimiter='\t', header=None).sample(25000, random_state=42)
test_data = pd.read_csv('Amazon review/test.ft.txt.bz2', compression='bz2', delimiter='\t', header=None).sample(25000, random_state=42)

In [3]:
train_data.head()

Unnamed: 0,0
2079998,__label__1 Expensive Junk: This product consis...
1443106,__label__1 Toast too dark: Even on the lowest ...
3463669,__label__2 Excellent imagery...dumbed down sto...
2914699,__label__1 Are we pretending everyone is marri...
1603231,__label__1 Not worth your time: Might as well ...


In [4]:
train_data.shape

(25000, 1)

In [5]:
test_data.head()

Unnamed: 0,0
23218,__label__2 This is a great book: I must prefac...
20731,__label__1 Huge Disappointment.: As a big time...
39555,__label__2 Wayne is tight but cant hang with T...
147506,__label__2 Excellent: I read this book when I ...
314215,__label__1 Not about Anusara: Although this bo...


In [6]:
def prepare_data(df):                  
    data = []                                   
    for index, row in df.iterrows():          
        line = row[0]                           
        label, text = line.split(' ', 1)        
        label = label.replace('__label__', '')  
        data.append((label, text.strip()))      
    cols = ['label', 'content'] 
                     
    return pd.DataFrame(data, columns=cols) 

In [7]:
train_data = prepare_data(train_data)
test_data = prepare_data(test_data)

In [8]:
train_data.head()

Unnamed: 0,label,content
0,1,Expensive Junk: This product consists of a pie...
1,1,"Toast too dark: Even on the lowest setting, th..."
2,2,Excellent imagery...dumbed down story: I enjoy...
3,1,Are we pretending everyone is married?: The au...
4,1,Not worth your time: Might as well just use a ...


In [9]:
train_data['label'] = train_data['label'].replace({"2":1, "1":0})
test_data['label'] = test_data['label'].replace({"2":1, "1":0})

In [10]:
train_data.head()

Unnamed: 0,label,content
0,0,Expensive Junk: This product consists of a pie...
1,0,"Toast too dark: Even on the lowest setting, th..."
2,1,Excellent imagery...dumbed down story: I enjoy...
3,0,Are we pretending everyone is married?: The au...
4,0,Not worth your time: Might as well just use a ...


In [11]:
test_data.head()

Unnamed: 0,label,content
0,1,This is a great book: I must preface this by s...
1,0,"Huge Disappointment.: As a big time, long term..."
2,1,Wayne is tight but cant hang with Turk.: This ...
3,1,Excellent: I read this book when I was in elem...
4,0,Not about Anusara: Although this book is toute...


In [12]:
train_data = train_data.dropna()
test_data = test_data.dropna()

In [13]:
X = train_data['content']
y = train_data['label']

In [14]:
X_test = test_data['content']
y_test = test_data['label']

In [15]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\win
[nltk_data]     11\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
combined_data = [X, X_test]
combined_data = pd.concat(combined_data, ignore_index=True)

In [18]:
ps = PorterStemmer()
corpus = []

for i in range(0, len(combined_data)):
    review = re.sub('[^a-zA-Z]', ' ', combined_data[i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('English')]
    review = ' '.join(review)
    corpus.append(review)

In [19]:
corpus

['expens junk product consist piec thin flexibl insul materi adhes back velcro white electr tape problem instruct three pictur littl inform velcro crumpl receiv stronger adhes tri disengag velcro piec came paint ceil white electr tape horribl cheap narrow fell less hour price ripoff build easier use cheaper attract higher r valu surpris amazon even list junk',
 'toast dark even lowest set toast dark like also light stay lit unplug avoid wast electr qualiti expect cuisinart',
 'excel imageri dumb stori enjoy disc video stun agre other stori dumb take childish approach actual seem like littl one side pro environment nevertheless enjoy would say howev amazon wmv hd disc better stori better sharper imag interest thing look',
 'pretend everyon marri author pretend parent neither die divorc insist marriag rock upon els behavior well child built send clear messag non tradit household book peopl play game life way everyon els suffer bad behavior deserv',
 'worth time might well use knife produ

In [20]:
vocab = set()
for sentence in corpus:
    vocab.update(sentence.split())  

vocab_size = len(vocab)

In [21]:
vocab_size

55400

In [22]:
onehot_conv = [one_hot(words, vocab_size) for words in corpus]
max_len = max(len(seq) for seq in onehot_conv)

In [23]:
max_len

167

In [24]:
sent_length = max_len
embedded_docs = pad_sequences(onehot_conv, padding='pre', maxlen=sent_length)
print(embedded_docs)

[[    0     0     0 ...  8123  2009 41842]
 [    0     0     0 ... 33828 14998 45258]
 [    0     0     0 ...   117  2119  5118]
 ...
 [    0     0     0 ... 32138 42491 18902]
 [    0     0     0 ... 49622  8281 46280]
 [    0     0     0 ... 35884 32604 10472]]


In [25]:
X = np.array(embedded_docs)
y = np.array(y)
y_test = np.array(y_test)

In [26]:
X.shape

(50000, 167)

In [27]:
y = np.concatenate((y, y_test))

In [28]:
y.shape

(50000,)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [30]:
X_train.shape

(25000, 167)

In [31]:
y_train.shape

(25000,)

In [32]:
def create_model(vocab_size, seq_len):
    embedding_vector_features = 128
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_features, input_length = seq_len))
    model.add(LSTM(64, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid')) 
    model.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.01), metrics=['accuracy'])

    model.summary()

    return model

In [33]:
model = create_model(vocab_size, sent_length)






Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 167, 128)          7091200   
                                                                 
 lstm (LSTM)                 (None, 167, 64)           49408     
                                                                 
 dropout (Dropout)           (None, 167, 64)           0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 7153057 (27.29 MB)
Trainable params: 7153057 (27.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, callbacks=[early_stopping])

Epoch 1/50












Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [35]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")



In [36]:
confusion_matrix(y_test, y_pred)

array([[10485,  2032],
       [ 1650, 10833]], dtype=int64)

In [37]:
accuracy_score(y_test, y_pred)

0.85272