In [1]:
import bz2
import re
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
import gc

In [3]:
# Open the bz2-compressed fastText training file for binary reading.
file = bz2.BZ2File(filename='train.ft.txt.bz2', mode='r')

In [4]:
# Read every raw (bytes) line, then close the archive handle right away
# instead of leaving it open until it is deleted / garbage-collected.
with file:
    file_lines = file.readlines()

In [5]:
# Turn each raw bytes line into text (UTF-8).
file_lines = [str(raw, 'utf-8') for raw in file_lines]

In [6]:
# The first space-delimited token is the label tag: '__label__1' -> 0, anything else -> 1.
label = [int(row.partition(' ')[0] != '__label__1') for row in file_lines]

In [7]:
# Everything after the label tag is the review text, lower-cased.
# Strip only an actual trailing newline: blindly slicing off the last
# character would eat a real character on a line without one.
review = [line.split(' ', 1)[1].rstrip('\n').lower() for line in file_lines]

In [8]:
# The raw handle and line list are no longer needed; drop the references.
del file, file_lines

In [9]:
# Run a garbage-collection pass right after dropping the raw file handle and
# line list; the displayed value is the number of unreachable objects found.
gc.collect()

53

In [10]:
# Assemble texts and labels into one frame with columns 'reviews' and 'label'.
reviews = pd.DataFrame.from_dict(dict(reviews=review, label=label))

In [11]:
# Class balance check: how many rows carry each label.
reviews.loc[:, 'label'].value_counts()

1    1800000
0    1800000
Name: label, dtype: int64

In [12]:
# Replace every digit with '0' (e.g. '1999' -> '0000').
# The pattern is a raw string: '\d' inside a normal literal is an invalid
# escape sequence that newer Python versions warn about.
reviews['reviews'] = reviews['reviews'].apply(lambda rev: re.sub(r'\d', '0', rev))

In [13]:
def _mask_urls(text):
    """Replace each run of non-space characters that ends in a dot followed
    by three lowercase letters (e.g. '.com') with the literal '<url>'."""
    return re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)


reviews['reviews'] = reviews['reviews'].map(_mask_urls)

In [14]:
# max_features: vocabulary size handed to the Tokenizer (and later the model).
# maxlen: fixed length every review sequence is padded/truncated to.
max_features, maxlen = 20000, 100
tkz = Tokenizer(num_words=max_features)

In [15]:
# Fit the tokenizer on the cleaned review texts (builds its word index).
tkz.fit_on_texts(reviews['reviews'])

In [16]:
# Map each review to a list of word ids, then pad/truncate every list to
# exactly `maxlen` entries so the result is one rectangular array.
tokenized_review = tkz.texts_to_sequences(texts=reviews['reviews'])
X = pad_sequences(sequences=tokenized_review, maxlen=maxlen)

In [17]:
# Peek at the first padded sequence (zeros are the padding).
X[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,    76,    11,     1,
         612,  6422,     8,   178,   494,    13,   365,     6,  6068,
           1,    10,    59,   443,    28,    70,     3,    40,  1826,
           6,    76,     5,   136,    73,   677, 16920,   146,   121,
           3,    20,   525,     1,   146,  1869,    17,    41,     7,
          29,     7,     1,   602,     3,    20,   129,   525,     6,
          45,     1,    90,   121,     6,  6690,   245,    37,  5128,
           2,   421,     4, 16818,   854,    16,  9039,  3039,     2,
        4341, 19329,     6,    40,  5214,   208,    73,  2728,     5,
         336], dtype=int32)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    reviews['label'],
    random_state=42,
    test_size=.33,
)

In [20]:
# Number of training rows.
X_train.shape[0]

2412000

In [21]:
# Number of held-out test rows.
X_test.shape[0]

1188000

In [None]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(max_features, maxlen, embedding_dim=25, lstm_units=150, dense_units=150):
    """Build and compile a stacked-LSTM classifier with a 2-way softmax head.

    Parameters
    ----------
    max_features : int
        Vocabulary size; first argument of the Embedding layer.
    maxlen : int
        Length of every input sequence (``input_length`` of the Embedding).
    embedding_dim : int, optional
        Size of each embedding vector. Defaults to 25.
    lstm_units : int, optional
        Units in each of the two LSTM layers. Defaults to 150.
    dense_units : int, optional
        Units in the hidden ReLU layer. Defaults to 150.

    Returns
    -------
    Sequential
        The model, compiled with categorical cross-entropy loss, the Adam
        optimizer and an accuracy metric. Its summary is printed as a side
        effect.
    """
    model = Sequential([
        Embedding(max_features, embedding_dim, input_length=maxlen),
        # return_sequences=True so the second LSTM receives the whole sequence.
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        Dense(dense_units, activation='relu'),
        # Two softmax outputs + categorical cross-entropy: targets must be
        # one-hot encoded with two columns.
        Dense(2, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [None]:
# Instantiate the network with the vocabulary size and sequence length used for preprocessing.
model = create_model(max_features=max_features, maxlen=maxlen)

In [None]:
# Training hyperparameters: samples per gradient step, and the upper bound on epochs.
batch_size, epochs = 2048, 100

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
weight_path = "early_weights.hdf5"

# Write the model to `weight_path` whenever validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath=weight_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Stop training after 5 epochs without a lower validation loss.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
)

callbacks = [checkpoint, early_stopping]

In [22]:
from keras.utils import to_categorical

In [None]:
# One-hot encode the training labels for the 2-unit softmax output.
# Only convert while the labels are still 1-D, so re-running this cell does
# not one-hot encode an already encoded array; num_classes is pinned to 2 so
# the target width always matches the output layer.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=2)

In [None]:
# Train with 20% of the training rows held out for validation; the callbacks
# checkpoint the best model and stop early when validation loss stalls.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.20,
    shuffle=True,
    callbacks=callbacks,
)

In [23]:
from keras.models import load_model

In [24]:
# Restore the model saved by the checkpoint callback.
model = load_model(filepath='early_weights.hdf5')




In [26]:
# Predicted class ids for the first 10 test rows.
# `Sequential.predict_classes` is deprecated and removed in newer Keras/TF;
# taking the argmax over the softmax probabilities yields the same class ids.
predictions = model.predict(X_test[:10]).argmax(axis=-1)

In [27]:
# Show the predicted class ids for the 10-row sample.
predictions

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0])

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [30]:
# Accuracy on the same 10-row sample that was predicted above.
print("Accuracy:", accuracy_score(y_test[:10], predictions))

Accuracy: 1.0


In [31]:
# Per-class precision / recall / F1 for the 10-row sample.
print(classification_report(y_true=y_test[:10], y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         2

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [32]:
# Confusion matrix (rows: true label, columns: predicted label) for the 10-row sample.
print(confusion_matrix(y_true=y_test[:10], y_pred=predictions))

[[8 0]
 [0 2]]
