In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.utils import shuffle
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_recall_fscore_support,accuracy_score

In [2]:
def splitReviewsLabels(lines):
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        reviews.append(rev[:512])
        labels.append(label)
    return reviews, labels

In [3]:
def reviewToY(review):
    return [1,0] if review.split(' ')[0] == '__label__1' else [0,1]

In [4]:
def reviewToX(review):
    review = review.split(' ', 1)[1][:-1].lower()
    review = re.sub('\d','0',review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [6]:
file_path = 'E:/Backup/Project/Machine Learning/Natural language processing/'
train_file = bz2.BZ2File(file_path+'train.ft.txt.bz2')
test_file = bz2.BZ2File(file_path+'test.ft.txt.bz2')

train_lines = train_file.readlines()
test_lines = test_file.readlines()

# train_lines = [x.decode('utf-8') for x in train_lines]
# test_lines = [x.decode('utf-8') for x in test_lines]

bz2.BZ2File

In [15]:
train_lines = [x.decode('utf-8') for x in train_lines[:15000]]
test_lines = [x.decode('utf-8') for x in test_lines[:15000]]

AttributeError: 'str' object has no attribute 'decode'

In [12]:
reviews_train, y_train = splitReviewsLabels(train_lines)
reviews_test, y_test = splitReviewsLabels(test_lines)

100%|██████████| 15000/15000 [00:01<00:00, 13678.82it/s]
100%|██████████| 15000/15000 [00:00<00:00, 29182.98it/s]


In [16]:
reviews_train, y_train = shuffle(reviews_train, y_train)
reviews_test, y_test = shuffle(reviews_test, y_test)

In [17]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [22]:
max_features = 8192
maxlen = 128
embed_size = 64

In [23]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(reviews_train)

In [31]:
import pickle
with open('tokenizer.sav','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
token_train = tokenizer.texts_to_sequences(reviews_train)
token_test = tokenizer.texts_to_sequences(reviews_test)

In [25]:
x_train = pad_sequences(token_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(token_test, maxlen=maxlen, padding='post')

In [26]:
input = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(input)
net = Dropout(0.2)(net)
net = BatchNormalization()(net)

net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net1 = BatchNormalization()(net)

net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs = input, outputs = output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [28]:
# model.summary()

In [29]:
model.fit(x_train, y_train, batch_size=2048, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x247093b33d0>

In [30]:
model.evaluate (x_test, y_test)



[0.7176415920257568, 0.5071333050727844]