**Load the training dataset**

In [801]:
import glob
import numpy as np
import tensorflow as tf

x_train = []
y_train = []

pos_path = './data/aclImdb/train/pos/*.txt'
neg_path = './data/aclImdb/train/neg/*.txt'

# Populate the training dataset
for file_path in glob.glob(pos_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        x_train.append(file.read())
        y_train.append(1)
        
for file_path in glob.glob(neg_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        x_train.append(file.read())
        y_train.append(0)

In [802]:
len(x_train)

25000

In [803]:
len(y_train)

25000

**Loading the test dataset**

In [804]:
x_train[10000]

"I really liked Tom Barman's AWTWB. You just have to let it come over you and enjoy it while it lasts, and don't expect anything. It's like sitting on a café-terrace with a beer, in the summer sun, and watching the people go by. It definitely won't keep you pondering afterwards, that's true, but that's not a prerequisite for a good film. It's just the experience during the movie that's great.<br /><br />I felt there were a few strands that could have been worked out a little more, but being a Lynch fan I don't care that much anymore :)<br /><br />And I *loved* the style, or flair of this movie. It's slick, but fresh, and the soundtrack is a beauty. Any music-lover will get his kicks out of AWTWB, I can assure you.<br /><br />I'll give it 8 out 10.<br /><br />(music-wise 10 out of 10)"

In [805]:
test_pos_path = './data/aclImdb/test/pos/*.txt'
test_neg_path = './data/aclImdb/test/neg/*.txt'

x_test = []
y_test = []

# Populate the test dataset
for file_path in glob.glob(pos_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        x_test.append(file.read())
        y_test.append(1)
        
for file_path in glob.glob(neg_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        x_test.append(file.read())
        y_test.append(0)

print(len(x_test))
print(len(y_test))

25000
25000


In [806]:
x_test[1258]

'Jon Voight plays a man named Joe. Joe is shook up by a haunting childhood. He has a strong fear and hatred of religion due to his traumatic baptism. He quits his job as a dishwasher and goes out to become a hustler for wealthy people. He meets a misfit named Ratso(Dustin Hoffman) and the two for a relationship. They go out and work together in helping each other out. They become thieves. The two grow remarkably close and soon can\'t live without each other. However, there is something very important that Ratso hasn\'t told Joe, and it could destroy any hope they have of surviving the city together. This is one of the greatest films ever made. It is a heartbreaking and shattering portrait of too very lonely men who have nothing to lose but each other. Their story is devastating to watch, but is ultimately important for people to see. It\'s one of those films where the characters are pretty much just like the seemingly crazy people you sometimes find on the street. The difference is tha

**The IMDB movie review dataset consists of 50,000 reviews which is split into 25,000 train and 25,000 test reviews. Each of the train and test sets have been equally divided into 12,500 positive and negative reviews.**

**The reviews have unnecessary html tags and punctuations present which will be removed by using regular expressions.** 

In [807]:
import re

def cleanup_text(reviews):
    cleaned_reviews = []
    
    for review in reviews:
        review = re.sub(r'<.*?>', '', review)  # remove html tags first
        cleaned_text = re.sub(r'[^\w\s]', '', review)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # remove whitespace
        cleaned_text = cleaned_text.lower() # convert to lowercase
        cleaned_reviews.append(cleaned_text)
        
    return cleaned_reviews

x_train_cleaned = cleanup_text(x_train)
x_test_cleaned = cleanup_text(x_test)
        
    
x_test_cleaned[1258]

'jon voight plays a man named joe joe is shook up by a haunting childhood he has a strong fear and hatred of religion due to his traumatic baptism he quits his job as a dishwasher and goes out to become a hustler for wealthy people he meets a misfit named ratsodustin hoffman and the two for a relationship they go out and work together in helping each other out they become thieves the two grow remarkably close and soon cant live without each other however there is something very important that ratso hasnt told joe and it could destroy any hope they have of surviving the city together this is one of the greatest films ever made it is a heartbreaking and shattering portrait of too very lonely men who have nothing to lose but each other their story is devastating to watch but is ultimately important for people to see its one of those films where the characters are pretty much just like the seemingly crazy people you sometimes find on the street the difference is that this film is from thei

**We will perform Tokenization using NLTK to split the review into words and then perform stop word removal to get rid of words that add no meaning to the reviews. We use the RegexTokenizer available as part of the NLTK library to extract only words.**

In [808]:
# Tokenize x_train_cleaned and x_test_cleaned and remove stopwords from both
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')

tokenizer = RegexpTokenizer(r'\w+')

X_train = []
X_test = []

for text in x_train_cleaned:
    tokens = tokenizer.tokenize(text)
    final_tokens = [tk for tk in tokens if tk not in stop_words]

    # Construct a sentence with the tokens and store in a new training dataset
    final_text = " ".join(final_tokens)
    X_train.append(final_text)
    

for text in x_test_cleaned:
    tokens = tokenizer.tokenize(text)
    final_tokens = [tk for tk in tokens if tk not in stop_words]

    # Construct a sentence with the tokens and store in a new test dataset
    final_text = " ".join(final_tokens)
    X_test.append(final_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prero\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [809]:
X_train[2248]

'shift outlook neccesary enjoy modern british films one somehow allows seen right qualities rather criteria american films judged britfilm try hard gritty finds hard make warmth british films lord otherwise overwhelming competitorthis film fails content attaching predeccesor allowing easily seen work star director somewhere near end tethers couple decades later gregory teaching time two girls mind teaches school railing human rights abuses students hes fired find abuses midst must face whether hes talkthis subversive film theres usual worldly character american movie expect whatever naive man boy may still put everything line principles maybe certainly protestbynumbers though warm us film may seem realistic theyre urban gritty british films recent years dont try match america visceral thrills real british humour reveals truths'

In [810]:
len(X_train), len(X_test), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [811]:
X_train[1000]

'must see documentary missed opportunity 2004 definitely going watch repeat really sympathised main character film true milder condition skin problem dystrophic epidermolysis bullosa eb sad sometimes amusing emotional documentary boy terrible skin disorder jonny kennedy speaks like kid wasting vocal muscle never went puberty 36 years old sympathising moments seeing terrible condition pealing bandages jonny quite naughty sense humour even narrated beyond grave showing body coffin tells story help mother edna kennedy older brother celebrity model jonnys supporter nell mcandrew baftas best editing best new director factual nominated best sound factual flaherty documentary award number 10 100 greatest tv treats 2004 must see documentary'

In [812]:
total = 0
max_len = 0
for review in X_train:
    words = review.split()
    num_words = len(words)
    if num_words > max_len:
        max_len = num_words
    total += num_words
    
avg_num_words = total/len(X_train)
print(f"Average length of review: {avg_num_words} and Max length of a review in the dataset: {max_len}")

Average length of review: 121.18392 and Max length of a review in the dataset: 1429


In [813]:
tok = keras.preprocessing.text.Tokenizer()
tok.fit_on_texts(X_train)
X_train = tok.texts_to_sequences(X_train)
X_test = tok.texts_to_sequences(X_test)

**The maximum number of words in one review is 1429 and the average is 121. Since all reviews are of variable lengths, we need to truncate or pad the sequences uniformly in order to have reviews of the same length, that will eventually be fed into a CNN. We will use the pad_sequences function from Keras to standardize the lengths of the reviews.**

In [814]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_seq_len = 1000
X_train = keras.preprocessing.sequence.pad_sequences(X_train,padding='post',maxlen=max_seq_len)
X_test = keras.preprocessing.sequence.pad_sequences(X_test,padding='post',maxlen=max_seq_len)
X_train[1258]

print('X_train shape:', X_train.shape) # (n_samples, n_timesteps)
print('X_test shape:', X_test.shape)

X_train shape: (25000, 1000)
X_test shape: (25000, 1000)


In [815]:
y_train[12499], y_train[12500]

(1, 0)

In [816]:
# Convert y_train and y_test into numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

**Split the whole preprocessed training dataset into training and validation sets. Training set will be 80% and validation set will be 20% of the preprocessed data of 25,000 reviews.**

In [817]:
# Split the whole training dataset into training and validation sets
from sklearn.model_selection import train_test_split
X_train2, X_validation2, y_train2, y_validation2 = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

print(f"X_train2.shape: {X_train2.shape}, X_validation2.shape: {X_validation2.shape}")
print(f"y_train2.shape: {y_train2.shape}, y_validation2.shape: {y_validation2.shape}")

X_train2.shape: (20000, 1000), X_validation2.shape: (5000, 1000)
y_train2.shape: (20000,), y_validation2.shape: (5000,)


**We will be building a model by creating a Convolutional Neural Network for text classification.**

In [818]:
vocab_size = len(tok.word_index) # stores the number of unique words
print(f"The number of unique words: {vocab_size}")

The number of unique words: 142016


In [819]:
model = keras.Sequential()
# Input layer of total vocabulary, each feature is a 16 dimensional vector 
model.add(keras.layers.Embedding(vocab_size+1, 16, input_length=1000))  # add 1 to record for unknown words at index 0 
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Conv1D(filters=16,kernel_size=2,padding='same',activation='relu'))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dropout(0.15))
model.add(keras.layers.Dense(64, activation='tanh'))
model.add(keras.layers.Dropout(0.15))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_31 (Embedding)    (None, 1000, 16)          2272272   
                                                                 
 dropout_90 (Dropout)        (None, 1000, 16)          0         
                                                                 
 conv1d_31 (Conv1D)          (None, 1000, 16)          528       
                                                                 
 global_average_pooling1d_10  (None, 16)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_91 (Dropout)        (None, 16)                0         
                                                                 
 dense_62 (Dense)            (None, 64)                1088      
                                                     

In [820]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [821]:
model.fit(X_train2, y_train2,
            epochs=20,
            validation_data=(X_validation2, y_validation2),
            verbose=1,
            batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x216b08bf430>

In [822]:
y_pred = model.predict(X_test)
print(y_pred)
y_pred = y_pred.reshape(-1,)
y_pred_binary = np.round(y_pred).astype(int)
y_pred_binary

[[6.8352187e-01]
 [9.9971747e-01]
 [9.8498583e-01]
 ...
 [1.7016874e-04]
 [2.0691049e-03]
 [1.9067285e-05]]


array([1, 1, 1, ..., 0, 0, 0])

In [823]:
y_test

array([1, 1, 1, ..., 0, 0, 0])

In [824]:
print(f"Test Accuracy is: {accuracy_score(y_test, y_pred_binary) * 100}")

Test Accuracy is: 97.312
