## Exercise 3: Part 2

*In this task, you will perform text classification on the 20 newsgroups dataset. It is a collection of e-mails coming from
different newsgroups. The goal is to assign each e-mail to its corresponding newsgroup.*

*We build a basic CNN for this task using the Keras library.*

In [1]:
import numpy as np
import random
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *

In [2]:
# ================================================
#   2.1 Creating Data Splits -- global settings
# ================================================

# Tab-separated input file: "<label ints>\t<word-ID ints>" on every line.
input_file = 'data.txt'

pad_length = 300   # maxlen handed to pad_sequences when the data is read
train_verbose = 1  # forwarded to model.fit(verbose=...)
tmp_dir = '/tmp'   # scratch directory (not referenced by the cells shown here)

In [4]:
def read_data(input_file, maxlen=None):
    """Load the labelled, integer-encoded documents stored in ``input_file``.

    Every non-blank line must consist of the label integers, a tab character,
    and the word-ID integers; the integers on either side are separated by
    whitespace. Blank lines are skipped.

    Args:
        input_file: Path of the text file to read.
        maxlen: Length that ``pad_sequences`` pads/truncates every word-ID
            sequence to. Defaults to the notebook-level ``pad_length``.

    Returns:
        A pair ``(examples, vocab)``. ``examples`` is a list of
        ``(label, padded_word_ids)`` tuples, where ``label`` is a tuple of
        ints. ``vocab`` is the set of all word IDs encountered and always
        contains 0.

    Raises:
        ValueError: If a non-blank line has no tab separator or contains a
            token that cannot be parsed as an integer.
    """
    if maxlen is None:
        maxlen = pad_length

    # 0 is included up front: it is the value pad_sequences pads with by
    # default, so it can appear in the padded data even if no document uses it.
    vocab = {0}
    data_x = []
    data_y = []
    with open(input_file) as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing on the label/content unpacking.
                continue
            # Split on the first tab only; any further tabs are treated as
            # ordinary whitespace between word IDs.
            label, sep, content = line.partition('\t')
            if not sep:
                raise ValueError(
                    'line %d of %s has no tab separating label and content'
                    % (line_no, input_file))
            content = [int(v) for v in content.split()]
            vocab.update(content)
            data_x.append(content)
            data_y.append(tuple(int(v) for v in label.split()))

    data_x = pad_sequences(data_x, maxlen=maxlen)
    return list(zip(data_y, data_x)), vocab

In [5]:
# Parse the corpus once. vocab_size is later passed to the Embedding layer,
# so it has to be one larger than the biggest word ID.
data, vocab = read_data(input_file)
vocab_size = 1 + max(vocab)

In [12]:
# Seed TensorFlow's and Python's random number generators before shuffling.
tf.random.set_seed(42)
random.seed(42)

input_len = len(data)  # number of examples; shuffling does not change it
random.shuffle(data)   # in-place shuffle ahead of the train/dev/test split


In [13]:
# 80% / 10% / 10% split of the shuffled examples into train / dev / test.
# *_y: a sequence of 20-component one-hot vectors representing newsgroups
# *_x: a sequence of 300-component vectors where each entry is a word ID
train_end = (input_len * 8) // 10
dev_end = (input_len * 9) // 10

train_y, train_x = zip(*data[:train_end])
dev_y, dev_x = zip(*data[train_end:dev_end])
test_y, test_x = zip(*data[dev_end:])

In [14]:
# Shared default hyperparameters. Keep these as they are; when a task asks for
# different values, override them locally inside that task's own cell.
epochs = 2
batch_size = 64
embedding_dims = 100
filters = 75
# Keras uses a different definition where a kernel size of 3 means that
# 3 words are convolved at each step.
kernel_size = 3


*Keras CNN model*

In [102]:
# Task-local overrides of the shared defaults: this model uses 300-dimensional
# embeddings and a 2x2 convolution kernel.
cnn_embedding_dims = 300
num_classes = 20  # one softmax output per newsgroup

model = Sequential()
# Word IDs (pad_length per e-mail) -> dense vectors of size cnn_embedding_dims.
model.add(Embedding(vocab_size, cnn_embedding_dims, input_length=pad_length))
# Append a channel axis so the embedded text can be fed to Conv2D.
# No input_shape here or on Conv2D: only the first layer of a Sequential model
# defines the input, and the previous values referenced an undefined name
# (`trainx`) and a shape that did not match the real (300, 300, 1) input.
model.add(Reshape((pad_length, cnn_embedding_dims, 1)))
model.add(Conv2D(filters, kernel_size=(2, 2), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))
model.summary()

Model: "sequential_53"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_50 (Embedding)    (None, 300, 300)          25855200  
                                                                 
 reshape_11 (Reshape)        (None, 300, 300, 1)       0         
                                                                 
 conv2d_45 (Conv2D)          (None, 299, 299, 75)      375       
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 149, 149, 75)     0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 1665075)           0         
                                                                 
 dense_4 (Dense)             (None, 20)                33301520  
                                                     

In [94]:
# The splits are tuples produced by zip(); convert every split (including the
# test split, which is evaluated later) to a NumPy array so Keras receives one
# array per input/target. np.asarray leaves existing arrays as they are, so
# re-running this cell is harmless.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
dev_x = np.asarray(dev_x)
dev_y = np.asarray(dev_y)
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)

In [None]:
# Train the basic CNN with the shared default hyperparameters, then report
# its accuracy on the dev set.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    train_x,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=train_verbose,
    validation_data=(dev_x, dev_y),
)
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy.
dev_scores = model.evaluate(dev_x, dev_y, verbose=0)
print('Accuracy of simple CNN: %f\n' % dev_scores[1])

*After training the basic CNN on Google Colab, we obtained an accuracy of 0.783786 on the dev set.*

In [None]:
# Save the weights whenever the dev accuracy improves, so the best epoch can
# be reloaded after training.
model_checkpoint_callback = ModelCheckpoint(
    filepath='weights',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Stop once the dev accuracy has not improved for 2 consecutive epochs. The
# best weights are not restored here; they are reloaded from the checkpoint
# after training instead.
earlystopping_callback = EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    mode='max',
    restore_best_weights=False)

# Continues training the model compiled in the previous cell; epochs=50 is
# only an upper bound because early stopping may end training sooner.
model.fit(
    train_x,
    train_y,
    batch_size=batch_size,
    epochs=50,
    verbose=train_verbose,
    validation_data=(dev_x, dev_y),
    callbacks=[earlystopping_callback, model_checkpoint_callback])
model.load_weights('weights')

# The test split is still a tuple from zip() unless it was converted earlier;
# turn it into arrays here (a no-op for arrays) so evaluate() receives the
# same kind of input as for the train/dev splits.
test_x_arr = np.asarray(test_x)
test_y_arr = np.asarray(test_y)

print('Accuracy of best model on dev set: %f\n' % model.evaluate(dev_x, dev_y, verbose=0)[1])
print('Accuracy of best model on test set: %f\n' % model.evaluate(test_x_arr, test_y_arr, verbose=0)[1])

*After applying early stopping and model checkpoint we get:*

*Dev set accuracy of best model: 0.817313. Test set accuracy of best model: 0.809878.*