In [1]:
# # Loading drive
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import tensorflow as tf
print(tf.__version__)

2.4.0


In [4]:
# Downloading the data
!wget https://www.dropbox.com/s/tp3l54tnatvbldf/bbc.csv?dl=0 -O 'bbc.csv'

--2021-11-25 17:18:17--  https://www.dropbox.com/s/tp3l54tnatvbldf/bbc.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/tp3l54tnatvbldf/bbc.csv [following]
--2021-11-25 17:18:18--  https://www.dropbox.com/s/raw/tp3l54tnatvbldf/bbc.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca6033750e487dff2848fd643c3.dl.dropboxusercontent.com/cd/0/inline/BaqWv5Uhy6KYTQh_K1hvyTG0Bfce1Xwpt2VYNizjUW6f4GoNxr2os8rITKN6LAaKiMI_XjZANLWqmHOwVseDMEffddQTFu0CWSb6Q29snCUO2ILjU6t-3AUO6hKg_wKJUmATucGO7zli5csIG_UfjHm_/file# [following]
--2021-11-25 17:18:18--  https://uca6033750e487dff2848fd643c3.dl.dropboxusercontent.com/cd/0/inline/BaqWv5Uhy6KYTQh_K1hvyTG0Bfce1Xwpt2VYNizjUW6f4GoNxr2os8rITKN6LAaKiMI_XjZANLWqmHOwVseDMEffddQTFu0CWSb6

In [5]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.layers import SimpleRNN, LSTM, GRU, Conv1D, MaxPool1D, Activation, Add

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score

import matplotlib.pyplot as plt
import io, os, gc

In [6]:
# loading data
df = pd.read_csv("bbc.csv")
print(df.columns)
le = LabelEncoder()
input_labels = le.fit_transform(df['Class'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)
print(df.head())

Index(['Unnamed: 0', 'Article', 'Class'], dtype='object')
{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
   Unnamed: 0                                            Article     Class
0           0  Ad sales boost Time Warner profit\n\nQuarterly...  business
1           1  Dollar gains on Greenspan speech\n\nThe dollar...  business
2           2  Yukos unit buyer faces loan claim\n\nThe owner...  business
3           3  High fuel prices hit BA's profits\n\nBritish A...  business
4           4  Pernod takeover talk lifts Domecq\n\nShares in...  business


In [7]:
# Pre-processing data
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
import re
porter_stemmer = PorterStemmer()
doc_list = df['Article'].tolist()
processed_list = []
len_words = 0
for doc in doc_list:
  doc_str = re.sub(r'[^\w\s]', '', remove_stopwords(doc))
  doc_str_stem_lst = [porter_stemmer.stem(word) for word in doc_str.split(" ")]
  len_words += len(doc_str_stem_lst)
  doc_str_stem = " ".join(doc_str_stem_lst)
  processed_list.append(doc_str_stem)
avg_len_text = len_words / len(processed_list)
print('Data Pre-Processed!')
print('Average Length of each text document is', avg_len_text)

Data Pre-Processed!
Average Length of each text document is 227.56066945606693


In [8]:
# Defining the tokenizer
def get_tokenizer(data):
  print('Training tokenizer...')
  tokenizer = Tokenizer()
  print('Read {} Sentences'.format(len(data)))
  tokenizer.fit_on_texts(data)
  return tokenizer

In [9]:
def get_data(tokenizer, MAX_LENGTH, input_data, input_labels):
  print('Loading data')
  
  assert len(input_data) == len(input_labels)
  sequences = tokenizer.texts_to_sequences(input_data)
  X = pad_sequences(sequences, maxlen=MAX_LENGTH)
  Y_bcc = np.array(input_labels)

  return X, Y_bcc

In [10]:
tokenizer = get_tokenizer(processed_list)

Training tokenizer...
Read 1912 Sentences


In [11]:
MAX_LENGTH = 100
# read ml data
X, Y_bcc = get_data(tokenizer, MAX_LENGTH, processed_list, input_labels)

Loading data


In [12]:
# Creating one-hot encodings
y_bcc_labels = keras.utils.np_utils.to_categorical(Y_bcc)
print(y_bcc_labels)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [13]:
# Splitting data into train, val and test
train1_X, test_X, train1_Y, test_Y = train_test_split(X, y_bcc_labels, test_size=0.2, random_state=43)
train_X, val_X, train_Y, val_Y = train_test_split(train1_X, train1_Y, test_size=0.1, random_state=43)

In [14]:
# Setting hyper-parameters
NUM_CLASSES = 5

MAX_SEQUENCE_LENGTH = MAX_LENGTH

MAX_NUM_WORDS = len(tokenizer.word_index) + 1

NUM_EMBEDDING_DIM = 100

In [15]:
# Defining the model
def get_ff_nn_model():
  print('Getting Text FF NN')
  input_layer = Input(
      shape=(MAX_SEQUENCE_LENGTH, ), 
      dtype='int32')
  embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM, trainable=True)
  embedded_input = embedding_layer(input_layer)
  flatten_input = Flatten()(embedded_input)
  dense_layer_1 = Dense(2048, activation='tanh')
  drop_1 = Dropout(0.1)
  dense_layer_2 = Dense(1024, activation='tanh')
  drop_2 = Dropout(0.1)
  dense_layer_3 = Dense(512, activation='tanh')
  drop_3 = Dropout(0.1)
  dense_layer_4 = Dense(5, activation='softmax')
  # output_layer = dense_layer_4(drop_3(dense_layer_3(drop_2(dense_layer_2(drop_1(dense_layer_1(flatten_input)))))))
  output_layer = dense_layer_4(drop_2(dense_layer_2(drop_1(dense_layer_1(flatten_input)))))
  initial_ff_nn = Model(
      inputs=input_layer, 
      outputs=output_layer)
  print(initial_ff_nn.summary())
  return initial_ff_nn

In [16]:
BATCH_SIZE = 512
NUM_EPOCHS = 50
stop = [EarlyStopping(monitor='val_loss', patience=1)]
initial_ff_nn = get_ff_nn_model()
initial_ff_nn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])
history = initial_ff_nn.fit(x=train_X,
                    y=train_Y,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      val_X, 
                      val_Y
                    ),
                    shuffle=True,
#                    callbacks=stop,
          )

Getting Text FF NN
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          2253200   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 2048)              20482048  
                                                                 
 dropout (Dropout)           (None, 2048)              0         
                                                                 
 dense_1 (Dense)             (None, 1024)              2098176   
                                                                 
 dropout_1 (Dropout)         (None, 1024) 

In [17]:
# Saving the FF NN Converged Model
initial_ff_nn.save("converged_ff_nn_1")

INFO:tensorflow:Assets written to: converged_ff_nn_1/assets


In [18]:
pre_train_ff_nn = keras.models.load_model("converged_ff_nn_1")
BATCH_SIZE = 64
NUM_EPOCHS = 50
stop = [EarlyStopping(monitor='val_loss', patience=20)]
pre_train_ff_nn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])
history = pre_train_ff_nn.fit(x=train_X,
                    y=train_Y,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      val_X, 
                      val_Y
                    ),
                    shuffle=True,
#                    callbacks=stop,
          )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
# Defining the Vanilla RNN model
def get_rnn_model():
  print('RNN Model')
  input_layer = Input(
      shape=(MAX_SEQUENCE_LENGTH, ), 
      dtype='int32')
  embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM, trainable=True)
  embedded_input = embedding_layer(input_layer)
  rnn_layer_1 = SimpleRNN(64, activation = 'relu', recurrent_dropout=0.2)
  rnn_layer_2 = SimpleRNN(32, activation = 'relu', recurrent_dropout=0.2)
  drop_2 = Dropout(0.1)
  rnn_dense_connect = Dense(128, activation='tanh')
  drop_3 = Dropout(0.1)
  dense_layer_4 = Dense(5, activation='softmax')
  # output_layer = dense_layer_4(drop_3(rnn_dense_connect(drop_2(rnn_layer_2(rnn_layer_1(embedded_input))))))
  # intermediate_layer = Reshape((1, 512))(rnn_layer_1(embedded_input))
  # output_layer = dense_layer_4(drop_3(rnn_layer_2(intermediate_layer)))
  inter_1 = rnn_layer_1(embedded_input)
  inter_2 = rnn_layer_2(embedded_input)
  concatenated_tensor = Concatenate(axis=1)([inter_1, inter_2])
  output_layer = dense_layer_4(drop_3(rnn_dense_connect((drop_2(concatenated_tensor)))))
  rnn_model = Model(
      inputs=input_layer, 
      outputs=output_layer)
  print(rnn_model.summary())
  return rnn_model

In [20]:
BATCH_SIZE = 256
NUM_EPOCHS = 50
stop = [EarlyStopping(monitor='val_loss', patience=1)]
rnn_model = get_rnn_model()
rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy'])
history = rnn_model.fit(x=train_X,
                    y=train_Y,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      val_X, 
                      val_Y
                    ),
                    shuffle=True,
#                    callbacks=stop,
          )

Getting Text Vanilla RNN Model
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 100, 100)     2253200     ['input_2[0][0]']                
                                                                                                  
 simple_rnn (SimpleRNN)         (None, 64)           10560       ['embedding_1[0][0]']            
                                                                                                  
 simple_rnn_1 (SimpleRNN)       (None, 32)           4256        ['embedding_1[0][0]']            
                                                             

In [21]:
# Saving the RNN Model
rnn_model.save("rnn_model_1")

INFO:tensorflow:Assets written to: rnn_model_1/assets


In [22]:
# Getting predictions
from sklearn import metrics
from sklearn.metrics import classification_report

In [23]:
# FF NN - 1 Predictions
predictions = initial_ff_nn.predict(test_X)
y_pred_initial_ff_nn = [idx for idx in np.argmax(predictions, axis=1)]
y_true = [idx for idx in np.argmax(test_Y, axis=1)]
print('FF NN - 1BBC News Classification Accuracy is')
print(metrics.accuracy_score(y_true, y_pred_initial_ff_nn)*100)
print(classification_report(y_true, y_pred_initial_ff_nn, target_names = list(le_name_mapping.keys())))

FF NN - 1BBC News Classification Accuracy is
79.63446475195822
               precision    recall  f1-score   support

     business       0.85      0.84      0.84        79
entertainment       0.61      0.96      0.75        69
     politics       0.87      0.83      0.85        93
        sport       1.00      0.62      0.76        47
         tech       0.85      0.71      0.77        95

     accuracy                           0.80       383
    macro avg       0.83      0.79      0.79       383
 weighted avg       0.83      0.80      0.80       383



In [24]:
# FF NN - 2 Predictions
predictions = pre_train_ff_nn.predict(test_X)
y_pred_pre_trained_ff_nn = [idx for idx in np.argmax(predictions, axis=1)]
print('FF NN - 2 BBC News Classification Accuracy is')
print(metrics.accuracy_score(y_true, y_pred_pre_trained_ff_nn)*100)
print(classification_report(y_true, y_pred_pre_trained_ff_nn, target_names = list(le_name_mapping.keys())))

FF NN - 2 BBC News Classification Accuracy is
79.63446475195822
               precision    recall  f1-score   support

     business       0.85      0.84      0.84        79
entertainment       0.61      0.96      0.75        69
     politics       0.87      0.83      0.85        93
        sport       1.00      0.62      0.76        47
         tech       0.85      0.71      0.77        95

     accuracy                           0.80       383
    macro avg       0.83      0.79      0.79       383
 weighted avg       0.83      0.80      0.80       383



In [25]:
# RNN Predictions
predictions = rnn_model.predict(test_X)
y_pred_rnn = [idx for idx in np.argmax(predictions, axis=1)]
print('RNN BBC News Classification Accuracy is')
print(metrics.accuracy_score(y_true, y_pred_rnn)*100)
print(classification_report(y_true, y_pred_rnn, target_names = list(le_name_mapping.keys())))

RNN BBC News Classification Accuracy is
66.84073107049609
               precision    recall  f1-score   support

     business       0.81      0.72      0.77        79
entertainment       0.45      0.43      0.44        69
     politics       0.74      0.78      0.76        93
        sport       0.51      0.68      0.58        47
         tech       0.75      0.67      0.71        95

     accuracy                           0.67       383
    macro avg       0.65      0.66      0.65       383
 weighted avg       0.68      0.67      0.67       383



In [26]:
# Saving output predictions in csv file
init_df = pd.DataFrame()
init_df['predictions'] = y_pred_initial_ff_nn
init_df.to_csv('initial_ff_nn.csv')

pre_train_df = pd.DataFrame()
pre_train_df['predictions'] = y_pred_pre_trained_ff_nn
pre_train_df.to_csv('pre_trained_ff_nn.csv')

rnn_df = pd.DataFrame()
rnn_df['predictions'] = y_pred_rnn
rnn_df.to_csv('rnn.csv')

In [27]:
# Implementing Max Voting

max_vote_predictions = []
for i in range(len(y_pred_rnn)):
  class_counts = {0:0, 1:0, 2:0, 3:0, 4:0}
  class_counts[y_pred_initial_ff_nn[i]] += 1
  class_counts[y_pred_pre_trained_ff_nn[i]] += 1
  class_counts[y_pred_rnn[i]] += 1
  sorted_dict = dict(sorted(class_counts.items(), key=lambda item: item[1], reverse=True))
  sorted_classes = list(sorted_dict.keys())
  most_frequent = sorted_classes[0]
  max_vote_predictions.append(most_frequent)

In [28]:
# Max Vote Results
print('MAX VOTE Ensemble BBC News Classification Accuracy is')
print(metrics.accuracy_score(y_true, max_vote_predictions)*100)
print(classification_report(y_true, max_vote_predictions, target_names = list(le_name_mapping.keys())))

MAX VOTE Ensemble BBC News Classification Accuracy is
79.63446475195822
               precision    recall  f1-score   support

     business       0.85      0.84      0.84        79
entertainment       0.61      0.96      0.75        69
     politics       0.87      0.83      0.85        93
        sport       1.00      0.62      0.76        47
         tech       0.85      0.71      0.77        95

     accuracy                           0.80       383
    macro avg       0.83      0.79      0.79       383
 weighted avg       0.83      0.80      0.80       383



In [29]:
# Weighited Voting

# Getting weights
predictions_1 = initial_ff_nn.predict(val_X)
pred_1 = [idx for idx in np.argmax(predictions_1, axis=1)]
predictions_2 = pre_train_ff_nn.predict(val_X)
pred_2 = [idx for idx in np.argmax(predictions_2, axis=1)]
predictions_3 = rnn_model.predict(val_X)
pred_3 = [idx for idx in np.argmax(predictions_3, axis=1)]
y_val_true = [idx for idx in np.argmax(val_Y, axis=1)]
wt_initial_ff_nn = metrics.accuracy_score(y_val_true, pred_1)
wt_pre_trained_ff_nn = metrics.accuracy_score(y_val_true, pred_2)
wt_rnn = metrics.accuracy_score(y_val_true, pred_3)
print('FF NN 1 weight is', wt_initial_ff_nn)
print('FF NN 2 weight is', wt_pre_trained_ff_nn)
print('RNN weight is', wt_rnn)

FF NN 1 weight is 0.7843137254901961
FF NN 2 weight is 0.7843137254901961
RNN weight is 0.6209150326797386


In [30]:
# Implement Weighted Voting
weight_vote_predictions = []
for i in range(len(y_pred_rnn)):
  class_counts = {0:0, 1:0, 2:0, 3:0, 4:0}
  class_counts[y_pred_initial_ff_nn[i]] += wt_initial_ff_nn
  class_counts[y_pred_pre_trained_ff_nn[i]] += wt_pre_trained_ff_nn
  class_counts[y_pred_rnn[i]] += wt_rnn
  sorted_dict = dict(sorted(class_counts.items(), key=lambda item: item[1], reverse=True))
  sorted_classes = list(sorted_dict.keys())
  most_frequent = sorted_classes[0]
  weight_vote_predictions.append(most_frequent)

In [31]:
# Weighted Vote Results
print('Weighted Ensemble BBC News Classification Accuracy is')
print(metrics.accuracy_score(y_true, weight_vote_predictions)*100)
print(classification_report(y_true, weight_vote_predictions, target_names = list(le_name_mapping.keys())))

Weighted Ensemble BBC News Classification Accuracy is
79.63446475195822
               precision    recall  f1-score   support

     business       0.85      0.84      0.84        79
entertainment       0.61      0.96      0.75        69
     politics       0.87      0.83      0.85        93
        sport       1.00      0.62      0.76        47
         tech       0.85      0.71      0.77        95

     accuracy                           0.80       383
    macro avg       0.83      0.79      0.79       383
 weighted avg       0.83      0.80      0.80       383



In [32]:
# Converting to CSV 

mv_df = pd.DataFrame()
mv_df['predictions'] = max_vote_predictions
mv_df.to_csv('max_vote.csv')

wt_df = pd.DataFrame()
wt_df['predictions'] = weight_vote_predictions
wt_df.to_csv('weight_vote.csv')

In [33]:
# Reading data again
init_df = pd.read_csv('initial_ff_nn.csv')
init_preds = init_df['predictions']
pre_train_df = pd.read_csv('pre_trained_ff_nn.csv')
pre_train_preds = pre_train_df['predictions']
rnn_df = pd.read_csv('rnn.csv')
rnn_preds = rnn_df['predictions']
mv_df = pd.read_csv('max_vote.csv')
mv_preds = mv_df['predictions']
wt_df = pd.read_csv('weight_vote.csv')
wt_preds = wt_df['predictions']

In [34]:
wt_preds

0      4
1      4
2      4
3      2
4      0
      ..
378    1
379    4
380    0
381    2
382    0
Name: predictions, Length: 383, dtype: int64

In [35]:
# Getting Performance Statistics
mv_ens_correct_1 = 0
wt_ens_correct_1 = 0

mv_ens_correct_2 = 0
wt_ens_correct_2 = 0

y_true = [idx for idx in np.argmax(test_Y, axis=1)]
for i in range(len(mv_preds)):
  if (init_preds[i] != y_true[i]) or (pre_train_preds[i] != y_true[i]) or (rnn_preds[i] != y_true[i]):
    if mv_preds[i] == y_true[i]:
      mv_ens_correct_1 += 1
    if wt_preds[i] == y_true[i]:
      wt_ens_correct_1 += 1
  if (init_preds[i] != y_true[i]) and (pre_train_preds[i] != y_true[i]) and (rnn_preds[i] != y_true[i]):
    if mv_preds[i] == y_true[i]:
      mv_ens_correct_2 += 1
    if wt_preds[i] == y_true[i]:
      wt_ens_correct_2 += 1

In [36]:
print('Number of instances wrongly classified by a base model but correctly classified by the ensemble')
print(mv_ens_correct_1)
print(wt_ens_correct_1)

Number of instances wrongly classified by a base model but correctly classified by the ensemble
88
88


In [37]:
print('Number of instances wrongly classified by all base models but correctly classified by the ensemble')
print(mv_ens_correct_2)
print(wt_ens_correct_2)

Number of instances wrongly classified by all base models but correctly classified by the ensemble
0
0
