# Foundations of Machine Learning (CS564)
## End Semester Assignment

### Creating a Vanilla RNN, FFN of Assignment 4, Pre-trained FFN (using weights of FFN in Assignment 4)
### and Ensemble Models by Majority Voting, Weighted Voting

<table style=\"font-size:25px\">
    <thead>
        <td><b>Name of Student</b></td>
        <td><b>Roll No.</b></td>
        <td><b>Date</b></td>
    </thead>
    <tr>
        <td>M. Maheeth Reddy</td>
        <td>1801CS31</td>
        <td>26-Nov-2021</td>
    </tr>
</table>

## Import Modules

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Concatenate, Flatten, Dropout, Dense
from keras.layers import SimpleRNN

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout
from keras.callbacks import EarlyStopping

from sklearn import metrics
from sklearn.metrics import classification_report

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
import re

## Loading Data

In [2]:
dataset = pd.read_csv("bbc.csv")
print(dataset.columns)
label_enc = LabelEncoder()
input_labels = label_enc.fit_transform(dataset['Class'])
label_enc_name_mapping = dict(zip(label_enc.classes_, label_enc.transform(label_enc.classes_)))
print(label_enc_name_mapping)
print(dataset.head())

Index(['Unnamed: 0', 'Article', 'Class'], dtype='object')
{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
   Unnamed: 0                                            Article     Class
0           0  Ad sales boost Time Warner profit\n\nQuarterly...  business
1           1  Dollar gains on Greenspan speech\n\nThe dollar...  business
2           2  Yukos unit buyer faces loan claim\n\nThe owner...  business
3           3  High fuel prices hit BA's profits\n\nBritish A...  business
4           4  Pernod takeover talk lifts Domecq\n\nShares in...  business


## Pre-processing data

In [3]:
porter_stemmer = PorterStemmer()
document_list = dataset['Article'].tolist()
processed_list = []
len_words = 0
for document in document_list:
    doc_str = re.sub(r'[^\w\s]', '', remove_stopwords(document))
    doc_str_stem_lst = [porter_stemmer.stem(word) for word in doc_str.split(" ")]
    len_words += len(doc_str_stem_lst)
    doc_str_stem = " ".join(doc_str_stem_lst)
    processed_list.append(doc_str_stem)

print('Pre-Processing data is finished!')
print('Average Length of each text document is {}'.format(len_words /len(processed_list)))

Pre-Processing data is finished!
Average Length of each text document is 227.56066945606693


## Tokenize Input Data

In [4]:
def get_tokenizer(data):
    print('Tokenizer is being trained, Please Wait')
    tokenizer = Tokenizer()
    print('Read {} Sentences'.format(len(data)))
    tokenizer.fit_on_texts(data)
    print('Tokenizer training done!')
    return tokenizer

def get_data(tokenizer, MAX_LENGTH, input_data, input_labels):
    print('Data is being loaded to tokenizer')
    assert len(input_data) == len(input_labels)
    sequences = tokenizer.texts_to_sequences(input_data)
    X = pad_sequences(sequences, maxlen=MAX_LENGTH)
    Y_bcc = np.array(input_labels)
    return X, Y_bcc

tokenizer = get_tokenizer(processed_list)

X, Y_bcc = get_data(tokenizer, 100, processed_list, input_labels)
y_bcc_labels = keras.utils.np_utils.to_categorical(Y_bcc)
print(y_bcc_labels)

Tokenizer is being trained, Please Wait
Read 1912 Sentences
Tokenizer training done!
Data is being loaded to tokenizer
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


## Split BBC News Dataset in 70:10:20 ratio

In [5]:
train_val_X, test_X, train_val_Y, test_Y = train_test_split(X, y_bcc_labels, test_size=0.2, random_state=43)
train_X, val_X, train_Y, val_Y = train_test_split(train_val_X, train_val_Y, test_size=0.1, random_state=43)

## Open a file to write accuracies of all models

In [6]:
acc_file = open('accuracies.txt','w')

## Open a file to write number of instances misclassified

In [7]:
misclass_file = open('misclassified.txt','w')

## Feedforward neural network from assignment-4 (re-implemented in this notebook)

### Defining the model

In [8]:
max_words_count = len(tokenizer.word_index) + 1

def ff_nn_initial_model():
    input_layer = Input(shape=(100, ), dtype='int32')
    embedding_layer = Embedding(max_words_count, 100, trainable=True)
    embedded_input = embedding_layer(input_layer)
    flatten_input = Flatten()(embedded_input)
    dense_layer_1 = Dense(2048, activation='tanh')
    drop_1 = Dropout(0.1)
    dense_layer_2 = Dense(1024, activation='tanh')
    drop_2 = Dropout(0.1)
    dense_layer_3 = Dense(5, activation='softmax')
    output_layer = dense_layer_3(drop_2(dense_layer_2(drop_1(dense_layer_1(flatten_input)))))
    ff_nn_initial = Model(inputs=input_layer, outputs=output_layer)
    print(ff_nn_initial.summary())
    return ff_nn_initial

### Training the model

In [9]:
stop = [EarlyStopping(monitor='val_loss', patience=1)]
ff_nn_initial = ff_nn_initial_model()
ff_nn_initial.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
history = ff_nn_initial.fit(x=train_X,y=train_Y,batch_size=512,epochs=75,validation_data=(val_X,val_Y),shuffle=True)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          2253200   
_________________________________________________________________
flatten (Flatten)            (None, 10000)             0         
_________________________________________________________________
dense (Dense)                (None, 2048)              20482048  
_________________________________________________________________
dropout (Dropout)            (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0     

Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


### Saving the FF NN Converged Model

In [10]:
ff_nn_initial.save("ff_nn_1_saved")

INFO:tensorflow:Assets written to: ff_nn_1_saved/assets


### Saving the predictions of the model

In [12]:
predictions = ff_nn_initial.predict(test_X)
y_pred_ff_nn_initial = [idx for idx in np.argmax(predictions, axis=1)]
y_true = [idx for idx in np.argmax(test_Y, axis=1)]
print('Classification Accuracy of Assignment-4 Feedforward NN is:')
print(metrics.accuracy_score(y_true, y_pred_ff_nn_initial)*100)
print(classification_report(y_true, y_pred_ff_nn_initial, target_names = list(label_enc_name_mapping.keys())))

# Saving output predictions in csv file
init_dataset = pd.DataFrame()
init_dataset['preds'] = y_pred_ff_nn_initial
init_dataset.to_csv('ff_nn_initial_preds.csv')

Classification Accuracy of Assignment-4 Feedforward NN is:
81.20104438642298
               precision    recall  f1-score   support

     business       0.95      0.68      0.79        79
entertainment       0.66      0.86      0.74        69
     politics       0.77      0.91      0.84        93
        sport       0.90      0.81      0.85        47
         tech       0.89      0.79      0.84        95

     accuracy                           0.81       383
    macro avg       0.83      0.81      0.81       383
 weighted avg       0.83      0.81      0.81       383



In [13]:
acc_file.write('Classification Accuracy of Assignment-4 Feedforward NN is: ')
acc_file.write(str(metrics.accuracy_score(y_true, y_pred_ff_nn_initial)*100)+'\n')
acc_file.write('Classification Report:\n')
acc_file.write(classification_report(y_true, y_pred_ff_nn_initial, target_names = list(label_enc_name_mapping.keys())))
acc_file.write('============================================\n')

45

## Create the feedforward neural network from weights saved above

In [14]:
ff_nn_new = keras.models.load_model("ff_nn_1_saved")
stop = [EarlyStopping(monitor='val_loss', patience=20)]
ff_nn_new.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
history = ff_nn_new.fit(x=train_X,y=train_Y,batch_size=64,epochs=50,validation_data=(val_X,val_Y),shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Save the predictions of the model to CSV

In [15]:
predictions = ff_nn_new.predict(test_X)
y_pred_ff_nn_new = [idx for idx in np.argmax(predictions, axis=1)]
print('Classification Accuracy of Pretrained Feedforward Neural Network is:')
print(metrics.accuracy_score(y_true, y_pred_ff_nn_new)*100)
print(classification_report(y_true, y_pred_ff_nn_new, target_names = list(label_enc_name_mapping.keys())))


# Saving output predictions in csv file
ff_nn_new_dataset = pd.DataFrame()
ff_nn_new_dataset['preds'] = y_pred_ff_nn_new
ff_nn_new_dataset.to_csv('ff_nn_new_preds.csv')

Classification Accuracy of Pretrained Feedforward Neural Network is:
81.4621409921671
               precision    recall  f1-score   support

     business       0.95      0.70      0.80        79
entertainment       0.67      0.86      0.75        69
     politics       0.77      0.92      0.84        93
        sport       0.90      0.81      0.85        47
         tech       0.89      0.78      0.83        95

     accuracy                           0.81       383
    macro avg       0.84      0.81      0.82       383
 weighted avg       0.84      0.81      0.82       383



In [16]:
acc_file.write('Classification Accuracy of Pretrained Feedforward NN is: ')
acc_file.write(str(metrics.accuracy_score(y_true, y_pred_ff_nn_new)*100)+'\n')
acc_file.write('Classification Report:\n')
acc_file.write(classification_report(y_true, y_pred_ff_nn_new, target_names = list(label_enc_name_mapping.keys())))
acc_file.write('============================================\n')

45

## Vanilla RNN model

### Define the model

In [17]:
def rnn_model_create():
    input_layer = Input(shape=(100, ), dtype='int32')
    embedding_layer = Embedding(max_words_count, 100, trainable=True)
    embedded_input = embedding_layer(input_layer)
    rnn_layer_1 = SimpleRNN(64, activation = 'relu', recurrent_dropout=0.2)
    rnn_layer_2 = SimpleRNN(32, activation = 'relu', recurrent_dropout=0.2)
    drop_2 = Dropout(0.1)
    rnn_dense_connect = Dense(128, activation='tanh')
    drop_3 = Dropout(0.1)
    dense_layer_4 = Dense(5, activation='softmax')
    inter_1 = rnn_layer_1(embedded_input)
    inter_2 = rnn_layer_2(embedded_input)
    concatenated_tensor = Concatenate(axis=1)([inter_1, inter_2])
    output_layer = dense_layer_4(drop_3(rnn_dense_connect((drop_2(concatenated_tensor)))))
    rnn_model = Model(inputs=input_layer, outputs=output_layer)
    print(rnn_model.summary())
    return rnn_model

### Train the model and save the weights

In [18]:
stop = [EarlyStopping(monitor='val_loss', patience=1)]
rnn_model = rnn_model_create()
rnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),loss='categorical_crossentropy',metrics=['accuracy'])
history = rnn_model.fit(x=train_X,y=train_Y,batch_size=256,epochs=50,validation_data=(val_X,val_Y),shuffle=True)

# Saving the RNN Model
rnn_model.save("rnn_model_1")

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 100)     2253200     input_2[0][0]                    
__________________________________________________________________________________________________
simple_rnn (SimpleRNN)          (None, 64)           10560       embedding_1[0][0]                
__________________________________________________________________________________________________
simple_rnn_1 (SimpleRNN)        (None, 32)           4256        embedding_1[0][0]                
____________________________________________________________________________________________

Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
INFO:tensorflow:Assets written to: rnn_model_1/assets


### Save the predictions to CSV

In [19]:
predictions = rnn_model.predict(test_X)
y_pred_rnn = [idx for idx in np.argmax(predictions, axis=1)]
print('Classification Accuracy of RNN is:')
print(metrics.accuracy_score(y_true, y_pred_rnn)*100)
print(classification_report(y_true, y_pred_rnn, target_names = list(label_enc_name_mapping.keys())))

# Saving output predictions in csv file
rnn_dataset = pd.DataFrame()
rnn_dataset['preds'] = y_pred_rnn
rnn_dataset.to_csv('rnn_preds.csv')

Classification Accuracy of RNN is:
65.0130548302872
               precision    recall  f1-score   support

     business       0.94      0.77      0.85        79
entertainment       0.46      0.57      0.51        69
     politics       0.86      0.71      0.78        93
        sport       0.45      0.45      0.45        47
         tech       0.57      0.65      0.61        95

     accuracy                           0.65       383
    macro avg       0.65      0.63      0.64       383
 weighted avg       0.68      0.65      0.66       383



In [20]:
acc_file.write('Classification Accuracy of RNN is: ')
acc_file.write(str(metrics.accuracy_score(y_true, y_pred_rnn)*100)+'\n')
acc_file.write('Classification Report:\n')
acc_file.write(classification_report(y_true, y_pred_rnn, target_names = list(label_enc_name_mapping.keys())))
acc_file.write('============================================\n')

45

## Majority Voting Ensemble Model

In [21]:
majority_predicts = []
for i in range(len(y_pred_rnn)):
    class_counts = {i:0 for i in range(5)}
    class_counts[y_pred_ff_nn_initial[i]] += 1
    class_counts[y_pred_ff_nn_new[i]] += 1
    class_counts[y_pred_rnn[i]] += 1
    sorted_classes = list(dict(sorted(class_counts.items(), key=lambda x:x[1], reverse=True)).keys())
    most_frequent = sorted_classes[0]
    majority_predicts.append(most_frequent)

print('Accuracy of Ensemble model by Majority Voting is')
print(metrics.accuracy_score(y_true, majority_predicts)*100)
print(classification_report(y_true, majority_predicts, target_names=list(label_enc_name_mapping.keys())))

# Save Results to CSV
maj_dataset = pd.DataFrame()
maj_dataset['preds'] = majority_predicts
maj_dataset.to_csv('ensemble_majority_preds.csv')

Accuracy of Ensemble model by Majority Voting is
81.72323759791122
               precision    recall  f1-score   support

     business       0.95      0.70      0.80        79
entertainment       0.67      0.86      0.75        69
     politics       0.77      0.92      0.84        93
        sport       0.90      0.81      0.85        47
         tech       0.89      0.79      0.84        95

     accuracy                           0.82       383
    macro avg       0.84      0.81      0.82       383
 weighted avg       0.84      0.82      0.82       383



In [22]:
acc_file.write('Classification Accuracy of Majority Voting Ensemble Model is: ')
acc_file.write(str(metrics.accuracy_score(y_true, majority_predicts)*100)+'\n')
acc_file.write('Classification Report:\n')
acc_file.write(classification_report(y_true, majority_predicts, target_names = list(label_enc_name_mapping.keys())))
acc_file.write('============================================\n')

45

## Weighted Voting Ensemble Model

### Getting Weights

In [23]:
predictions_1 = ff_nn_initial.predict(val_X)
pred_1 = [idx for idx in np.argmax(predictions_1, axis=1)]
predictions_2 = ff_nn_new.predict(val_X)
pred_2 = [idx for idx in np.argmax(predictions_2, axis=1)]
predictions_3 = rnn_model.predict(val_X)
pred_3 = [idx for idx in np.argmax(predictions_3, axis=1)]
y_val_true = [idx for idx in np.argmax(val_Y, axis=1)]

wt_ff_nn_initial = metrics.accuracy_score(y_val_true, pred_1)
wt_ff_nn_new = metrics.accuracy_score(y_val_true, pred_2)
wt_rnn = metrics.accuracy_score(y_val_true, pred_3)

print('Weight of Initial FF-NN is {}, Pretrained FF-NN is {}, RNN is {}'.format(wt_ff_nn_initial, wt_ff_nn_new, wt_rnn))

Weight of Initial FF-NN is 0.7843137254901961, Pretrained FF-NN is 0.803921568627451, RNN is 0.6601307189542484


### Predictions of model

In [24]:
weighted_predicts = []
for i in range(len(y_pred_rnn)):
    class_counts = {0:0, 1:0, 2:0, 3:0, 4:0}
    class_counts[y_pred_ff_nn_initial[i]] += wt_ff_nn_initial
    class_counts[y_pred_ff_nn_new[i]] += wt_ff_nn_new
    class_counts[y_pred_rnn[i]] += wt_rnn
    sorted_classes = list(dict(sorted(class_counts.items(), key=lambda x: x[1], reverse=True)).keys())
    most_frequent = sorted_classes[0]
    weighted_predicts.append(most_frequent)

### Accuracy of model

In [25]:
print('Accuracy of Ensemble model by Weighted Voting is')
print(metrics.accuracy_score(y_true, weighted_predicts)*100)
print(classification_report(y_true, weighted_predicts, target_names = list(label_enc_name_mapping.keys())))

Accuracy of Ensemble model by Weighted Voting is
81.72323759791122
               precision    recall  f1-score   support

     business       0.95      0.70      0.80        79
entertainment       0.67      0.86      0.75        69
     politics       0.77      0.92      0.84        93
        sport       0.90      0.81      0.85        47
         tech       0.89      0.79      0.84        95

     accuracy                           0.82       383
    macro avg       0.84      0.81      0.82       383
 weighted avg       0.84      0.82      0.82       383



In [26]:
acc_file.write('Classification Accuracy of Weighted Voting Ensemble Model is: ')
acc_file.write(str(metrics.accuracy_score(y_true, weighted_predicts)*100)+'\n')
acc_file.write('Classification Report:\n')
acc_file.write(classification_report(y_true, weighted_predicts, target_names = list(label_enc_name_mapping.keys())))
acc_file.write('============================================\n')

45

### Save results to CSV

In [27]:
wt_dataset = pd.DataFrame()
wt_dataset['preds'] = weighted_predicts
wt_dataset.to_csv('ensemble_weight_preds.csv')

## Calculating how many instances were wrongly classified by *any of/all* the three individual models, but were correctly classified by the ensemble

### Read predictions of above 5 models from CSV

In [28]:
ff_nn_initial_preds_df = pd.read_csv('ff_nn_initial_preds.csv')
ff_nn_initial_preds = ff_nn_initial_preds_df['preds']

ff_nn_new_preds_df = pd.read_csv('ff_nn_new_preds.csv')
ff_nn_new_preds = ff_nn_new_preds_df['preds']

rnn_preds_df = pd.read_csv('rnn_preds.csv')
rnn_preds = rnn_preds_df['preds']

ensemble_majority_preds_df = pd.read_csv('ensemble_majority_preds.csv')
maj_preds = ensemble_majority_preds_df['preds']

ensemble_weight_preds_df = pd.read_csv('ensemble_weight_preds.csv')
wt_preds = ensemble_weight_preds_df['preds']

### Perform desired calculations

In [29]:
maj_correct_1 = 0
wt_correct_1 = 0

maj_correct_2 = 0
wt_correct_2 = 0

y_true = [idx for idx in np.argmax(test_Y, axis=1)]
for i in range(len(maj_preds)):
    if (ff_nn_initial_preds[i] != y_true[i]) or (ff_nn_new_preds[i] != y_true[i]) or (rnn_preds[i] != y_true[i]):
        if maj_preds[i] == y_true[i]:
            maj_correct_1 += 1
        if wt_preds[i] == y_true[i]:
            wt_correct_1 += 1
    if (ff_nn_initial_preds[i] != y_true[i]) and (ff_nn_new_preds[i] != y_true[i]) and (rnn_preds[i] != y_true[i]):
        if maj_preds[i] == y_true[i]:
            maj_correct_2 += 1
        if wt_preds[i] == y_true[i]:
            wt_correct_2 += 1

print('\nNumber of instances wrongly classified by "any base model" but \ncorrectly classified by the majority vote ensemble:', maj_correct_1)
print('\nNumber of instances wrongly classified by "any base model" but \ncorrectly classified by the weighted vote ensemble:', wt_correct_1)
print('\nNumber of instances wrongly classified by "all base models" but \ncorrectly classified by the majority vote ensemble:', maj_correct_2)
print('\nNumber of instances wrongly classified by "all base models" but \ncorrectly classified by the weighted vote ensemble:', wt_correct_2)


Number of instances wrongly classified by "any base model" but 
correctly classified by the majority vote ensemble: 105

Number of instances wrongly classified by "any base model" but 
correctly classified by the weighted vote ensemble: 105

Number of instances wrongly classified by "all base models" but 
correctly classified by the majority vote ensemble: 0

Number of instances wrongly classified by "all base models" but 
correctly classified by the weighted vote ensemble: 0


In [30]:
misclass_file.write('\nNumber of instances wrongly classified by "any base model" but \ncorrectly classified by the majority vote ensemble: '+str(maj_correct_1)+'\n')
misclass_file.write('\nNumber of instances wrongly classified by "any base model" but \ncorrectly classified by the weighted vote ensemble: '+str(wt_correct_1)+'\n')
misclass_file.write('\nNumber of instances wrongly classified by "all base models" but \ncorrectly classified by the majority vote ensemble: '+str(maj_correct_2)+'\n')
misclass_file.write('\nNumber of instances wrongly classified by "all base models" but \ncorrectly classified by the weighted vote ensemble: '+str(wt_correct_2)+'\n')

120

In [31]:
acc_file.close()
misclass_file.close()