In [165]:
import io
import os
import re
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn import svm
from tensorflow.keras.layers import Input,Dense, Dropout, Activation, Flatten, Embedding, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM, SimpleRNN, Reshape
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from keras.layers import Concatenate
from keras.utils import np_utils
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.utils import to_categorical




## Part 1 - Guardian Dataset

Create a random 20% portion of the dataset to work with. Next, split off a test set (10% of the extract) and a validation set (10% of the extract). The remaining 80% of the extract is the training data.

In [166]:
# Download the zip file from a URL
# NOTE(review): the query string carries X-Goog-Date / X-Goog-Expires / X-Goog-Signature
# parameters, i.e. this looks like a time-limited signed link. Once it lapses the server
# answers with an error page rather than the archive — replace it with a stable source.
url = 'https://storage.googleapis.com/kaggle-data-sets/2315156/3897165/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230508%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230508T172037Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=b9451b232c263533e2801719c94c9229339f16dbe04becc0c1502f235a3976fb6aa3b7f86f859252046e34b30e40eceea506761694ccab7a0127f4c0083e3fabe55d344fd85ea03d9783363d71af50057cca34db61369c2a0c2cc04afa1eede44c2a6faa220319360d3e37b85ec399d4853c4fc7178bf0b0abf68054e7fd1d58770d112b8f9a9cd417fbb01ece50c39f35116319ef1c76ea860b323905e1ad1d54312ef8fce9b5252dbaec6d3df65a3ca4235854598b00061e6d1ccaa7e327b758f8e7400fb64d8c5954e665ef459341cc0ab3484c14d82d5cde3e4ab8e61d4e7e229cae0dec2d7dee9e46750521dc77be012089df8bec45390b4eece3fd80cb'

# Only hit the network when the CSV read by the next cell is not on disk yet, so that
# "Restart & Run All" does not re-download the archive every time.
if not os.path.exists(os.path.join('datasets', 'guardian_articles.csv')):
    r = requests.get(url, timeout=60)
    # Fail with a clear HTTP error instead of a confusing BadZipFile when the
    # response body is an error page.
    r.raise_for_status()

    # Extract the files to a directory (the context manager closes the archive)
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall('datasets')

In [167]:
# Read the extracted Guardian articles CSV into "data" and preview its first rows
guardian_csv_path = 'datasets/guardian_articles.csv'
data = pd.read_csv(guardian_csv_path)
data.head()

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id
0,us-news/2016/jan/31/iowa-caucus-underdog-candi...,US news,Iowa underdogs put on brave faces despite all ...,https://www.theguardian.com/us-news/2016/jan/3...,As polling day looms and the cameras turn only...,2016-01-31T23:53:37Z,1
1,us-news/2016/jan/31/iowa-caucus-worlds-most-pa...,US news,Iowa caucus: hologram eagle and Jesus star on ...,https://www.theguardian.com/us-news/2016/jan/3...,"In Des Moines on Sunday, the Guardian was give...",2016-01-31T23:46:28Z,2
2,world/2016/jan/31/tanzania-britsh-helicopter-p...,World news,British pilot in Tanzania 'manoeuvred ​to save...,https://www.theguardian.com/world/2016/jan/31/...,A British pilot who was shot dead by an elepha...,2016-01-31T23:43:48Z,3
3,football/2016/jan/31/late-winner-gets-usa-off-...,Football,USA 3-2 Iceland | International friendly match...,https://www.theguardian.com/football/2016/jan/...,USA took a step toward shaking off the ghosts ...,2016-01-31T23:30:49Z,4
4,football/2016/jan/31/blackburn-paul-lambert-ox...,Football,Reinvigorated Paul Lambert reflects after impr...,https://www.theguardian.com/football/2016/jan/...,"The clean-shaven, spectacle free and suspiciou...",2016-01-31T22:30:10Z,5


In [None]:
#Drop Columns
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already
# gone and a plain drop would raise KeyError.
data = data.drop(columns=['article_id', 'webUrl','webPublicationDate', 'id' ], errors='ignore')

# Work on a random 20% extract, as the Part 1 description asks for (the cell previously
# took 10%). The seed makes the extract, and every result built on it, reproducible.
extract = data.sample(frac=0.2, random_state=42)

In [None]:
# Built once when this cell runs. The function used to rebuild both objects on every
# call, i.e. once per row when used with Series.apply().
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one raw text value into a space-separated string of lemmas.

    Steps, in order: lowercase, remove every character that is not an ASCII
    letter or whitespace, tokenize with NLTK, drop English stop words, and
    lemmatize the remaining words.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            missing values (``None`` or float NaN) are treated as empty text.

    Returns:
        str: The cleaned words joined by single spaces; ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # pandas hands missing cells over as float NaN; str(nan) would otherwise
    # inject a bogus "nan" token into the corpus. NaN is the only float != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert the text to a lowercase string
    text = str(text).lower()

    # Remove unwanted characters using regular expressions
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, then remove stop words
    words = [word for word in word_tokenize(text) if word not in _STOP_WORDS]

    # Lemmatize the words and join them back into a single string
    return ' '.join(_LEMMATIZER.lemmatize(word) for word in words)


# Clean the three text columns of the extract in place, one column at a time.
# NOTE(review): the label column (sectionName) is cleaned with the same function as the
# inputs — confirm that normalising the class names is intended.
for text_column in ('sectionName', 'webTitle', 'bodyContent'):
    extract[text_column] = extract[text_column].apply(preprocess_text)




### Splitting the Truncated dataset into Training, Validation and Test sets

In [None]:
# Build the vocabulary from the cleaned article bodies and encode each body as a padded
# integer sequence.
# NOTE(review): the tokenizer is fitted before the train/validation/test split, so the
# vocabulary also reflects held-out articles — confirm this is acceptable.
max_features = 5000
body_texts = extract['bodyContent'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(body_texts)
# No maxlen is passed here, so every row is padded to the length of the longest sequence
X = pad_sequences(tokenizer.texts_to_sequences(body_texts))

In [None]:
# Give every distinct section name an integer id (numbered in order of first appearance)
# and encode each row's section as that id.
section_names = extract['sectionName']
unique_sections = section_names.unique()
label_dict = dict(zip(unique_sections, range(len(unique_sections))))
y = np.array([label_dict[section] for section in section_names])

In [None]:
# 80/10/10 split: first hold out 20% of the rows, then cut that hold-out in half to get
# the validation and test sets. Both cuts use a fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

Check the shapes of the training, test and validation sets after the split to ensure the split is correct.

In [None]:
# Report the array shape of each split
for split_name, split_array in (('Train', X_train), ('Test', X_test), ('Val', X_val)):
    print(split_name + ' shape:', split_array.shape)

## RNN Variants (Basic RNN, LSTM and Multi-Layer LSTM)

In [None]:
# Shared settings for the models defined in the cells below
max_features = 5000  # vocabulary size: number of most frequent words kept
maxlen = 400         # number of tokens per sequence after padding/truncation
embedding_dims = 16  # size of the embedding vectors / recurrent layer outputs
epochs = 5           # training epochs

# Bring every split to exactly maxlen tokens per row
X_train, X_test, X_val = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (X_train, X_test, X_val)
)

for split_name, split_array in (('Train', X_train), ('Test', X_test), ('Validation', X_val)):
    print(split_name + ' shape:', split_array.shape)

print(type(X_train))


The code above pads (or truncates) the integer-encoded sequences produced by the tokenizer to a fixed length of `maxlen` tokens using `pad_sequences()`. Together with the earlier tokenization step, this converts the raw text input into a fixed-size numerical format that can be processed by an RNN model.

### Basic RNN with single layer (No Embeddings Layer)

In [None]:
# Define the RNN model architecture
# There is no embedding layer: the Reshape turns each padded sequence into
# (timesteps, 1), so every token id is fed to the RNN as one numeric feature.
modelRNN = tf.keras.Sequential()
modelRNN.add(tf.keras.layers.Reshape((X_train.shape[1], 1), input_shape=(X_train.shape[1],)))
modelRNN.add(tf.keras.layers.Dropout(0.2))
modelRNN.add(tf.keras.layers.SimpleRNN(embedding_dims))
modelRNN.add(tf.keras.layers.Dropout(0.2))
# One softmax output per distinct section label
modelRNN.add(tf.keras.layers.Dense(len(label_dict), activation='softmax'))

modelRNN.summary()

# Compile the modelRNN (labels are integer ids, hence the sparse loss)
modelRNN.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the modelRNN and save the history of accuracy and loss during training.
# Uses the shared `epochs` hyperparameter (like the later models) instead of a
# hard-coded 5, so all histories stay the same length on the comparison plots.
historymodelRNN = modelRNN.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=32,
                    validation_data=(X_val, y_val))



In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelRNN.save(os.path.join("SavedModels", "modelRNN.keras"))

### Simple RNN Evaluation

In [None]:
# Score the simple RNN on the held-out test split and report both values
loss, accuracy = modelRNN.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)



In [None]:
# One figure per metric (accuracy first, then loss), each showing the training curve and
# the validation curve recorded while fitting the simple RNN.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelRNN.history[metric_key])
    plt.plot(historymodelRNN.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Basic LSTM with single layer (No Embeddings Layer)

In [None]:
# Define the LSTM model architecture
# Same layout as the simple RNN but with an LSTM cell; again no embedding layer, so each
# token id enters the network as a single numeric feature per timestep.
modelLSTM = tf.keras.Sequential()
modelLSTM.add(tf.keras.layers.Reshape((X_train.shape[1], 1), input_shape=(X_train.shape[1],)))
modelLSTM.add(Dropout(0.2))
modelLSTM.add(tf.keras.layers.LSTM(embedding_dims))
modelLSTM.add(Dropout(0.2))
# One softmax output per distinct section label
modelLSTM.add(tf.keras.layers.Dense(len(label_dict), activation='softmax'))

modelLSTM.summary()

# Compile the modelLSTM (labels are integer ids, hence the sparse loss)
modelLSTM.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the modelLSTM and save the history of accuracy and loss during training.
# Uses the shared `epochs` hyperparameter instead of a hard-coded 5 so every model in the
# comparison plots is trained for the same number of epochs.
historymodelLSTM = modelLSTM.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=32,
                    validation_data=(X_val, y_val))


In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelLSTM.save(os.path.join("SavedModels", "modelLSTM.keras"))

### Basic LSTM with single Layer Evaluation

In [None]:
# Score the single-layer LSTM on the held-out test split and report both values
loss, accuracy = modelLSTM.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training curve and
# the validation curve recorded while fitting the single-layer LSTM.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelLSTM.history[metric_key])
    plt.plot(historymodelLSTM.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Comparison of Basic RNN model and LSTM with single layer

In [None]:
# Overlay both models on one accuracy chart: solid = training, dotted = validation,
# one colour per model (blue = RNN, orange = LSTM).
for model_history, curve_color in ((historymodelRNN, 'blue'), (historymodelLSTM, 'orange')):
    plt.plot(model_history.history['accuracy'], linestyle='solid', color=curve_color)
    plt.plot(model_history.history['val_accuracy'], linestyle='dotted', color=curve_color)
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['RNN Train', 'RNN Val', 'LSTM Train', 'LSTM Val'], loc='upper left')
plt.show()


### LSTM with multiple layers (No Embeddings Layer)

In [None]:
# Two stacked LSTM layers without an embedding layer: the Reshape exposes each token id
# as a single feature per timestep, and the first LSTM returns the full sequence so the
# second LSTM has a sequence to read.
modelLSTM2 = tf.keras.Sequential([
    Reshape((X_train.shape[1], 1), input_shape=(X_train.shape[1],)),
    LSTM(embedding_dims, return_sequences=True),
    Dropout(0.2),
    LSTM(embedding_dims),
    Dropout(0.2),
    Dense(len(label_dict), activation='softmax'),  # one output per section label
])

modelLSTM2.summary()

# Integer class ids as targets, hence the sparse categorical loss
modelLSTM2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit on the training split and keep the per-epoch accuracy/loss history for plotting
historymodelLSTM2 = modelLSTM2.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=(X_val, y_val),
)



In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelLSTM2.save(os.path.join("SavedModels", "modelLSTM2.keras"))

### LSTM with Multiple Layers Evaluation


In [None]:
# Score the two-layer LSTM on the held-out test split and report both values
loss, accuracy = modelLSTM2.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training curve and
# the validation curve recorded while fitting the two-layer LSTM.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelLSTM2.history[metric_key])
    plt.plot(historymodelLSTM2.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Comparison of Single Layer LSTM with Multiple Layer LSTM

In [None]:
# Accuracy of the two-layer LSTM (blue) against the single-layer LSTM (orange);
# solid = training, dotted = validation.
plt.plot(historymodelLSTM2.history['accuracy'], linestyle='solid', color='blue')
plt.plot(historymodelLSTM2.history['val_accuracy'], linestyle='dotted', color='blue')
plt.plot(historymodelLSTM.history['accuracy'], linestyle='solid', color='orange')
plt.plot(historymodelLSTM.history['val_accuracy'], linestyle='dotted', color='orange')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Legend entries follow the plot order above; the fourth curve is the single-layer
# model's validation accuracy (it was mislabelled 'LSTM Multiple Val').
plt.legend(['LSTM Multiple Train', 'LSTM Multiple Val', 'LSTM Single Train', 'LSTM Single Val'], loc='upper left')
plt.show()


### Embeddings

### I - On the fly Embeddings

In [None]:
# Two stacked LSTM layers on top of an Embedding layer whose vectors are learned during
# training ("on the fly"), instead of feeding raw token ids to the recurrent layers.
modelOnTheFly = tf.keras.Sequential([
    Embedding(max_features, embedding_dims, input_length=maxlen),
    LSTM(embedding_dims, return_sequences=True),  # full sequence for the next LSTM
    Dropout(0.2),
    LSTM(embedding_dims),
    Dropout(0.2),
    Dense(len(label_dict), activation='softmax'),  # one output per section label
])

modelOnTheFly.summary()

# Integer class ids as targets, hence the sparse categorical loss
modelOnTheFly.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit on the training split and keep the per-epoch accuracy/loss history for plotting
historymodelOnTheFly = modelOnTheFly.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=(X_val, y_val),
)

In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelOnTheFly.save(os.path.join("SavedModels", "modelOnTheFly.keras"))

### Evaluation


In [None]:
# Score the on-the-fly embeddings model on the held-out test split and report both values
loss, accuracy = modelOnTheFly.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training curve and
# the validation curve recorded while fitting the on-the-fly embeddings model.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelOnTheFly.history[metric_key])
    plt.plot(historymodelOnTheFly.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### II - Pre-trained Embeddings


In [None]:
import tensorflow_hub as hub

# TF Hub handle of the pre-trained embedding module to load.
# NOTE(review): `embed` is not used by any active cell visible in this notebook.
gnews_swivel_handle = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

print("loading embedding")
embed = hub.load(gnews_swivel_handle)

In [None]:
# NOTE(review): everything below is disabled (commented-out) code for a classifier built
# on the pre-trained TF Hub embedding. It cannot simply be uncommented: it refers to
# num_classes, x_train, target_train, x_val and target_val, none of which are defined in
# the cells visible here, and it expects raw strings rather than the integer sequences
# used by the other models. Either finish it or delete the cell.
# hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1", output_shape=[20],
#                            input_shape=[], dtype=tf.string)

# modelTrainedEmbeddings = tf.keras.Sequential()
# modelTrainedEmbeddings.add(hub_layer)
# modelTrainedEmbeddings.add(tf.keras.layers.Dense(16, activation='relu'))
# modelTrainedEmbeddings.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# modelTrainedEmbeddings.summary()


# #modelTrainedEmbeddings.summary()

# # Compile and train the modelTrainedEmbeddings
# modelTrainedEmbeddings.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# history = modelTrainedEmbeddings.fit(x_train.astype(str), target_train, batch_size=32, epochs=5, validation_data=(x_val.astype(str), target_val))

# #Plot the accuracy during training
# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']
# epochs_range = range(1, epochs+1)
# plt.plot(epochs_range, acc, label='Training Accuracy')
# plt.plot(epochs_range, val_acc, label='Validation Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(loc='lower right')
# plt.show()


### Using Bag of Words instead of Embeddings

In [None]:
# Define the bag-of-words model architecture
# The model still takes the padded token-id sequences (so X_train / X_val / X_test can be
# passed unchanged), but a CategoryEncoding layer first converts each sequence into a
# vocabulary-sized multi-hot vector: position i is 1 when token id i occurs anywhere in
# the article. Previously the Dense layer received the raw ids themselves, which treats
# "the id at position k" as a numeric feature and is not a bag-of-words representation.
modelBagOfWords = tf.keras.Sequential()
# Integer dtype so the ids can be used as category indices
modelBagOfWords.add(tf.keras.layers.Input(shape=(maxlen,), dtype='int32'))
# Ids produced by Tokenizer(num_words=max_features) are below max_features; id 0 is the
# padding value, so column 0 only marks "this row contains padding".
modelBagOfWords.add(tf.keras.layers.CategoryEncoding(num_tokens=max_features, output_mode='multi_hot'))
modelBagOfWords.add(tf.keras.layers.Dense(64, activation='relu'))
modelBagOfWords.add(tf.keras.layers.Dropout(0.5))
# One softmax output per distinct section label
modelBagOfWords.add(tf.keras.layers.Dense(len(label_dict), activation='softmax'))

modelBagOfWords.summary()

# Compile the modelBagOfWords (labels are integer ids, hence the sparse loss)
modelBagOfWords.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the modelBagOfWords and save the history of accuracy and loss during training
historymodelBagOfWords = modelBagOfWords.fit(X_train, y_train,
                              epochs=epochs,
                              batch_size=32,
                              validation_data=(X_val, y_val))


In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelBagOfWords.save(os.path.join("SavedModels", "modelBagOfWords.keras"))

### Evaluation

In [None]:
# Score the bag-of-words model on the held-out test split and report both values
loss, accuracy = modelBagOfWords.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training curve and
# the validation curve recorded while fitting the bag-of-words model.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelBagOfWords.history[metric_key])
    plt.plot(historymodelBagOfWords.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Comparison of On the Fly Embeddings Model with Bag of Words model

In [None]:
# Accuracy of the on-the-fly embeddings model (blue) against the bag-of-words model
# (orange); solid = training, dotted = validation.
plt.plot(historymodelOnTheFly.history['accuracy'], linestyle='solid', color='blue')
plt.plot(historymodelOnTheFly.history['val_accuracy'], linestyle='dotted', color='blue')
plt.plot(historymodelBagOfWords.history['accuracy'], linestyle='solid', color='orange')
plt.plot(historymodelBagOfWords.history['val_accuracy'], linestyle='dotted', color='orange')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# The blue curves belong to modelOnTheFly; they were labelled 'LSTM Multiple', which is
# the label the earlier comparison uses for the embedding-free modelLSTM2.
plt.legend(['On-the-fly Embeddings Train', 'On-the-fly Embeddings Val', 'BagOfWords Train', 'BagOfWords Val'], loc='upper left')
plt.show()


### Evaluation


### CNN for Text Classification

### CNNs as an Alternative to an LSTM Solution
An implementation of using a CNN with multiple and heterogeneous kernel sizes as an alternative to an LSTM solution for text classification:

In [None]:

# Define CNN model architecture
# Learned embeddings followed by two stacked 1-D convolutions with different kernel
# sizes (3 and 4); global max pooling collapses the time axis before the classifier.
modelAltCNN = Sequential()
modelAltCNN.add(Embedding(max_features, embedding_dims, input_length=maxlen))
modelAltCNN.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
modelAltCNN.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
modelAltCNN.add(Dropout(0.2))
modelAltCNN.add(GlobalMaxPooling1D())
# One softmax output per distinct section label
modelAltCNN.add(Dense(len(label_dict), activation='softmax'))

# Compile the model (labels are integer ids, hence the sparse loss)
modelAltCNN.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data. Uses the shared `epochs` hyperparameter instead
# of a hard-coded 5 so its history lines up with the other models on comparison plots.
historymodelAltCNN = modelAltCNN.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_data=(X_val, y_val))

In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelAltCNN.save(os.path.join("SavedModels", "modelAltCNN.keras"))

### Evaluation

In [None]:
# Score the CNN model on the held-out test split and report both values
loss, accuracy = modelAltCNN.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training curve and
# the validation curve recorded while fitting the CNN model.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelAltCNN.history[metric_key])
    plt.plot(historymodelAltCNN.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### CNNs as an Additional Layer Before an LSTM Solution
An implementation of using a CNN with multiple and heterogeneous kernel sizes as an additional layer before an LSTM solution for text classification:

In [None]:
# Define combined CNN and LSTM model architecture
# The convolution extracts local n-gram features and a (non-global) max pooling shortens
# the sequence while keeping its time axis, so the LSTM still reads a real sequence.
# The previous GlobalMaxPooling1D + Reshape((1, -1)) collapsed the whole article into a
# single timestep, which left the LSTM nothing sequential to model.
modelLSTMCNN = Sequential()
modelLSTMCNN.add(Embedding(max_features, embedding_dims, input_length=maxlen))
modelLSTMCNN.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
modelLSTMCNN.add(Dropout(0.2))
modelLSTMCNN.add(layers.MaxPooling1D(pool_size=4))
modelLSTMCNN.add(LSTM(embedding_dims))
# One softmax output per distinct section label
modelLSTMCNN.add(Dense(len(label_dict), activation='softmax'))

# Compile the model (labels are integer ids, hence the sparse loss)
modelLSTMCNN.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data. Uses the shared `epochs` hyperparameter instead
# of a hard-coded 5 so its history lines up with the other models on comparison plots.
historymodelLSTMCNN = modelLSTMCNN.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_data=(X_val, y_val))




In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelLSTMCNN.save(os.path.join("SavedModels", "modelLSTMCNN.keras"))

### Evaluation


In [None]:
# Score the CNN + LSTM model on the held-out test split and report both values
loss, accuracy = modelLSTMCNN.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# Plot the training and validation curves of the CNN + LSTM model: one figure for
# accuracy, one for loss. This cell used to plot historymodelAltCNN (a copy-paste slip),
# so it showed the plain CNN's curves a second time instead of this model's.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(historymodelLSTMCNN.history[metric_key])
    plt.plot(historymodelLSTMCNN.history['val_' + metric_key])
    plt.title('Model ' + metric_label)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Comparison of CNN as alternative to LSTM and CNN with LSTM models

In [None]:
# Accuracy of the CNN-only model (blue) against the CNN + LSTM model (orange);
# solid = training, dotted = validation.
plt.plot(historymodelAltCNN.history['accuracy'], linestyle='solid', color='blue')
plt.plot(historymodelAltCNN.history['val_accuracy'], linestyle='dotted', color='blue')
plt.plot(historymodelLSTMCNN.history['accuracy'], linestyle='solid', color='orange')
plt.plot(historymodelLSTMCNN.history['val_accuracy'], linestyle='dotted', color='orange')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Second entry previously read 'LSTMVal' (missing space)
plt.legend(['CNN as Alternative to LSTM Train', 'CNN as Alternative to LSTM Val', 'CNN before LSTM Train', 'CNN before LSTM Val'], loc='upper left')
plt.show()


### Comparison to Non-Neural Methods

Using a Naive Bayes model as the non-neural method, train it on the same data and check its accuracy on the training and validation sets.

In [None]:
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB


def _bag_of_words(sequences, vocab_size):
    """Convert padded token-id sequences into a sparse document-term count matrix.

    Args:
        sequences: 2-D integer array with one padded sequence per row. Id 0 is
            the padding value and is not counted.
        vocab_size: Number of columns of the result; every id must be smaller.

    Returns:
        scipy.sparse.csr_matrix of shape (n_rows, vocab_size) whose entry
        (r, t) is how often token id t occurs in row r.
    """
    sequences = np.asarray(sequences)
    n_docs, seq_len = sequences.shape
    doc_index = np.repeat(np.arange(n_docs), seq_len)
    token_index = sequences.ravel()
    is_token = token_index > 0
    ones = np.ones(int(is_token.sum()), dtype=np.float32)
    # Duplicate (row, column) pairs are summed by the constructor, which yields counts
    return csr_matrix(
        (ones, (doc_index[is_token], token_index[is_token])),
        shape=(n_docs, vocab_size),
    )


n_iterations = 5

# MultinomialNB models per-feature counts. Fitting it directly on the padded id
# sequences (as before) treats "the token id at position k" as the count of feature k,
# which is meaningless; give it word counts per article instead.
X_train_bow = _bag_of_words(X_train, max_features)
X_val_bow = _bag_of_words(X_val, max_features)

# Create a Naive Bayes model and train it on the training set
modelNB = MultinomialNB()
modelNB.fit(X_train_bow, y_train)

# Evaluate the model on the training set and validation set
train_acc = modelNB.score(X_train_bow, y_train)
val_acc = modelNB.score(X_val_bow, y_val)

# Fitting is deterministic, so refitting n_iterations times only repeated identical
# work. The per-iteration lists are kept (same value repeated) because the comparison
# plot further down draws them against the neural model's epochs.
train_accs = [train_acc] * n_iterations
val_accs = [val_acc] * n_iterations

# Calculate the average accuracies
avg_train_acc = sum(train_accs) / n_iterations
avg_val_acc = sum(val_accs) / n_iterations

print("Average training accuracy:", avg_train_acc)
print("Average validation accuracy:", avg_val_acc)

# Plot the accuracies
plt.plot([avg_train_acc, avg_val_acc], marker='o')
plt.xticks([0, 1], ['Training Accuracy', 'Validation Accuracy'])
# Full accuracy range; the old upper limit of 0.2 would hide anything above 20%
plt.ylim([0, 1])
plt.title('Naive Bayes Accuracy')
plt.show()


In [None]:
# Persist the fitted Naive Bayes model next to the notebook
joblib.dump(value=modelNB, filename='naive_bayes_model.pkl')

### Comparison of non-neural method to best performing neural method

In [None]:

# Plot training and validation accuracies for modelAltCNN
plt.plot(historymodelAltCNN.history['accuracy'])
plt.plot(historymodelAltCNN.history['val_accuracy'])

# Plot training and validation accuracies for the Naive Bayes model
# (train_accs / val_accs come from the Naive Bayes cell; the old comment and legend
# called them RandomForest / 'RF', but no random forest is trained in this notebook)
plt.plot(train_accs)
plt.plot(val_accs)

plt.title('Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xlabel('Epoch/Iteration')
# Legend entries follow the plot order above
plt.legend(['CNN as Alternative to LSTM Train', 'CNN as Alternative to LSTM Val', 'Naive Bayes Train', 'Naive Bayes Val'], loc='upper left')

plt.show()


### Additional Data

Adding the web title column as an input in conjunction with the body content column into an LSTM model. We must use the values of the web title column to create new training, test and validation sets.

First, tokenize and pad the web title values and store them in a variable.

In [None]:
# Fit a separate tokenizer on the cleaned web titles and encode each title as a padded
# integer sequence (padded to the longest title, since no maxlen is given here).
web_titles = extract['webTitle'].values
tokenizer_web_title = Tokenizer(num_words=max_features, split=' ')
tokenizer_web_title.fit_on_texts(web_titles)
X_web_title = pad_sequences(tokenizer_web_title.texts_to_sequences(web_titles))

Split the data into new training, validation and test sets for each input: one set containing the body content values and one containing the web title values.

In [None]:
# 80/10/10 split of both inputs and the labels together, so the title, body and label of
# a row always land in the same split. First hold out 20%, then halve that hold-out into
# validation and test. Same seed and sizes as the single-input split.
(X_web_title_train, X_web_title_holdout,
 X_body_content_train, X_body_content_holdout,
 y_train, y_holdout) = train_test_split(X_web_title, X, y, test_size=0.2, random_state=42)

(X_web_title_val, X_web_title_test,
 X_body_content_val, X_body_content_test,
 y_val, y_test) = train_test_split(
    X_web_title_holdout, X_body_content_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
# Bring every split of both inputs to exactly maxlen tokens per row.
# NOTE(review): web titles are padded to the same maxlen as article bodies — confirm a
# shorter length would not be more appropriate for titles.
X_web_title_train, X_web_title_val, X_web_title_test = (
    sequence.pad_sequences(title_split, maxlen=maxlen)
    for title_split in (X_web_title_train, X_web_title_val, X_web_title_test)
)
X_body_content_train, X_body_content_val, X_body_content_test = (
    sequence.pad_sequences(body_split, maxlen=maxlen)
    for body_split in (X_body_content_train, X_body_content_val, X_body_content_test)
)

In [None]:
# One integer-sequence input per text field, each maxlen tokens long
input_web_title = Input(name='input_web_title', shape=(maxlen,))
input_body_content = Input(name='input_body_content', shape=(maxlen,))

# Each input gets its own (separately trained) embedding layer
web_title_embedder = Embedding(max_features, embedding_dims, input_length=maxlen)
embedding_web_title = web_title_embedder(input_web_title)

body_content_embedder = Embedding(max_features, embedding_dims, input_length=maxlen)
embedding_body_content = body_content_embedder(input_body_content)

In [None]:
# LSTM layer for web title input
lstm_web_title = LSTM(embedding_dims)(embedding_web_title)
dropout_lstm_web_title = Dropout(0.2)(lstm_web_title)

# LSTM layer for body content input
lstm_body_content = LSTM(embedding_dims)(embedding_body_content)
lstm_body_content_dropout = Dropout(0.2)(lstm_body_content)

# Concatenate the outputs of the two branches. This must take the dropout outputs: the
# cell previously concatenated the raw LSTM tensors, so both Dropout layers were left
# out of the model graph and never applied.
merged = Concatenate()([dropout_lstm_web_title, lstm_body_content_dropout])

# Dense layer for prediction (one softmax output per distinct section label)
output = Dense(len(label_dict), activation='softmax')(merged)

# Create the modelTwoInputs with multiple inputs
modelTwoInputs = Model(inputs=[input_web_title, input_body_content], outputs=output)

# Compile the modelTwoInputs (labels are integer ids, hence the sparse loss)
modelTwoInputs.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the modelTwoInputs with validation data. Uses the shared `epochs` hyperparameter
# instead of a hard-coded 5 so its history lines up with the other models' plots.
historymodelTwoInputs = modelTwoInputs.fit(
    [X_web_title_train, X_body_content_train],  # Input data for both web title and body content
    y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=([X_web_title_val, X_body_content_val], y_val)
)

In [None]:
# Save into a SavedModels folder relative to the working directory rather than one
# user's absolute home path, and create the folder first so save() does not fail on a
# missing directory.
os.makedirs("SavedModels", exist_ok=True)
modelTwoInputs.save(os.path.join("SavedModels", "modelTwoInputs.keras"))

### Evaluation

In [None]:
# Evaluate the model with test data
test_loss, test_acc = modelTwoInputs.evaluate([X_web_title_test, X_body_content_test], y_test, verbose=2)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)


In [None]:
# Training curve first, validation curve second, on a single accuracy chart
for history_key in ('accuracy', 'val_accuracy'):
    plt.plot(historymodelTwoInputs.history[history_key])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()


### Comparison of Two Input Model with best performing model

In [None]:
# Accuracy of the CNN-only model (blue) against the two-input LSTM model (orange);
# solid = training, dotted = validation.
plt.plot(historymodelAltCNN.history['accuracy'], linestyle='solid', color='blue')
plt.plot(historymodelAltCNN.history['val_accuracy'], linestyle='dotted', color='blue')
plt.plot(historymodelTwoInputs.history['accuracy'], linestyle='solid', color='orange')
plt.plot(historymodelTwoInputs.history['val_accuracy'], linestyle='dotted', color='orange')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Second entry previously read 'LSTMVal' (missing space)
plt.legend(['CNN as Alternative to LSTM Train', 'CNN as Alternative to LSTM Val', 'Two Inputs Model Train', 'Two Inputs Model Val'], loc='upper left')
plt.show()
