In [85]:
import glob
import io
import os
import re
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from keras.layers import Concatenate
from keras.layers import LSTM
from keras.models import load_model
from keras.utils import np_utils
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras import layers
from tensorflow.keras.layers import Input,Dense, Dropout, Activation, Flatten, Embedding, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM, SimpleRNN, Reshape
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer




## Part 2 - BBC Dataset

In [79]:
# Download the dataset zip archive from a URL.
# NOTE(review): the query string carries X-Goog-Date / X-Goog-Expires, so this looks like
# a time-limited signed link -- replace it with a fresh one if the request is rejected.
url = 'https://storage.googleapis.com/kaggle-data-sets/30569/38997/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230508%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230508T172143Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=4e609a86d435ed376361a76cdd011b242c015e86fc2d0a8f415fecbb53a0ee3ef71e4f0b12222c0f4952572db08e96f6588a430a1acae8924a9a484181337f2013e3a68dfc73e0e1f96ad153c62d25b36d40b33992033027fde63c06e2b1c6291fc699b1817c83fd0c5ca52745b15247b5389b6030db1a51c9183cf5cdd4e41aeb78182b7ff83ff06a494c03ba09243b87a1aeda606bb1928e772f6c420ea6c7d1fdc54714ef2b07cf849fa533c9ef797584d44a05d036877966c99a35a48052e7a46877b17943f692a74fb3e82bd7c20fcef9ccdda38481291835c58668beaf481776dca8b1bc2921bef4567b11b4618213bff468e8884647638b48accb7f7d'
# The timeout keeps the cell from hanging forever on an unresponsive server
r = requests.get(url, timeout=60)
# Surface a clear HTTP error instead of an opaque BadZipFile when the server
# answers with an error page rather than the archive
r.raise_for_status()

# Extract the files to a directory; the context manager closes the archive afterwards
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall('datasets')

In [81]:
# Read the extracted BBC CSV into the "data" DataFrame and preview its first rows
csv_path = 'datasets/bbc-text.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### Load Models

In [83]:
# Download the zip archive of saved models from a URL.
# NOTE(review): the query string carries X-Goog-Date / X-Goog-Expires, so this looks like
# a time-limited signed link -- replace it with a fresh one if the request is rejected.
url = 'https://storage.googleapis.com/kaggle-data-sets/3240578/5636667/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230508%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230508T205651Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=7fb26ad2f78ece1c96f8155b1488554f60ff075d92c41361a28343cdb228d7e19ec31d512251967cde6488323aebbf4bb8b8a791dc46f33d2cc42c6cfb399465174c27a74d3719683d9643acda27157f9aa10aacd14170889eba102e4aa05237f41201934a10944007068264e9174611d0f9d9f5dd47cb4a1e02280e3f2b1c25b4be7746aee228ba0ee5de09bc0868dce80fa8b295cc637fa24e4906a87a904b4e1300e723467593727012201fd3625a30a8837418a57c089e56c128017247dac8c71bd0187da3a700988210d6d659538ac3ffef20dccd3bb5a270479056f32910c2ee319a3eaf4f9ce2bdb2be3f08d17ab0a146d863b1653db32aebee0b2786'
# The timeout keeps the cell from hanging forever on an unresponsive server
r = requests.get(url, timeout=60)
# Surface a clear HTTP error instead of an opaque BadZipFile when the server
# answers with an error page rather than the archive
r.raise_for_status()

# Extract the files to a directory; the context manager closes the archive afterwards
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall('models')

In [87]:
# Collect the base name (file name without its extension) of every ".keras" file
# in the models directory. os.path handles the platform's path separator, and
# sorting gives a stable listing because glob returns matches in arbitrary order.
model_names = sorted(
    os.path.splitext(os.path.basename(file_path))[0]
    for file_path in glob.glob("models/*.keras")
)

print(model_names)

['modelLSTM2', 'modelRNN', 'modelTwoInputs', 'modelTransfserLSTM2', 'modelGenerate', 'modelAltCNNScratch', 'modelAltCNN', 'modelBagOfWords', 'modelLSTM', 'modelOnTheFly', 'modelTransfserLSTM', 'modelLSTMCNN']


### Preprocess Text

In [None]:
# The stop-word set and lemmatizer are created on first use and then reused:
# rebuilding them for every row passed through DataFrame.apply is repeated work.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a piece of text for tokenisation.

    The value is converted to a string, lower-cased, stripped of every
    character that is not an ASCII letter or whitespace, tokenised, filtered
    against the English stop-word list and lemmatised word by word.

    Args:
        text: The value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned words joined by single spaces, or an empty string for a
        missing value.
    """
    # Missing values would otherwise be stringified into the tokens "none"/"nan"
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to a lower-case string
    text = str(text).lower()

    # Remove unwanted characters using regular expressions
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenize, drop stop words and lemmatize what is left
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    # Join the words back into a single string
    return ' '.join(words)


# Run both the article text and the category labels through preprocess_text
for column in ('text', 'category'):
    data[column] = data[column].apply(preprocess_text)

### Splitting the Truncated dataset into Training, Validation and Test sets

In [None]:
# Fit a tokenizer capped at max_features words, encode every article as a
# sequence of word indices, then pad the sequences to a common length
max_features = 5000
texts = data['text'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(texts)
X = pad_sequences(tokenizer.texts_to_sequences(texts))

In [None]:
# Give each distinct category an integer id (in order of first appearance),
# then encode the whole label column with that mapping
categories = data['category']
label_dict = {}
for index, label in enumerate(categories.unique()):
    label_dict[label] = index
y = np.array([label_dict[label] for label in categories])

In [None]:
# Hold out 20% of the samples, then halve that hold-out into validation and test sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

In [None]:
# Hyperparameters shared by the models defined below
max_features = 5000  # maximum number of words to keep, based on word frequency
maxlen = 400  # number of tokens every input sequence is padded/truncated to
embedding_dims = 16  # dimensionality of the embedding output space
epochs = 5  # number of training iterations

# Bring every split to the same sequence length
X_train, X_test, X_val = (
    sequence.pad_sequences(split, maxlen=maxlen)
    for split in (X_train, X_test, X_val)
)

for caption, split in (
    ('Train shape:', X_train),
    ('Test shape:', X_test),
    ('Validation shape:', X_val),
):
    print(caption, split.shape)

### Model Variant from Scratch (CNN model) - No transfer Learning



In [None]:
# Define CNN model architecture: embedding -> two 1D convolutions -> dropout ->
# global max pooling -> softmax over the categories in label_dict
modelAltCNNScratch = Sequential([
    Embedding(max_features, embedding_dims, input_length=maxlen),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    Conv1D(filters=64, kernel_size=4, activation='relu'),
    Dropout(0.2),
    GlobalMaxPooling1D(),
    Dense(len(label_dict), activation='softmax'),
])

# Compile the model; the sparse loss matches the integer-encoded labels in y
modelAltCNNScratch.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data. Use the shared `epochs` hyperparameter
# rather than a hard-coded literal so changing it in one place affects every model.
historymodelAltCNNScratch = modelAltCNNScratch.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=(X_val, y_val),
)

In [None]:
# Save under a directory relative to the notebook instead of a machine-specific
# absolute path, creating the directory first so the save cannot fail on it
save_dir = 'SavedModels'
os.makedirs(save_dir, exist_ok=True)
modelAltCNNScratch.save(os.path.join(save_dir, 'modelAltCNNScratch.keras'))

### Evaluate model 

In [None]:
# Score the from-scratch CNN on the test split and report both metrics
loss, accuracy = modelAltCNNScratch.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training
# curve followed by the validation curve across epochs
for train_key, val_key, title, y_label in (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
):
    plt.plot(historymodelAltCNNScratch.history[train_key])
    plt.plot(historymodelAltCNNScratch.history[val_key])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Transfer Learning (I) - New CNN model with input from previous CNN model created in Part 1 

In [88]:
# Load the previously saved CNN model and print its layer-by-layer summary
cnn_alt_path = 'models/modelAltCNN.keras'
pretrained_cnn_alt_model = load_model(cnn_alt_path)
pretrained_cnn_alt_model.summary()

Model: "sequential_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_26 (Embedding)    (None, 400, 16)           80000     
                                                                 
 conv1d_29 (Conv1D)          (None, 398, 64)           3136      
                                                                 
 conv1d_30 (Conv1D)          (None, 395, 64)           16448     
                                                                 
 dropout_47 (Dropout)        (None, 395, 64)           0         
                                                                 
 global_max_pooling1d_14 (Gl  (None, 64)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_36 (Dense)            (None, 87)                5655      
                                                     

In [None]:
# Second entry of the pretrained model's input shape
# (presumably the sequence length it expects -- confirm against the summary)
transfer_maxlen = pretrained_cnn_alt_model.input_shape[1]
print(transfer_maxlen)

# Filter counts of the layers at index 1 and 2 of the pretrained model
num_filters_layer1 = pretrained_cnn_alt_model.layers[1].filters
num_filters_layer2 = pretrained_cnn_alt_model.layers[2].filters

# Only a cell's final expression is displayed, so show both values as a tuple
# instead of leaving the first as a bare expression that is silently discarded
num_filters_layer1, num_filters_layer2

In [None]:
# Define LSTM model architecture on top of the pretrained CNN model
modelTransfserLSTM = Sequential()
modelTransfserLSTM.add(pretrained_cnn_alt_model)
# Turn the pretrained model's output into a length-1 sequence for the LSTM
modelTransfserLSTM.add(Reshape((1, -1)))
modelTransfserLSTM.add(LSTM(embedding_dims, dropout=0.2, recurrent_dropout=0.2))
modelTransfserLSTM.add(Dense(len(label_dict), activation='softmax'))

# Freeze the weights of the pretrained CNN layers so only the new layers are trained
for layer in modelTransfserLSTM.layers[0].layers:
    layer.trainable = False

# Compile the model; the sparse loss matches the integer-encoded labels in y
modelTransfserLSTM.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data. Use the shared `epochs` hyperparameter
# rather than a hard-coded literal so changing it in one place affects every model.
historymodelTransfserLSTM = modelTransfserLSTM.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=(X_val, y_val),
)


In [None]:
# Save under a directory relative to the notebook instead of a machine-specific
# absolute path, creating the directory first so the save cannot fail on it
save_dir = 'SavedModels'
os.makedirs(save_dir, exist_ok=True)
modelTransfserLSTM.save(os.path.join(save_dir, 'modelTransfserLSTM.keras'))

### Evaluation

In [None]:
# Score the first transfer-learning model on the test split and report both metrics
loss, accuracy = modelTransfserLSTM.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# Plot the training and validation accuracy.
# The history object returned by modelTransfserLSTM.fit is named
# historymodelTransfserLSTM; the previously referenced name was never defined
# and raised a NameError.
plt.plot(historymodelTransfserLSTM.history['accuracy'])
plt.plot(historymodelTransfserLSTM.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot the training and validation loss
plt.plot(historymodelTransfserLSTM.history['loss'])
plt.plot(historymodelTransfserLSTM.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

### 

### Transfer Learning (II) - New CNN model with input from another CNN model created in Part 1

In [89]:
# Load the previously saved LSTM-CNN model and print its layer-by-layer summary
lstm_cnn_path = 'models/modelLSTMCNN.keras'
pretrained_modelLSTMCNN_model = load_model(lstm_cnn_path)
pretrained_modelLSTMCNN_model.summary()

Model: "sequential_35"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_27 (Embedding)    (None, 400, 16)           80000     
                                                                 
 conv1d_31 (Conv1D)          (None, 397, 64)           4160      
                                                                 
 dropout_48 (Dropout)        (None, 397, 64)           0         
                                                                 
 global_max_pooling1d_15 (Gl  (None, 64)               0         
 obalMaxPooling1D)                                               
                                                                 
 reshape_12 (Reshape)        (None, 1, 64)             0         
                                                                 
 lstm_23 (LSTM)              (None, 16)                5184      
                                                     

In [None]:
# Define LSTM model architecture on top of the pretrained LSTM-CNN model
modelTransferLSTM2 = Sequential()
modelTransferLSTM2.add(pretrained_modelLSTMCNN_model)
# Turn the pretrained model's output into a length-1 sequence for the new LSTM
modelTransferLSTM2.add(Reshape((1, -1)))
modelTransferLSTM2.add(LSTM(embedding_dims, dropout=0.2, recurrent_dropout=0.2))
modelTransferLSTM2.add(Dense(len(label_dict), activation='softmax'))

# Freeze the weights of the pretrained layers so only the new layers are trained
for layer in modelTransferLSTM2.layers[0].layers:
    layer.trainable = False

# Compile the model; the sparse loss matches the integer-encoded labels in y
modelTransferLSTM2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data. Use the shared `epochs` hyperparameter
# rather than a hard-coded literal so changing it in one place affects every model.
historymodelTransferLSTM2 = modelTransferLSTM2.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=(X_val, y_val),
)


In [None]:
# Save under a directory relative to the notebook instead of a machine-specific
# absolute path, creating the directory first so the save cannot fail on it.
# The file name is kept exactly as before.
save_dir = 'SavedModels'
os.makedirs(save_dir, exist_ok=True)
modelTransferLSTM2.save(os.path.join(save_dir, 'modelTransfserLSTM2.keras'))

### Evaluation

In [None]:
# Score the second transfer-learning model on the test split and report both metrics
loss, accuracy = modelTransferLSTM2.evaluate(X_test, y_test)

for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# One figure per metric (accuracy first, then loss), each showing the training
# curve followed by the validation curve across epochs
for train_key, val_key, title, y_label in (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
):
    plt.plot(historymodelTransferLSTM2.history[train_key])
    plt.plot(historymodelTransferLSTM2.history[val_key])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### Model Comparisons 

### Comparison of Model Variant with no transfer learning to New CNN model with transfer learning I

In [None]:
# Accuracy of the from-scratch model (blue) against the first transfer-learning
# model (orange); solid lines are training curves, dotted lines validation curves
plt.plot(historymodelAltCNNScratch.history['accuracy'], linestyle='solid', color='blue')
plt.plot(historymodelAltCNNScratch.history['val_accuracy'], linestyle='dotted', color='blue')
plt.plot(historymodelTransfserLSTM.history['accuracy'], linestyle='solid', color='orange')
plt.plot(historymodelTransfserLSTM.history['val_accuracy'], linestyle='dotted', color='orange')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Legend entries follow the plotting order above; the last label previously read "IVal"
plt.legend(
    [
        'Model Variant with no transfer learning Train',
        'Model Variant with no transfer learning Val',
        'New CNN model with transfer learning I Train',
        'New CNN model with transfer learning I Val',
    ],
    loc='upper left',
)
plt.show()


### Comparison of Model Variant with no transfer learning to New CNN model with transfer learning II

In [None]:
# Accuracy of the from-scratch model (blue) against the second transfer-learning
# model (orange); solid lines are training curves, dotted lines validation curves
for history, color in (
    (historymodelAltCNNScratch, 'blue'),
    (historymodelTransferLSTM2, 'orange'),
):
    for key, linestyle in (('accuracy', 'solid'), ('val_accuracy', 'dotted')):
        plt.plot(history.history[key], linestyle=linestyle, color=color)

plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
legend_labels = [
    'Model Variant with no transfer learning Train',
    'Model Variant with no transfer learning Val',
    'New CNN model with transfer learning II Train',
    'New CNN model with transfer learning II Val',
]
plt.legend(legend_labels, loc='upper left')
plt.show()


### Comparison of Transfer Learning Models 

In [None]:
# Accuracy of the first transfer-learning model (blue) against the second (orange);
# solid lines are training curves, dotted lines validation curves
plt.plot(historymodelTransfserLSTM.history['accuracy'], linestyle='solid', color='blue')
plt.plot(historymodelTransfserLSTM.history['val_accuracy'], linestyle='dotted', color='blue')
plt.plot(historymodelTransferLSTM2.history['accuracy'], linestyle='solid', color='orange')
plt.plot(historymodelTransferLSTM2.history['val_accuracy'], linestyle='dotted', color='orange')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Legend entries follow the plotting order above. They previously named the
# no-transfer model, which is not plotted in this figure.
plt.legend(
    [
        'New CNN model with transfer learning I Train',
        'New CNN model with transfer learning I Val',
        'New CNN model with transfer learning II Train',
        'New CNN model with transfer learning II Val',
    ],
    loc='upper left',
)
plt.show()
