In [1]:
import tensorboard
import tensorflow as tf
import keras 
import tensorflow_hub as hub
import tensorflow_text
import pandas as pd

In [2]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
import numpy as np

In [3]:
data = pd.read_csv("./Data/mindsetdsV2.csv")

In [4]:
# data preprocessing

#remove the URLS
data['sentence'] = data['sentence'].apply(lambda x: ' '.join([word for word in str(x).split() if word[0:4] not in ('http')]))
#remove the words with @ and # from the text
data['sentence'] = data['sentence'].apply(lambda x: ' '.join([word for word in str(x).split() if word[0] not in ('@', '#')]))
data['sentence'] = data['sentence'].apply(lambda x: str(x).lower())
data['sentence'] = data['sentence'].apply(lambda x: x.replace('\n', ' '))
data['sentence'] = data['sentence'].apply(lambda x: x.replace('\t', ' '))
data['sentence'] = data['sentence'].apply(lambda x: x.replace('\r', ' '))
data['sentence'] = data['sentence'].apply(lambda x: x.replace('\xa0', ' '))
data['sentence'] = data['sentence'].apply(lambda x: x.replace('\u200b', ' '))
data['sentence'] = data['sentence'].apply(lambda x: x.replace('\ufeff', ' '))

stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are',  'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'could',  'did', 'do', 'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadnt', 'has', 'have', 'havent', 'having', 'he', 'hed', 'hell', 'hes', 'her', 'here', 'heres', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'hows', 'i', 'id', 'ill', 'im', 'ive', 'if', 'in', 'into', 'is', 'isnt', 'it', 'its', 'its', 'itself', 'lets', 'me', 'more', 'most', 'mustnt', 'my', 'myself', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'she', 'shed', 'shell', 'shes', 'should', 'so', 'some', 'such', 'than', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'theres', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'wed', 'well', 'were', 'weve', 'were', 'what', 'whats', 'when', 'whens', 'where', 'wheres', 
        'which', 'while', 'who', 'whos', 'whom', 'why', 'whys', 'with', 'would', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves']
#removing stop words
data['sentence'] = data['sentence'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop_words)]))



#remove punchations
import string
punctuations = string.punctuation
data['sentence'] = data['sentence'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (punctuations)]))

In [5]:
#label encoding
#if positive label = 2
#if neutral label = 1
#if negative label = 0

data['label'] = data['label'].apply(lambda x: 2 if x == 'Positive' else x)
data['label'] = data['label'].apply(lambda x: 1 if x == 'Neutral' else x)
data['label'] = data['label'].apply(lambda x: 0 if x == 'Negative' else x)


In [6]:
data['label'].unique()

array([2, 1, 0], dtype=int64)

In [7]:
#split the dataset into train, validation and test
train_dataset = data.sample(frac=0.8, random_state=423)
test_dataset = data.drop(train_dataset.index)
validation_dataset = test_dataset.sample(frac=0.5, random_state=423)
test_dataset = test_dataset.drop(validation_dataset.index)

In [8]:
train_ds = tf.data.Dataset.from_tensor_slices((train_dataset['sentence'], train_dataset['label']))

In [9]:
val_ds = tf.data.Dataset.from_tensor_slices((validation_dataset['sentence'], validation_dataset['label']))

In [10]:
#display the first 5 rows of the train dataset
train_dataset.head()

Unnamed: 0,label,sentence
47831,0,
34002,0,collateral revolver satisfying youtu.be/_quo1_...
39988,0,highest disrespect.
29996,1,business pulling wrong erp system?. upgrade mi...
57466,0,hey guys. tom clancy ’ s ghost recon wild land...


In [11]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 16
seed = 43

train_ds = train_ds.shuffle(buffer_size=len(data)).batch(batch_size)
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = val_ds.shuffle(buffer_size=len(val_ds)).batch(batch_size)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [12]:
tfhub_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
tfhub_prepocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"


In [13]:
from tensorflow.keras.optimizers import Adam

In [None]:
def create_mindset_evaluation_model():
    text_input = keras.Input(shape=(), dtype='string', name='text_input')
    preprocessing_layer = hub.KerasLayer(tfhub_prepocess, name='Preprocessing_for_BERT')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_encoder, trainable=False, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    x = keras.layers.Dense(128, activation='sigmoid')(outputs['sequence_output'])
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(64, activation='sigmoid')(x)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(32, activation='sigmoid')(x)
    x = keras.layers.Dropout(0.25)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(3, activation='softmax')(x)
    model = keras.Model(inputs=[text_input], outputs=x)
    model.compile(optimizer=Adam(lr=0.0001),
                    loss='sparse_categorical_crossentropy',             
                    metrics=['accuracy'])
    return model

In [14]:
# create the deep learning model to classify text into three categories
text_input = keras.Input(shape=(), dtype='string', name='text_input')
preprocessing_layer = hub.KerasLayer(tfhub_prepocess, name='Preprocessing_for_BERT')
encoder_inputs = preprocessing_layer(text_input)
encoder = hub.KerasLayer(tfhub_encoder, trainable=False, name='BERT_encoder')
outputs = encoder(encoder_inputs)
x = keras.layers.Dense(128, activation='relu')(outputs['sequence_output'])
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(64, activation='relu')(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(32, activation='relu')(x)
x = keras.layers.Dropout(0.25)(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(3, activation='softmax')(x)
model = keras.Model(inputs=[text_input], outputs=x)
model.compile(optimizer=Adam(lr=0.0001),
                loss='sparse_categorical_crossentropy',             
                metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 Preprocessing_for_BERT (KerasL  {'input_type_ids':   0          ['text_input[0][0]']             
 ayer)                          (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

  super(Adam, self).__init__(name, **kwargs)


In [15]:
from tensorflow.keras.callbacks import TensorBoard
import time

# logs about the training process
# accuracy and loss are plotted in the TensorBoard
log_dir = "logs/fit/" + str(int(time.time()))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [16]:
epochs = 100
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=epochs,
                    verbose=1,
                    callbacks=[
                        keras.callbacks.TensorBoard("logs/fit/" + str(int(time.time())),
                                                    histogram_freq=1,
                                                    write_graph=True,
                                                    write_images=True),
                        keras.callbacks.EarlyStopping(patience=10,
                                                      monitor='val_loss')

                    ])


Epoch 1/50
Epoch 2/50
Epoch 3/50
 600/3085 [====>.........................] - ETA: 12:39 - loss: 0.7426 - accuracy: 0.6811

In [None]:
test_ds = tf.data.Dataset.from_tensor_slices((test_dataset['sentence'], test_dataset['label']))

In [None]:
# create the test dataset
test_ds = test_ds.shuffle(buffer_size=len(test_ds)).batch(batch_size)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)


# evaluate the model
results = model.evaluate(test_ds, verbose=1, callbacks=[
    keras.callbacks.TensorBoard(log_dir='./logs',
                                histogram_freq=1,
                                write_graph=True,
                                write_images=True)]
)


# print the results
print('Test loss:', results[0])
print('Test accuracy:', results[1])

In [None]:
#draw the accuracy and loss curves
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.5, 1])
plt.legend(loc='lower right')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.ylim([0, 1])
plt.show()



In [4]:
# Test Model with Sample Test Data
text = 'Personally I am not go in that way. It is a stress for me. '


# get prdiction from the model
prediction = model.predict([text])

#get the label from the prediction
label = np.argmax(prediction)

# print the label
print(label)

0


In [None]:
#reduce the test_dataset by half randomly
t2dataset = test_dataset.sample(frac=1, random_state=seed)

In [None]:
len(test_dataset)

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# get the predictions
predictions = model.predict(t2dataset['sentence'])

# get the labels
y_pred = np.argmax(predictions, axis=1)

In [None]:
train_dataset['label'].unique()

In [None]:
#model name with date and time
model_name = 'model_' + str(int(time.time()))

# save the model
model.save(model_name + '.h5')

In [None]:
model_name = 'model_1665444162'

In [None]:
# get the confusion matrix for three classes
from operator import index
from re import M


cm = confusion_matrix(t2dataset['label'], y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=['negative', 'neutral', 'positive'], yticklabels=['negative', 'neutral', 'positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# create directory
import os
try:
    os.mkdir("./Model Evaluation/" + model_name)
except:
    pass

#save confusion matrix
plt.savefig('./Model Evaluation/' + model_name + '/confusion_matrix.png')

#calculate the accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(t2dataset['label'], y_pred)
print('Accuracy: %f' % accuracy)

#f1 score
from sklearn.metrics import f1_score
f1 = f1_score(t2dataset['label'], y_pred, average='macro')
print('F1 score: %f' % f1)

#precision
from sklearn.metrics import precision_score
precision = precision_score(t2dataset['label'], y_pred, average='macro')
print('Precision: %f' % precision)

#recall
from sklearn.metrics import recall_score
recall = recall_score(t2dataset['label'], y_pred, average='macro')
print('Recall: %f' % recall)


# save matrix into a txt file
with open('./Model Evaluation/' + model_name + '/matrix.txt', 'w') as f:
    f.write('Accuracy: %f \n' % accuracy)
    f.write('F1 score: %f \n' % f1)
    f.write('Precision: %f \n' % precision)
    f.write('Recall: %f \n' % recall)





In [None]:
#Error Analysis

#Root Mean Squared Error
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(t2dataset['label'], y_pred))
print('Root Mean Squared Error: %f' % rms)

#Mean Absolute Error
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(t2dataset['label'], y_pred)

print('Mean Absolute Error: %f' % mae)

#Mean Squared Error
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(t2dataset['label'], y_pred)

print('Mean Squared Error: %f' % mse)


In [None]:
prediction = np.argmax(prediction, axis=1)

In [None]:
#save the model
model.save('./Models/' + model_name)

In [3]:
# load the model
model = keras.models.load_model('./Models/' + 'model_1665444162')

In [None]:
#plot the model
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='./Model Output/' + model_name + '.png', show_shapes=True, show_layer_names=True)

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()


plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

#save the plots
plt.savefig('./Model Evaluation/' + model_name + '/accuracy.png')
plt.savefig('./Model Evaluation/' + model_name + '/loss.png')