In [None]:
!pip install transformers

import pandas as pd 
import numpy as np
import regex as re
import tensorflow as tf

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPool1D, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

from transformers import AutoTokenizer,TFBertModel

from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

In [143]:
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPool1D, Embedding, LSTM, SpatialDropout1D, Bidirectional, Reshape, TimeDistributedDense

ImportError: ignored

# Loading dataset

In [None]:
df = pd.read_csv ('gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/tweet_emotions.csv') 

# Cleaning the entire dataset



In [None]:
df.drop('tweet_id', inplace = True, axis = 1)                 # removing tweet_id column
df.drop(df[(df['sentiment'] == 'empty')].index, inplace=True) # removing empty sentiment rows

df.rename(columns = {'content':'sentence'}, inplace = True)   # renaming 'content' column to 'sentence'

df.replace(to_replace = ['fun', 'happiness'], value = "party", inplace = True)               # fun, happiness <-> party
df.replace(to_replace = ['enthusiasm', 'surprise', 'love'], value = "happy", inplace = True) # enthusiasm, surprise, love <-> happy
df.replace(to_replace = ['sadness', 'anger', 'hate'], value = "sad", inplace = True)         # sadness, anger <-> sad
df.replace(to_replace = ['boredom', 'worry'], value = "chill", inplace = True)               # boredom, worry <-> chill
df.replace(to_replace = ['neutral', 'relief'], value = "normal", inplace = True)             # neutral, relief <-> normal

df.dropna(inplace=True) # removing empty values

# Cleaning the sentences

In [None]:
sentences = df.sentence.tolist()

for i in range(len(sentences)):
  sentences[i] = sentences[i].lower()
  sentences[i] = re.sub(r'http\S+', "", sentences[i])
  sentences[i] = re.sub(r"\@\w+", "", sentences[i])
  sentences[i] = re.sub(r"\#\w+", "", sentences[i])
  sentences[i] = re.sub(r'[^\w]', ' ', sentences[i])
  sentences[i] = re.sub(r"\............\w+", "", sentences[i])
  sentences[i] = re.sub(' +', ' ', sentences[i])
  sentences[i] = sentences[i].strip()

# Splitting Train and Test Sentences

In [None]:
train_sentences, test_sentences = train_test_split(sentences, test_size=0.2, shuffle=False) 

# Encoding Sentiments
Making a dictionary of all the sentiments in the dataset \
Then mapping all the sentiments to the dictionary values

In [None]:
sentiments_dict = {'party':0, 'happy':1, 'sad':2, 'chill':3, 'normal':4} # making sentiment dictionary
mapped_sentiments = df.sentiment.map(sentiments_dict) # mapping sentiments

# Splitting Train and Test Sentiment 

Converting sentiments dictionary to numpy array to split train and test data \
Encoding sentiments to vectors\
Splitting train and test sentiments

In [None]:
np_mapped_sentiments = mapped_sentiments.to_numpy() # converting sentiments to np array
encoded_mapped_sentiments = to_categorical(mapped_sentiments) # encoding sentiments 

mapped_train_sentiments, mapped_test_sentiments = train_test_split(np_mapped_sentiments, test_size=0.2, shuffle = False) 
encoded_train_sentiments, encoded_test_sentiments = train_test_split(encoded_mapped_sentiments, test_size=0.2, shuffle = False) # splitting train and test sentiments

# Tokenizing, Padding, Encoding, Embedding Sentences

Using the pretrained Bert model called 'bert-base-cased' to preprocess the data \
https://huggingface.co/bert-base-cased

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
ls = [len(sentence.split()) for sentence in train_sentences] 
max_len = int(np.percentile(ls, 98)) # finding the 98 percentile highest sentence length of the dataset

x_train = tokenizer(
    text= train_sentences,
    add_special_tokens = True,
    max_length = max_len,
    truncation = True,
    padding = True, 
    return_tensors = 'tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

x_test = tokenizer(
    text = test_sentences,
    add_special_tokens = True,
    max_length = max_len,
    truncation = True,
    padding = True, 
    return_tensors = 'tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Checkpoints

Adding checkpoint paths to store save the weights of trained model after each epoch

In [None]:
filepath="/content/gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/checkpoints/cp1.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, mode='max')
callbacks_list = [checkpoint]

# Making the model

Making my model by adding layers to the neural network


In [79]:
def create_model():
  # Input Layers
  input_ids_layer = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")       # input ids layer
  input_mask_layer = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask") # attendtion mask layer
  embeddings_layer = bert(input_ids_layer, attention_mask = input_mask_layer)[0]    # embeddings layer

  # Hidden Layers
  hidden_layer_1 = SpatialDropout1D()(embeddings_layer) 
  hidden_layer_2 = Dense(64, activation='relu')(hidden_layer_1)
  hidden_layer_3 = Dropout(0.5)(hidden_layer_2)
  hidden_layer_4 = Dense(16,activation = 'relu')(hidden_layer_3)
  hidden_layer_5 = Dropout(0.5)(hidden_layer_4)
  hidden_layer_6 = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(hidden_layer_5)

  # Output Layer
  output_layer = Dense(5,activation = 'softmax')(hidden_layer_6) # output layer

  # Models
  model = tf.keras.Model(inputs=[input_ids_layer, input_mask_layer], outputs=output_layer) # model
  model.layers[2].trainable = True

  return model

In [166]:
# def create_model():
#   # Input Layers
#   input_ids_layer = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")       # input ids layer
#   input_mask_layer = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask") # attendtion mask layer
#   l = bert(input_ids_layer, attention_mask = input_mask_layer)[0]    # embeddings layer
#   print(l.shape)
#   # Hidden Layers
#   #l = GlobalMaxPool1D()(l) 
#   #print(type(l))
#   # l = SpatialDropout1D(0.4)(l)
#   l = Dense(64, activation='relu')(l)
#   l = Dropout(0.5)(l)
#   l = Dense(16,activation = 'relu')(l)
#   l = Dropout(0.5)(l) 
#   #l = Reshape((None,1, 16), input_shape=(None,16))
#   print(type(l))
#   # l = TimeDistributedDense(10,input_dim=10)
#   # l = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(l)
#   # l = LSTM(32, dropout=0.2, recurrent_dropout=0.2, input_shape=(None,16))(l)
#   # l = Bidirectional(LSTM(32))(l)

#   # Output Layer
#   output_layer = Dense(5,activation = 'softmax')(l) # output layer

#   # Models
#   model = tf.keras.Model(inputs=[input_ids_layer, input_mask_layer], outputs=output_layer) # model
#   model.layers[2].trainable = True

#   return model


# model = create_model()
# model = compile_model(model)
# model.summary()

(None, 28, 768)
<class 'keras.engine.keras_tensor.KerasTensor'>
Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 28)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 28)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                           

# Compiling the model
Compiling the model using:
- Adam optimizer (experimented with the hyperparameters to find the best one)
- Categorical Cross Entropy (I'm doing multi-class sentiment analysis)
- Categorical Accuracy (using balanced accuracy to see the accuracy of all the classes)


In [84]:
def compile_model(model):
  optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0
  )
  
  loss = CategoricalCrossentropy(from_logits = True)
  metric = CategoricalAccuracy('balanced_accuracy'),

  model.compile(
      optimizer = optimizer,
      loss = loss, 
      metrics = metric
  )

  return model

# Training the model

In [85]:
def train_model(model):
  model.fit(
      x ={'input_ids':x_train['input_ids'],'attention_mask':x_train['attention_mask']}, 
      y = encoded_train_sentiments,
      validation_data = ({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']}, encoded_test_sentiments),
      epochs= 7,
      batch_size= 36,
      callbacks = callbacks_list
  )

  return model

In [86]:
model = create_model()
model = compile_model(model)
model = train_model(model)

ValueError: ignored

# Loading checkpoints

In [None]:
model = create_model()
model.load_weights("/content/gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/checkpoints/cp1.hdf5")
model = compile_model(model)

In [None]:
filepath="/content/gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/checkpoints/cp2.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, mode='max')
callbacks_list = [checkpoint]
model = train_model(model)
model = create_model()
model.load_weights("/content/gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/checkpoints/cp2.hdf5")
model = compile_model(model)

# Saving Model

In [None]:
model.save('gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/mymodel/model.h5')

# Predicting the mood

In [None]:
def get_mood():
  threshold_pred = 67.8 # value calculated after running the program
  texts = input(str('Hey! How are you today? '))

  x_val = tokenizer(
      text=texts,
      add_special_tokens=True,
      max_length=max_len,
      truncation=True,
      padding='max_length', 
      return_tensors='tf',
      return_token_type_ids = False,
      return_attention_mask = True,
      verbose = True
      ) 

  validation = model.predict({'input_ids':x_val['input_ids'],'attention_mask':x_val['attention_mask']}) * 100
  predicted_sentiments = dict()

  for key, value in zip(sentiments_dict.keys(), validation[0]):
      predicted_sentiments[key] = value

  predicted_mood = max(zip(predicted_sentiments.values(), predicted_sentiments.keys()))[1]
  predicted_mood_value = max(zip(predicted_sentiments.values(), predicted_sentiments.keys()))[0]

  if predicted_mood_value < threshold_pred:
    return "I'm sorry! I'm not sure how you feel"

  return predicted_mood


In [None]:
get_mood()