# Loading the data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from simpletransformers.language_representation import RepresentationModel
from tensorflow.keras.utils import to_categorical

from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


# Cleaning the data

In [None]:
df = pd.read_csv ('gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/tweet_emotions.csv') # loading the dataset
df.drop('tweet_id', inplace = True, axis = 1) # removing tweet_id column
df.dropna(inplace=True) # removing empty values
df.rename(columns = {'content':'sentence'}, inplace = True)

# CSV -> Sentence

In [None]:
cols_as_np = df[df.columns[0:]].to_numpy() # converting pd into np
 
emotions = [data[0] for data in cols_as_np] # extracing sentences
sentences = [data[1] for data in cols_as_np] # extracting emotions

# Splitting train, test, val data

In [None]:
train_sentences, test_sentences, train_emotions, test_emotions = train_test_split(sentences, emotions, test_size=0.2, random_state=1)
train_sentences, val_sentences, train_emotions, val_emotions = train_test_split(train_sentences, train_emotions, test_size=0.25, random_state=1) 

# Emotions -> Categorical


In [None]:
emotions = np.unique(emotions) #getting the unique intents

encoded_emotions_dict = {'anger':0, 'boredom':1, 'empty':2, 'enthusiasm':3, 'fun':4, 'happiness':5,
                'hate':6, 'love':7, 'neutral':8, 'relief':9, 'sadness':10, 'surprise':11, 'worry':12}

for index, emotion in enumerate(train_emotions):
  if emotion in encoded_emotions_dict:
    train_emotions[index] = encoded_emotions_dict[emotion]

for index, emotion in enumerate(test_emotions):
  if emotion in encoded_emotions_dict:
    test_emotions[index] = encoded_emotions_dict[emotion]

for index, emotion in enumerate(val_emotions):
  if emotion in encoded_emotions_dict:
    val_emotions[index] = encoded_emotions_dict[emotion]

categorical_train_emotions = to_categorical(train_emotions) # y_train
categorical_test_emotions = to_categorical(test_emotions) # y_test
categorical_val_emotions = to_categorical(val_emotions)

# Tokenizing, Padding & Dictionary

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=True, pad_token="[PAD]") # tokenizing each sentennce

for sentence in train_sentences:
  token = tokenizer.tokenize(sentence) # tokenizing each word in the sentence
  token = ['[CLS]'] + token + ['[SEP]']
  tokenizer.convert_tokens_to_ids(token) # giving id numbers to each word
  print(token)
  break

dictionary = tokenizer.vocab


['[CLS]', '@', 's', '##g', '##beat', ':', 'k', '##wang', '##ho', '##ck', '@', 'v', '##alk', '##yr', '##ies', '##life', '@', 'ka', '##hh', '##ong', '##ta', '##y', '@', 'j', '##ere', '##my', '##8', '##9', '##6', '##32', 'it', 'is', 'the', 'ha', '##o', 'da', 'z', '##a', 'ji', 'p', '##a', '?', '?', 'i', 'miss', 'the', 'food', 'there', 'so', 'much', '?', '?', '[SEP]']


# Encoding


In [None]:
ls = [len(sentence.split()) for sentence in train_sentences]
max_len = int(np.percentile(ls, 98))

def encode(sentence):
  encoding = tokenizer.encode_plus(
      sentence, 
      max_length = max_len,
      add_special_tokens = True,
      pad_to_max_length = True,
      return_attention_mask = True,
      return_token_type_ids = False,
      verbose = True
  )

  return encoding

train_sentences_encoded = [encode(sentence) for sentence in train_sentences]
test_sentences_encoded = [encode(sentence) for sentence in test_sentences]

# train_emotions_encoded = [encode(sentence) for sentence in train_emotions]
# test_emotions_encoded = [encode(sentence) for sentence in test_emotions]

# val_sentences_encoded = [encode(sentence) for sentence in val_sentences]
# val_emotions_encoded = [encode(sentence) for sentence in val_emotions]



TypeError: ignored

In [None]:
input_ids = []
input_mask = []

test_input_ids = []
test_input_mask = []

for encoding in train_sentences_encoded:
  for input_id in encoding['input_ids']:
    input_ids.append(input_id)

  for mask_value in encoding['attention_mask']:
    input_mask.append(mask_value)

 for encoding in test_sentences_encoded:
  for input_id in encoding['input_ids']:
    input_ids.append(input_id)

  for mask_value in encoding['attention_mask']:
    input_mask.append(mask_value) 

# Embedding

# Model

In [None]:
import tensorflow as tf
import bert
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [None]:
# input_ids = x_train['input_ids']
# attention_mask = x_train['attention_mask']

<tf.Tensor: shape=(24000, 70), dtype=int32, numpy=
array([[  101,   137,   188, ...,     0,     0,     0],
       [  101,   137, 11148, ...,     0,     0,     0],
       [  101,   146,  1125, ...,     0,     0,     0],
       ...,
       [  101,  1921,  5837, ...,     0,     0,     0],
       [  101,  8325,  2537, ...,     0,     0,     0],
       [  101,   137,  4267, ...,     0,     0,     0]], dtype=int32)>

In [None]:
# max_len = 70

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
embeddings = bert(input_ids,attention_mask = input_mask)[0]

out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
y = Dense(6,activation = 'sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

In [115]:
optimizer = Adam(
    learning_rate=5e-05, # this learning rate is for bert model , taken from huggingface website 
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss =CategoricalCrossentropy(from_logits = True)
metric = CategoricalAccuracy('balanced_accuracy'),
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

keras.engine.keras_tensor.KerasTensor

In [125]:
train_history = model.fit(
    x = {'input_ids':input_ids,'attention_mask':input_mask} ,
    y = categorical_train_emotions,
    epochs=1,
    batch_size = 36
)





TypeError: ignored