In [None]:
# https://medium.com/@yashvardhanvs/classification-using-pre-trained-bert-model-transfer-learning-2d50f404ed4c

In [None]:
!pip install transformers

!pip install wget
import wget

import pandas as pd
import numpy as np
import regex as re

# Mount Google Drive under /content/gdrive/. Later cells read and write through
# relative 'gdrive/MyDrive/...' paths, so they assume the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

import transformers
import keras

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPool1D, Embedding

from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus.
# NOTE(review): no cell visible in this notebook uses `stopwords` — confirm it is still needed.
nltk.download('stopwords')

import string

from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

# Loading and Cleaning DataSet

In [None]:
# Load the raw tweet-emotion dataset from the mounted Drive folder.
df = pd.read_csv('gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/tweet_emotions.csv')

df = df.drop(columns='tweet_id') # removing tweet_id column
df = df.drop(index=df.index[df['sentiment'] == 'empty']) # removing rows labelled 'empty'
df = df.rename(columns={'content': 'sentence'}) # renaming 'content' column to 'sentence'

# Collapse the fine-grained emotions into five coarse classes.
sentiment_groups = {
    'fun': 'party', 'happiness': 'party',
    'enthusiasm': 'happy', 'surprise': 'happy', 'love': 'happy',
    'sadness': 'sad', 'anger': 'sad', 'hate': 'sad',
    'boredom': 'chill', 'worry': 'chill',
    'neutral': 'normal', 'relief': 'normal',
}
# Apply the grouping to the label column only: a frame-wide replace would also
# rewrite any tweet whose entire text happens to equal one of these words.
df['sentiment'] = df['sentiment'].replace(sentiment_groups)

# Row index labels are intentionally left as loaded (no reset_index).
df = df.dropna() # removing rows with missing values

In [None]:
def _clean_tweet_text(text):
  """Strip URLs, @mentions, #hashtags and punctuation from one tweet, then collapse whitespace."""
  text = re.sub(r'http\S+', "", text)   # links
  text = re.sub(r"\@\w+", "", text)     # @mentions
  text = re.sub(r"\#\w+", "", text)     # #hashtags
  text = re.sub(r'[^\w]', " ", text)    # every remaining non-word character becomes a space
  return " ".join(text.split())         # squeeze runs of whitespace and trim the ends


def clean_data(df):
  """Clean the tweet text and integer-encode the sentiment labels.

  Args:
    df: DataFrame with a string 'sentence' column and a 'sentiment' column
      holding the class names 'party', 'happy', 'sad', 'chill' or 'normal'.

  Returns:
    The same DataFrame object, modified in place: 'sentence' is cleaned and
    'sentiment' holds the class ids 0-4. Labels that are not in the dictionary
    (including already-encoded ids) are left unchanged, so calling this twice
    is harmless.
  """
  df['sentence'] = df['sentence'].apply(_clean_tweet_text)

  sentiments_dict = {'party':0, 'happy':1, 'sad':2, 'chill':3, 'normal':4} # making sentiment dictionary

  # Encode by value over the whole column, so the result does not depend on
  # which index labels exist or on how many rows the frame has.
  df['sentiment'] = df['sentiment'].map(lambda label: sentiments_dict.get(label, label))

  return df

In [None]:
# clean_data modifies `df` in place and returns it, so `cleaned_data` and `df`
# refer to the same DataFrame after this cell.
cleaned_data = clean_data(df)

# Splitting Data

In [None]:
# Pull the two model columns out of the cleaned frame as plain arrays.
tweets = cleaned_data['sentence'].values   # cleaned tweet texts
labels = cleaned_data['sentiment'].values  # encoded sentiment labels

In [None]:
# Hold out part of the data for validation (default test size). Fixing
# random_state makes the split identical across kernel restarts, so results are reproducible.
train_input, val_input, train_label, val_label = train_test_split(tweets, labels, random_state=42)

In [None]:
# Sanity check: inputs and labels of the training split must line up row for row.
print("Shape of train input and label should be the same")
for split_array in (train_input, train_label):
  print(split_array.shape)

Shape of train input and label should be the same
(29379,)
(29379,)


# Tokenizing

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint, asking it to lower-case its input.
# BertTokenizer is already imported in the setup cell; the re-import here is redundant but harmless.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Length of the longest cleaned tweet (0 when there are no tweets).
# Note: len() counts characters, not BERT tokens; this value is later passed
# to the tokenizer as max_length.
max_len = max((len(tweet) for tweet in tweets), default=0)

print('Max len', max_len)

Max len 158


# Encoding inputs (token IDs and attention masks)

In [None]:
def mask_inputs_for_bert(tweets, max_len):
  """Encode tweets into fixed-length BERT input ids and attention masks.

  Uses the notebook-level `tokenizer`.

  Args:
    tweets: iterable of tweet strings.
    max_len: length every encoded sequence is padded or truncated to.

  Returns:
    Tuple (input_ids, attention_mask) of tensors with one row per tweet.
  """
  input_ids = []
  attention_mask = []

  for tweet in tweets:
    # padding='max_length' replaces the deprecated pad_to_max_length=True, and
    # truncation is requested explicitly instead of relying on the implicit
    # default that triggers a warning whenever max_length is given.
    encoded_dict = tokenizer.encode_plus(tweet,
                                         add_special_tokens = True,
                                         max_length = max_len,
                                         padding = 'max_length',
                                         truncation = True,
                                         return_attention_mask = True)
    input_ids.append(encoded_dict['input_ids'])
    attention_mask.append(encoded_dict['attention_mask'])

  input_ids = tf.convert_to_tensor(input_ids)
  attention_mask = tf.convert_to_tensor(attention_mask)

  return input_ids, attention_mask

In [None]:
# Encode both splits into token ids plus attention masks.
# NOTE: this rebinds train_input / val_input from raw text arrays to tensors, so
# re-running this cell without re-running the split cell would pass tensors,
# not text, to the tokenizer.
train_input, train_mask = mask_inputs_for_bert(train_input, max_len)
val_input, val_mask = mask_inputs_for_bert(val_input, max_len)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Each ids/mask pair should report the same (num_examples, max_len) shape.
print(f"train input: {train_input.shape}")
print(f"mask input: {train_mask.shape}")  # despite the label, this is the *training* attention mask
print()
print(f"val input: {val_input.shape}")
print(f"val mask: {val_mask.shape}")

train input: (29379, 158)
mask input: (29379, 158)

val input: (9794, 158)
val mask: (9794, 158)


In [None]:
# Cast the label arrays to int32 and wrap them as tensors for training.
train_label = tf.convert_to_tensor(np.asarray(train_label).astype(np.int32))
val_label = tf.convert_to_tensor(np.asarray(val_label).astype(np.int32))

In [None]:
# Label tensors are 1-D: one integer class id per example.
print(f"train label: {train_label.shape}")
print(f"val label: {val_label.shape}")

train label: (29379,)
val label: (9794,)


# Training

In [None]:
# Load pretrained 'bert-base-uncased' with a sequence-classification head.
# num_labels = 5 matches the five classes in clean_data's sentiments_dict.
# TFBertForSequenceClassification is already imported in an earlier cell; the re-import is redundant but harmless.
from transformers import TFBertForSequenceClassification
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 5)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
log_dir = 'gdrive/MyDrive/Colab Notebooks/Sentiment Analysis'
model_save_path = 'gdrive/MyDrive/Colab Notebooks/Sentiment Analysis/bert_model.h5'

# Keep only the weights from the epoch with the lowest validation loss.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                                   save_weights_only=True,
                                                   monitor='val_loss',
                                                   mode='min',
                                                   save_best_only=True)

# Both callbacks come from tf.keras, the same Keras namespace used for the loss,
# metric and optimizer, instead of mixing in the standalone `keras` package.
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_cb, tensorboard_cb]

In [None]:
# Sparse loss/metric variants are used because the labels are integer class ids (cast to int32 above), not one-hot vectors.
# from_logits=True assumes the model returns raw, un-normalised scores — TODO confirm against the model's output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy ('accuracy')
# Small learning rate for fine-tuning the pretrained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

In [None]:
# Attach the loss, optimizer and accuracy metric defined in the previous cell.
bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Fine-tune on the encoded training split and evaluate on the validation split after each epoch.
# Inputs are passed as [token ids, attention mask]; the callbacks checkpoint the
# best weights (by val_loss) and write TensorBoard logs.
history = bert_model.fit([train_input, train_mask],
                         train_label,
                         batch_size = 128,
                         epochs = 4,
                         validation_data = ([val_input, val_mask], val_label),
                         callbacks=callbacks)

Epoch 1/4
