## Tweet Sentiment Analysis with RoBERTa

Goal: predict which part of a tweet (a word or phrase) supports its sentiment label.

### Acknowledgements

RoBERTa Glossary: https://huggingface.co/transformers/model_doc/roberta.html

I learned a lot by reading the following kernels on Kaggle:

* https://www.kaggle.com/vbmokin/tse2020-roberta-cnn-outlier-analysis-3chr/data
* https://www.kaggle.com/vbmokin/nlp-eda-bag-of-words-tf-idf-glove-bert
* https://www.kaggle.com/vbmokin/covid-19-week5-global-forecasting-eda-extratr
* https://www.kaggle.com/khoongweihao/tse2020-roberta-cnn-random-seed-distribution
* https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/142404#809872
* https://www.kaggle.com/seesee/faster-2x-tf-roberta
* https://www.kaggle.com/cdeotte/tf-roberta
* https://www.kaggle.com/abhishek/roberta-inference-5-folds
* https://www.kaggle.com/khoongweihao/tse2020-roberta-cnn-random-seed-distribution?scriptVersionId=34448972

### Add-ons
* Customized stopwords
* Lemmatizer
* Remove periods, commas and pipes, but keep asterisks
* Stemming
* Tuning and regularization
* Different learning rates for different epochs

*Note*: Stopword removal, lemmatization, punctuation removal and stemming were not applied in this project, because the goal is to predict which part of a text is a valid indication of its sentiment. These methods are included here only to demonstrate data-cleaning techniques in NLP.

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set(style='white')
from mpl_toolkits.mplot3d import Axes3D


from sklearn.decomposition import PCA, TruncatedSVD
import math
import pickle


from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import tensorflow as tf
import tensorflow.keras.backend as K
from transformers import *
import tokenizers
from sklearn.model_selection import StratifiedKFold

import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from nltk.stem.wordnet import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

from nltk.stem import PorterStemmer
ps = PorterStemmer()

#truncate displayed cell text to 40 characters (pandas' default is 50)
pd.set_option('max_colwidth', 40)

### Cleaning and Importing Data

In [None]:
#load the training data; missing values are replaced by empty strings
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
train = train.fillna('')

In [3]:
#customize STOPWORDS: the words below are taken out of the stopword set,
#so remove_stopword leaves them in the text
KEPT_WORDS = {
    "mustn't", "would", "but", "only", "too", "over", "with", "down",
    "against", "won't", "haven't", "below", "like", "all", "can't", "not",
    "isn't", "wouldn't", 'off', "doesn't", 'ought', "aren't", "didn't",
    "don't", 'no', "couldn't", 'cannot', 'what', "wasn't", "weren't",
    'above', 'nor', "shan't", "however", "hadn't", "up", "why",
}
STOPWORDS.difference_update(KEPT_WORDS)
def remove_stopword(x):
    """Return the whitespace-separated tokens of x that are not stopwords.

    The membership test is done on the lowercased token: the entries listed
    for STOPWORDS in this file are lowercase (wordcloud's own entries are
    presumably lowercase too - confirm), so a capitalised token such as
    "The" would otherwise never match. Kept tokens keep their original case.

    Args:
        x: text to filter.

    Returns:
        list of the tokens of ``x.split()`` whose lowercase form is not in
        STOPWORDS, in their original order.
    """
    return [y for y in x.split() if y.lower() not in STOPWORDS]

#make sure both text columns hold str values before cleaning them
for column in ('text', 'selected_text'):
    train[column] = train[column].astype(str)

#apply customized STOPWORDS: rebuild each value from the tokens that survive
for column in ('selected_text', 'text'):
    train[column] = train[column].apply(lambda x: " ".join(remove_stopword(x)))

#drop rows where either column ended up as an empty string
has_text = train['text'] != ''
has_selection = train['selected_text'] != ''
train = train[has_text & has_selection]

In [6]:
#stemming: replace every whitespace-separated word by its Porter stem
for column in ('selected_text', 'text'):
    train[column] = train[column].apply(
        lambda x: " ".join(ps.stem(word) for word in x.split())
    )

In [7]:
#lemmatizer
def lemmatize_text(x):
    """Lemmatize every whitespace-separated word of x with pos tag 'v'.

    Returns the lemmas as a list, in the order the words appear in x.
    """
    lemmas = []
    for w in x.split():
        lemmas.append(lemmatizer.lemmatize(w, 'v'))
    return lemmas
#lemmatize both text columns and join the lemmas back with single spaces
for column in ('selected_text', 'text'):
    train[column] = train[column].apply(lambda x: " ".join(lemmatize_text(x)))

In [None]:
#remove only commas, periods and pipes; other punctuation is kept because
#some texts consist of punctuation only
def remove_periods(x):
    """Strip ',', '.' and '|' from every whitespace-separated word of x.

    Returns one entry per word of ``x.split()``; a word made only of the
    removed characters becomes an empty string rather than being dropped.
    """
    table = str.maketrans({',': '', '.': '', '|': ''})
    cleaned = []
    for word in x.split():
        cleaned.append(word.translate(table))
    return cleaned
#strip the selected punctuation from both text columns
for column in ('selected_text', 'text'):
    train[column] = train[column].apply(lambda x: " ".join(remove_periods(x)))

In [11]:
#renumber the rows 0..n-1 after the row filtering above; drop=True discards the
#old index instead of keeping it as a column (the tokenizing loop reads rows with train.loc[k, ...])
train=train.reset_index(drop=True)

In [None]:
#tokenize training data for RoBERTa to understand
#NOTE: MAX_LEN, sentiment_id and tokenizer are assigned in later cells of this
#file; those cells must have been run before this one
ct = train.shape[0]
#input ids start as all ones (PAD_ID is 1); masks and span targets start as zeros
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(train.shape[0]):

    # FIND OVERLAP
    #normalise whitespace; text1 gets a leading space, text2 does not
    text1 = " "+" ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    #chars[i] == 1 marks character i of text1 as part of the selected text
    chars = np.zeros((len(text1)))
    #str.find returns -1 when selected_text is not a substring of text (possible
    #once both columns have been cleaned independently); using -1 as an index
    #would mark characters at the end of text1, so such rows get no span at all
    if idx >= 0:
        chars[idx:idx+len(text2)] = 1
        #also claim the space right before the selection
        if idx > 0 and text1[idx-1] == ' ':
            chars[idx-1] = 1
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    #character span of every token, rebuilt from the decoded length of each token id
    offsets = []
    pos = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((pos,pos+len(w)))
        pos += len(w)

    # START END TOKENS
    #indices of the tokens that overlap at least one selected character
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i)

    #row layout: [0, sentiment id] + token ids + [2]; token i sits at position i+2
    s_tok = sentiment_id[train.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+3] = [0, s_tok] + enc.ids + [2]
    attention_mask[k,:len(enc.ids)+3] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+2] = 1
        end_tokens[k,toks[-1]+2] = 1

In [12]:
#input test data
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv').fillna('')

#tokenize the test data with the same row layout as the training data
ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')

for k, (raw_text, sentiment) in enumerate(zip(test['text'], test['sentiment'])):

    # INPUT_IDS
    #normalise whitespace and add the leading space before encoding
    text1 = " " + " ".join(raw_text.split())
    enc = tokenizer.encode(text1)
    #row layout: [0, sentiment id] + token ids + [2]
    row = [0, sentiment_id[sentiment]] + enc.ids + [2]
    input_ids_t[k,:len(row)] = row
    attention_mask_t[k,:len(row)] = 1

### Tuning Hyperparameters <a class="anchor" id="3.1"></a>


In [13]:
#tune hyperparameters; learning rates were tuned inside of model along with each epoch
#width of every padded row of token ids / masks / span targets
MAX_LEN = 106
EPOCHS = 7 
BATCH_SIZE = 32 
#id that build_model counts as padding (input id rows are pre-filled with ones)
PAD_ID = 1
#set up seeds for TensorFlow and NumPy random number generation
SEED = 88888
tf.random.set_seed(SEED)
np.random.seed(SEED)
#to address overfitting and overconfidence:
#label smoothing used by loss_fn, dropout rate used by build_model
LABEL_SMOOTHING = 0.1
Dropout_new = 0.18     
#five folds (five saved models are loaded at inference time)
n_split = 5            

#id written right after the leading 0 of every input row to mark the tweet's sentiment
#presumably the RoBERTa vocabulary ids of these three words - TODO confirm against the vocab file
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

### Training Model <a class="anchor" id="3.2"></a>


In [None]:
#directory with the pre-trained RoBERTa files (vocab, merges; config and weights are read from it in build_model)
PATH = '/kaggle/input/tf-roberta/'
#byte-level BPE tokenizer built from the RoBERTa vocab and merges files;
#configured to lowercase the text and to add a prefix space
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)

In [14]:
#pickle all weights of a model to disk
def save_weights(model, dst_fn):
    """Write the result of ``model.get_weights()`` to the file dst_fn with pickle.

    Args:
        model: object exposing ``get_weights()``.
        dst_fn: path of the file to create or overwrite.
    """
    snapshot = model.get_weights()
    with open(dst_fn, mode='wb') as out_file:
        pickle.dump(snapshot, out_file)

In [15]:
#restore pickled weights into a model
def load_weights(model, weight_fn):
    """Unpickle the weights stored in weight_fn and set them on model.

    Args:
        model: object exposing ``set_weights(weights)``.
        weight_fn: path of a pickle file holding the weights.

    Returns:
        The same model object, after ``set_weights`` has been called.
    """
    with open(weight_fn, mode='rb') as in_file:
        restored = pickle.load(in_file)
    model.set_weights(restored)
    return model

In [16]:
#categorical cross-entropy with label smoothing, averaged over the batch
def loss_fn(y_true, y_pred):
    """Mean label-smoothed categorical cross-entropy between y_true and y_pred.

    y_true is cut along axis 1 to the width of y_pred before the loss is
    computed. y_pred is treated as probabilities (``from_logits=False``).
    """
    #keep only the leading columns of the labels that the predictions cover
    pred_width = tf.shape(y_pred)[1]
    trimmed_true = y_true[:, :pred_width]
    per_sample = tf.keras.losses.categorical_crossentropy(
        trimmed_true,
        y_pred,
        from_logits=False,
        label_smoothing=LABEL_SMOOTHING,
    )
    #a single scalar: the mean over every element of the per-sample loss
    return tf.reduce_mean(per_sample)

In [None]:
#define a networking model with Tensorflow
def build_model(lr=3e-5):
    """Build and compile the RoBERTa network with a start head and an end head.

    Args:
        lr: learning rate handed to the Adam optimizer. The body used to read
            a name ``lr`` that this file never defines, which raised NameError
            on every call. The 3e-5 default is an assumption - TODO confirm
            the rate used to train the saved weights. It only matters when the
            returned model is trained.

    Returns:
        A compiled ``tf.keras`` model. Inputs are ``[ids, att, tok]``, each an
        int32 tensor of shape (batch, MAX_LEN). Outputs are ``[x1, x2]``, two
        softmax distributions over token positions (the inference code treats
        x1 as the start and x2 as the end of the span). Their width is the
        longest non-padded row of the batch, not MAX_LEN.
    """
    #with functional API, create model's inputs and outputs
    #####inputs#####
    #input ids: token ids of each row, padded with PAD_ID up to MAX_LEN
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    #attention mask: 1 for positions holding a token, 0 for padding positions
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    #token_type_ids: filled with zeros everywhere in this file
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    
    #trim the batch to its longest row so padding columns are not processed
    #1 where the id equals PAD_ID, else 0
    padding = tf.cast(tf.equal(ids, PAD_ID), tf.int32)
    #number of non-padding ids in every row
    lens = MAX_LEN - tf.reduce_sum(padding, -1)
    #longest row of the batch
    max_len = tf.reduce_max(lens)
    #cut all three inputs to that length
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    tok_ = tok[:, :max_len]
    
    #load the configuration, then the pre-trained weights, from PATH
    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    #import a pretrained model from Kaggle
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    #run RoBERTa on the trimmed inputs; x[0] is used below as the per-token output
    #(presumably the last hidden state - confirm against the transformers docs)
    x = bert_model(ids_,attention_mask=att_,token_type_ids=tok_)
    
    #####output#####
    #first head (start position)
    #dropout against overfitting, at rate Dropout_new
    x1 = tf.keras.layers.Dropout(Dropout_new)(x[0])
    #convolution over neighbouring positions; padding='same' keeps the sequence length
    x1 = tf.keras.layers.Conv1D(filters=768, kernel_size=2,padding='same')(x1)
    #apply leaky version of ReLU; the shape is unchanged
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(64, 2,padding='same')(x1)
    #one score per token position
    x1 = tf.keras.layers.Dense(1)(x1)
    #drop the trailing size-1 axis: (batch, max_len, 1) -> (batch, max_len)
    x1 = tf.keras.layers.Flatten()(x1)
    #softmax over the token positions
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    #second head (end position): same layer stack as x1, with its own weights
    x2 = tf.keras.layers.Dropout(Dropout_new)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Conv1D(64, 2, padding='same')(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)
    
    #collect info of model together
    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    #use Adam algorithm as optimizer. tried others, such as Adamax based on infinity norm--didn't work well
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr) 
    #compile with the custom loss_fn (applied to both outputs) and the Adam optimizer
    model.compile(loss=loss_fn, optimizer=optimizer)
    
    return model

In [17]:
#compute Jaccard Index (similarity scores)
def jaccard(str1, str2):
    """Jaccard similarity of the lowercase word sets of str1 and str2.

    Words are split on whitespace and compared case-insensitively.
    Returns 0.5 when neither string contains a word; otherwise
    |intersection| / |union| as a float.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

In [None]:
#accumulators for the fold-averaged start / end probabilities of every test row
preds_start = np.zeros((input_ids_t.shape[0], MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0], MAX_LEN))

#1==>print out
DISPLAY=1
#one saved model per fold; n_split is the fold count defined with the hyperparameters
#(the loop used to divide by an undefined name `n_splits`, which raised NameError)
for i in range(n_split):
    #banner marking the current fold
    print('#'*25)
    print('### MODEL %i'%(i+1))
    print('#'*25)
    
    #rebuild the network from scratch, then load the weights saved for this fold
    K.clear_session()
    model = build_model()
    model.load_weights('/kaggle/input/model4/v4-roberta-%i.h5'%i)

    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    #add this fold's share of the average; build_model trims its outputs to the
    #longest row, so they can be narrower than MAX_LEN - add them into the
    #matching leading columns and leave the remaining columns at zero
    for total, fold_pred in ((preds_start, preds[0]), (preds_end, preds[1])):
        total[:, :fold_pred.shape[1]] += fold_pred / n_split

## Sample Prediction <a class="anchor" id="4"></a>


In [None]:
#NOTE: `all` shadows the builtin, but the next cell reads this exact name
all = []
#turn the averaged probabilities into one predicted text per test row
for k in range(input_ids_t.shape[0]):
    #most probable start and end positions within the input row
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    #if start>end the span is invalid: fall back to the whole text
    if a>b: 
        st = test.loc[k,'text']
    #if start<=end:
    else:
        #normalise whitespace and add the leading space, exactly as when the row was encoded
        text1 = " "+" ".join(test.loc[k,'text'].split())
        #tokenize 
        enc = tokenizer.encode(text1)
        #rows are laid out as [0, sentiment id] + enc.ids + [2], so position p of a
        #row is enc.ids[p-2] and the inclusive span a..b is enc.ids[a-2:b-1]
        #(the previous slice enc.ids[a-1:b] was shifted one token to the right);
        #clamp at 0 so a position inside the two-token prefix cannot turn into a
        #negative index that counts from the end of the list
        first = max(a-2, 0)
        last = max(b-1, 0)
        st = tokenizer.decode(enc.ids[first:last])
    all.append(st)

In [None]:
#attach the predictions, write textID/selected_text to sample.csv, then show 10 random rows
test['selected_text'] = all
submission = test[['textID','selected_text']]
submission.to_csv('sample.csv',index=False)
test.sample(10)