In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

import tensorflow as tf
from tensorflow import keras


from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df= pd.read_csv('../data/processed_twitter_data.csv')
print(df.head())

                                      processed_text  sentiment
0  USER URL aww that bummer you shoulda got david...          0
1  is upset that he can update his facebook by te...          0
2  USER dived many time for the ball managed to s...          0
3      my whole body feel itchy and like it on fire           0
4  USER no it not behaving at all mad why am here...          0


In [3]:
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['sentiment'], test_size=0.2, random_state=0)
print(f'Data Split into Training and Test sets done.')

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
print(f'Training set split into Training and Validation sets done.')

Data Split into Training and Test sets done.
Training set split into Training and Validation sets done.


In [5]:
MAX_LEN=100

In [6]:
def tokenize_roberta(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_roberta.encode_plus(
            data.iloc[i],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [7]:
train_input_ids, train_attention_masks = tokenize_roberta(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize_roberta(X_valid, MAX_LEN)
test_input_ids, test_attention_masks = tokenize_roberta(X_test, MAX_LEN)

In [None]:
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

In [9]:
def create_model(bert_model, max_len=MAX_LEN):
    
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss = tf.keras.losses.BinaryCrossentropy()
    accuracy = tf.keras.metrics.BinaryAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    output = roberta_model([input_ids,attention_masks])[1]
    output = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(output)
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(opt, loss=loss, metrics=accuracy)
    return model

In [10]:
model = create_model(roberta_model, MAX_LEN)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 tf_roberta_model_1 (TFRobe  TFBaseModelOutputWithPooli   1246456   ['input_1[0][0]',             
 rtaModel)                   ngAndCrossAttentions(last_   32         'input_2[0][0]']             
                             hidden_state=(None, 100, 7                                           
                             68),                                                             

In [11]:
roberta_mod = model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids,val_attention_masks], y_valid), epochs=3, batch_size=64)

Epoch 1/3


I0000 00:00:1727501098.689934      72 service.cc:145] XLA service 0x7bee26f33b90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727501098.689991      72 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1727501098.856582      72 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [12]:
import pickle

with open('Sentiment-ROBERTa.pickle', 'wb') as file:
    pickle.dump(roberta_mod, file)



In [13]:
model.save('roberta_sentiment_model.h5')

  saving_api.save_model(


In [14]:
result_roberta = model.predict([test_input_ids,test_attention_masks])



In [16]:
y_pred_roberta = (result_roberta > 0.5).astype(int)

In [17]:
y_pred_roberta = y_pred_roberta.flatten()

In [18]:
conf_matrix = confusion_matrix(y_test, y_pred_roberta)
print('Confusion Matrix:\n', conf_matrix)

Confusion Matrix:
 [[142634  17181]
 [ 26783 133402]]


In [19]:
print('\tClassification Report for ROBERTa:\n\n', classification_report(y_test, y_pred_roberta, target_names=['Negative', 'Positive']))

	Classification Report for ROBERTa:

               precision    recall  f1-score   support

    Negative       0.84      0.89      0.87    159815
    Positive       0.89      0.83      0.86    160185

    accuracy                           0.86    320000
   macro avg       0.86      0.86      0.86    320000
weighted avg       0.86      0.86      0.86    320000



In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

accuracy = accuracy_score(y_test, y_pred_roberta)

precision = precision_score(y_test, y_pred_roberta)
recall = recall_score(y_test, y_pred_roberta)
f1 = f1_score(y_test, y_pred_roberta)

print('ROBERTa Model Metrics')
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

ROBERTa Model Metrics
Accuracy: 0.8626
Precision: 0.8859
Recall: 0.8328
F1 Score: 0.8585
