# Binary Classification with Combined Classes on the Best-Performing Model (RoBERTa)

## Necessary Packages

In [2]:
import re

import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import RobertaTokenizer
from transformers import TFRobertaModel

## Dataset Loading

In [3]:

# Input locations and sampling settings, gathered in one place for easy tuning
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'
SAMPLE_FRACTION = 0.1  # share of the training rows kept for fine-tuning
SAMPLE_SEED = 42       # fixed seed so the same rows are drawn on every run

# Read the complete training set, then draw the reproducible subsample from it
data = pd.read_csv(TRAIN_CSV)
sampled_data = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

# The test set is read in full and kept for the final evaluation
test_data = pd.read_csv(TEST_CSV)


In [4]:
# Matches, in order of preference at each position: URLs starting with "http"
# or "www", @mentions, #hashtags, and finally any single character that is not
# an ASCII letter, digit, or whitespace. Compiled once at definition time
# instead of being looked up on every call.
_NOISE_PATTERN = re.compile(r'http\S+|www\S+|@\w+|#\w+|[^A-Za-z0-9\s]')


def preprocess_text(text):
    """Remove tweet noise from ``text`` and lowercase what remains.

    URLs, @mentions, #hashtags, and every character that is not an ASCII
    letter, digit, or whitespace are deleted. Whitespace is left untouched,
    so removed tokens can leave runs of consecutive spaces behind.

    Parameters
    ----------
    text : object
        The raw tweet. Anything that is not a ``str`` (e.g. ``None`` or a
        float NaN) is treated as missing.

    Returns
    -------
    str
        The cleaned, lowercased text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return _NOISE_PATTERN.sub('', text).lower()

# Applying preprocessing.
# Missing tweets are filled BEFORE the cast to str: casting first would turn
# NaN into the literal string 'nan', leaving fillna('') with nothing to fill
# and feeding the word "nan" to the tokenizer as if it were tweet text.
sampled_data['tweets'] = sampled_data['tweets'].fillna('').astype(str)
sampled_data['clean_tweets'] = sampled_data['tweets'].apply(preprocess_text)
test_data['tweets'] = test_data['tweets'].fillna('').astype(str)
test_data['clean_tweets'] = test_data['tweets'].apply(preprocess_text)

# Ensuring no NaN values in the 'class' column: missing labels are replaced
# with the sentinel -1, which matches none of the string class names
sampled_data['class'] = sampled_data['class'].fillna(-1)
test_data['class'] = test_data['class'].fillna(-1)
test_data['class']

0       figurative
1       figurative
2       figurative
3       figurative
4       figurative
           ...    
8123       sarcasm
8124       sarcasm
8125       sarcasm
8126       sarcasm
8127       sarcasm
Name: class, Length: 8128, dtype: object

In [6]:
# Collapsing the labels to two classes: 'sarcasm' becomes 1 and every other
# value in the 'class' column becomes 0. The comparison is done column-wise
# and the resulting booleans are cast to 64-bit integers.
sampled_data['binary_class'] = sampled_data['class'].eq('sarcasm').astype('int64')
test_data['binary_class'] = test_data['class'].eq('sarcasm').astype('int64')
test_data['binary_class']

0       0
1       0
2       0
3       0
4       0
       ..
8123    1
8124    1
8125    1
8126    1
8127    1
Name: binary_class, Length: 8128, dtype: int64

In [7]:

# Initializing the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 50


def _encode_texts(texts):
    """Tokenize a list of strings into TensorFlow tensors.

    Each sequence is padded or truncated to exactly ``max_length`` tokens so
    that the training and test sets share one fixed input shape.
    """
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )


# Tokenizing the training and test data with identical settings
X_train_tokens = _encode_texts(sampled_data['clean_tweets'].tolist())
X_test_tokens = _encode_texts(test_data['clean_tweets'].tolist())

# Extracting the labels as arrays aligned row-for-row with the token tensors
y_train = sampled_data['binary_class'].values
y_test = test_data['binary_class'].values
y_test

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [21]:


# Training hyperparameters, named so they can be tuned in one place
LEARNING_RATE = 2e-5
EPOCHS = 3
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.1

# Seed TensorFlow's global random generator before any layer is created so
# that the new Dense head's initial weights, dropout, and batch shuffling are
# repeatable from run to run. (Some GPU kernels may still be nondeterministic.)
tf.random.set_seed(42)

# Defining the RoBERTa-based model: two integer inputs of length max_length,
# named to match the keys of the dictionaries passed to fit() below
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

roberta_model = TFRobertaModel.from_pretrained('roberta-base')
# Element 0 of the model output holds the per-token hidden states
roberta_output = roberta_model(input_ids, attention_mask=attention_mask)[0]
cls_token = roberta_output[:, 0, :]  # Extract the CLS token (sequence position 0)

output = Dense(1, activation='sigmoid')(cls_token)  # Single unit for binary classification
roberta_classifier = Model(inputs=[input_ids, attention_mask], outputs=output)

roberta_classifier.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training the model; a fraction of the training samples (VALIDATION_SPLIT)
# is held out by Keras and used only for validation metrics
history = roberta_classifier.fit(
    {'input_ids': X_train_tokens['input_ids'], 'attention_mask': X_train_tokens['attention_mask']},
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [22]:


# Running the fine-tuned classifier on the tokenized test set
test_inputs = {
    'input_ids': X_test_tokens['input_ids'],
    'attention_mask': X_test_tokens['attention_mask'],
}
predictions = roberta_classifier.predict(test_inputs)

# Thresholding the sigmoid outputs at 0.5 and collapsing the result to a
# 1D vector of 0/1 labels
y_pred = (predictions > 0.5).astype(int).reshape(-1)

# Building the per-class report; label 0 is shown as 'regular' and label 1
# as 'sarcasm'
report = classification_report(y_test, y_pred, target_names=['regular', 'sarcasm'])
print("RoBERTa Classification Report:\n", report)


RoBERTa Classification Report:
               precision    recall  f1-score   support

     regular       0.81      0.89      0.85      6023
     sarcasm       0.56      0.41      0.47      2105

    accuracy                           0.76      8128
   macro avg       0.69      0.65      0.66      8128
weighted avg       0.75      0.76      0.75      8128

