In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

In [None]:
import nltk
# Fetch every NLTK resource this notebook reads:
#   'punkt'     -> word_tokenize
#   'wordnet'   -> WordNetLemmatizer
#   'stopwords' -> stopwords.words('english')
# Without the 'stopwords' corpus, a fresh environment fails with LookupError
# as soon as the stop-word list is built.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\purva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\purva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Data read: load the pre-processed ISEAR CSV (path is relative to the
# notebook's working directory). The frame exposes 'cleaned_text' and 'labels'.
df=pd.read_csv("isear_processed_dataset.csv")

In [None]:
# Display the loaded frame to inspect its columns and a few rows.
df

Unnamed: 0,cleaned_text,labels
0,during the period of falling in love each time...,joy
1,when i was involved in a traffic accident,fear
2,when i was driving home after several days of ...,anger
3,when i lost the person who meant the most to me,sadness
4,the time i knocked a deer down the sight of th...,disgust
...,...,...
7661,two years back someone invited me to be the tu...,anger
7662,i had taken the responsibility to do something...,sadness
7663,i was at home and i heard a loud sound of spit...,disgust
7664,i did not do the homework that the teacher had...,shame


In [None]:
# Drop rows whose 'cleaned_text' is missing; they cannot be tokenized later.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['cleaned_text']).copy()
df

Unnamed: 0,cleaned_text,labels
0,during the period of falling in love each time...,joy
1,when i was involved in a traffic accident,fear
2,when i was driving home after several days of ...,anger
3,when i lost the person who meant the most to me,sadness
4,the time i knocked a deer down the sight of th...,disgust
...,...,...
7661,two years back someone invited me to be the tu...,anger
7662,i had taken the responsibility to do something...,sadness
7663,i was at home and i heard a loud sound of spit...,disgust
7664,i did not do the homework that the teacher had...,shame


## Text Preprocessing
- Tokenization: Splitting the text into individual words or tokens
- Lemmatization: Reducing words to their base form
- Stop Word Removal: Removing stop words that carry little information for training, e.g. "the", "is", "and"

In [None]:
# Shared resources for the preprocessing function defined in the next cell.
lemmatizer = WordNetLemmatizer()
# Stored as a set so each membership test in the token loop is a hash lookup.
# NOTE(review): the processed output shows negations being removed
# ("i did not do the homework ..." -> "homework teacher asked ..."); confirm
# that dropping them is intended for emotion classification.
stop_words = set(stopwords.words('english'))

In [None]:
def tokenize_lemmatize_remove_stopwords(text):
    """Tokenize `text`, drop stop words, lemmatize the rest and re-join them.

    Args:
        text: A single string to preprocess.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept_lemmas = []
    for word in word_tokenize(text):  # Tokenization
        # Stop words are filtered on the raw token, before lemmatization.
        if word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))  # Lemmatization

    # Join tokens back into one string
    return ' '.join(kept_lemmas)


In [None]:
# Apply tokenize_lemmatize_remove_stopwords() to every 'cleaned_text' entry.
# assign() builds a new frame instead of writing a column into the filtered
# one, which avoids pandas' SettingWithCopyWarning and leaves any previously
# displayed frame untouched.
df = df.assign(
    processed_text=df['cleaned_text'].apply(tokenize_lemmatize_remove_stopwords)
)

In [None]:
# Display the frame to compare 'cleaned_text' with the new 'processed_text'.
df

Unnamed: 0,cleaned_text,labels,processed_text
0,during the period of falling in love each time...,joy,period falling love time met especially met lo...
1,when i was involved in a traffic accident,fear,involved traffic accident
2,when i was driving home after several days of ...,anger,driving home several day hard work motorist ah...
3,when i lost the person who meant the most to me,sadness,lost person meant
4,the time i knocked a deer down the sight of th...,disgust,time knocked deer sight animal injury helpless...
...,...,...,...
7661,two years back someone invited me to be the tu...,anger,two year back someone invited tutor granddaugh...
7662,i had taken the responsibility to do something...,sadness,taken responsibility something prepared howeve...
7663,i was at home and i heard a loud sound of spit...,disgust,home heard loud sound spitting outside door th...
7664,i did not do the homework that the teacher had...,shame,homework teacher asked u scolded immediately


In [None]:
# Start from an empty frame; the model-ready columns are added in the next cell.
new_df=pd.DataFrame()

In [None]:
# Keep only the model inputs: the processed text (exposed as "text") and the
# emotion label. Row index and column order match the source frame.
new_df = df[["processed_text", "labels"]].rename(columns={"processed_text": "text"})

In [None]:
# Display the two-column frame ('text', 'labels') used for modelling.
new_df

Unnamed: 0,text,labels
0,period falling love time met especially met lo...,joy
1,involved traffic accident,fear
2,driving home several day hard work motorist ah...,anger
3,lost person meant,sadness
4,time knocked deer sight animal injury helpless...,disgust
...,...,...
7661,two year back someone invited tutor granddaugh...,anger
7662,taken responsibility something prepared howeve...,sadness
7663,home heard loud sound spitting outside door th...,disgust
7664,homework teacher asked u scolded immediately,shame


In [None]:
# One-hot encode the emotion label: 'labels' is replaced by one
# 'labels_<emotion>' indicator column per class (seven in the output below).
# NOTE(review): the indicators are displayed as True/False, i.e. boolean;
# downstream code that needs numeric targets must cast them.
new_df = pd.get_dummies(new_df, columns=['labels'])

In [None]:
# Display the one-hot encoded frame.
new_df

Unnamed: 0,text,labels_anger,labels_disgust,labels_fear,labels_guilt,labels_joy,labels_sadness,labels_shame
0,period falling love time met especially met lo...,False,False,False,False,True,False,False
1,involved traffic accident,False,False,True,False,False,False,False
2,driving home several day hard work motorist ah...,True,False,False,False,False,False,False
3,lost person meant,False,False,False,False,False,True,False
4,time knocked deer sight animal injury helpless...,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...
7661,two year back someone invited tutor granddaugh...,True,False,False,False,False,False,False
7662,taken responsibility something prepared howeve...,False,False,False,False,False,True,False
7663,home heard loud sound spitting outside door th...,False,True,False,False,False,False,False
7664,homework teacher asked u scolded immediately,False,False,False,False,False,False,True


## Data Preparation for Model Training: Splitting, Tokenization, Encoding and Label Extraction
- Splitting the data
    Train size: 80%, Test size: 10%, Validation size: 10%
- Tokenization 
- Encoding: The tokenized text data is transformed into tensors. The "input_ids" represent the tokenized words, and the "attention_mask" indicates which tokens should be attended to or ignored (e.g., padding tokens)
- Label Extraction

In [None]:
# Load the tokenizer that matches the 'roberta-base' checkpoint used by the
# models below, so token ids line up with the encoder's vocabulary.
# NOTE(review): this tokenizer is applied to stop-word-stripped, lemmatized
# text; confirm that is preferable to feeding the original sentences.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
from sklearn.model_selection import train_test_split

# 80 % train; the remaining 20 % is halved into validation and test (10 % each).
# The same seed is used for both calls so the split is reproducible.
train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Tokenize each split. padding=True pads to the longest sequence *within that
# split*, so the three splits may have different sequence lengths; the models
# accept variable-length input, so this is fine.
train_encoded = tokenizer(train_df['text'].tolist(), padding=True, truncation=True, return_tensors='tf')
val_encoded = tokenizer(val_df['text'].tolist(), padding=True, truncation=True, return_tensors='tf')
test_encoded = tokenizer(test_df['text'].tolist(), padding=True, truncation=True, return_tensors='tf')

# Keep only the two tensors the models consume.
train_inputs = {'input_ids': train_encoded['input_ids'], 'attention_mask': train_encoded['attention_mask']}
val_inputs = {'input_ids': val_encoded['input_ids'], 'attention_mask': val_encoded['attention_mask']}
test_inputs = {'input_ids': test_encoded['input_ids'], 'attention_mask': test_encoded['attention_mask']}

# Select the one-hot columns by name rather than by position, so an extra or
# reordered column cannot silently end up in the targets, and cast them from
# bool to float32, the dtype of the softmax output they are compared against.
label_columns = [col for col in new_df.columns if col.startswith('labels_')]
train_labels = train_df[label_columns].to_numpy(dtype=np.float32)
val_labels = val_df[label_columns].to_numpy(dtype=np.float32)
test_labels = test_df[label_columns].to_numpy(dtype=np.float32)


In [None]:
# Sanity check: row counts per split and that each label array has one row per
# example and one column per emotion class.
train_df.shape, test_df.shape, val_df.shape, train_labels.shape, test_labels.shape, val_labels.shape

((6010, 8), (752, 8), (751, 8), (6010, 7), (752, 7), (751, 7))

In [None]:
BATCH_SIZE = 32
SHUFFLE_SEED = 42

# Keras ignores fit(shuffle=...) for tf.data inputs, so the training examples
# are reshuffled here. The buffer spans the whole training set, and the
# default reshuffle_each_iteration=True gives a new order every epoch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_inputs, train_labels))
    .shuffle(buffer_size=len(train_labels), seed=SHUFFLE_SEED)
    .batch(BATCH_SIZE)
)
# Validation and test data keep their order; only batching is applied.
val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_labels)).batch(BATCH_SIZE)


In [None]:
# Peek at a single batch: take(1) yields only the first element, so no
# explicit break is needed.
for batch in train_dataset.take(1):
    features = batch[0]  # element 0 is the input dict, element 1 the labels
    input_ids = features['input_ids']
    attention_mask = features['attention_mask']

    print("Input IDs:\n", input_ids.numpy())
    print("Attention Mask:\n", attention_mask.numpy())


Input IDs:
 [[    0 41642   213 ...     1     1     1]
 [    0  1208 13356 ...     1     1     1]
 [    0  1397  1291 ...     1     1     1]
 ...
 [    0 19010  5537 ...     1     1     1]
 [    0 31828   428 ...     1     1     1]
 [    0 37945 24320 ...     1     1     1]]
Attention Mask:
 [[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]


In [None]:
# Display the training targets (one one-hot row per example).
train_labels

array([[False, False, False, ...,  True, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False,  True, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [None]:
# Display the dataset's element_spec to check tensor shapes and dtypes.
train_dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 83), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 83), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 7), dtype=tf.bool, name=None))>

## Roberta-bilstm

In [None]:
def create_model():
    """Build and compile the RoBERTa + bidirectional-LSTM emotion classifier.

    The pretrained 'roberta-base' encoder produces one hidden vector per
    token. A bidirectional LSTM summarises that sequence, followed by a dense
    layer, dropout and a 7-way softmax.

    Returns:
        A compiled tf.keras Model with inputs 'input_ids' and
        'attention_mask' and a 7-class probability output.
    """
    roberta_model = TFRobertaModel.from_pretrained('roberta-base')

    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

    # Element 0 of the encoder output is the per-token hidden-state sequence.
    roberta_output = roberta_model(input_ids, attention_mask=attention_mask)[0]

    # Pass the attention mask to the recurrent layer so padded positions are
    # skipped. Without it the forward LSTM's final state is taken at the last
    # padding token rather than at the last real token.
    padding_mask = tf.cast(attention_mask, tf.bool)
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(256, return_sequences=False)
    )(roberta_output, mask=padding_mask)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    output = tf.keras.layers.Dense(7, activation='softmax')(x)

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    # Small learning rate: the pretrained encoder is fine-tuned along with the head.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Instantiate the RoBERTa-BiLSTM model and print its layer summary.
model = create_model()
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_roberta_model_1 (TFRobe  TFBaseModelOutputWithPooli   1246456   ['input_ids[0][0]',           
 rtaModel)                   ngAndCrossAttentions(last_   32         'attention_mask[0][0]']      
                             hidden_state=(None, None,                                        

In [None]:
# Number of full passes over the training set.
epochs = 3

# Fine-tune the BiLSTM model; validation metrics are computed on val_dataset
# after each epoch and the per-epoch results are kept in `history`.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs
)


Epoch 1/3


Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate on the held-out test split. evaluate() returns the loss followed by
# the metrics passed to compile(), here accuracy.
test_loss, test_accuracy = model.evaluate(test_dataset)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


Test Loss: 0.23011678457260132
Test Accuracy: 0.671447217464447


## Roberta-lstm

In [None]:
def create_model():
    """Build and compile the RoBERTa + unidirectional-LSTM emotion classifier.

    The pretrained 'roberta-base' encoder feeds a single LSTM, followed by two
    dense/dropout stages and a 7-way softmax.

    NOTE(review): this reuses the name `create_model` already defined in the
    Roberta-bilstm section, so running this cell replaces that definition for
    the rest of the session. Consider a distinct name such as
    `create_model_lstm`.

    Returns:
        A compiled tf.keras Model with inputs 'input_ids' and
        'attention_mask' and a 7-class probability output.
    """
    roberta_model = TFRobertaModel.from_pretrained('roberta-base')

    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

    # Element 0 of the encoder output is the per-token hidden-state sequence.
    roberta_output = roberta_model(input_ids, attention_mask=attention_mask)[0]

    # Pass the attention mask so the LSTM skips padded positions. Without it
    # the returned final state is computed at the last padding token instead
    # of the last real token.
    padding_mask = tf.cast(attention_mask, tf.bool)
    x = tf.keras.layers.LSTM(128, return_sequences=False)(roberta_output, mask=padding_mask)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(300, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    output = tf.keras.layers.Dense(7, activation='softmax')(x)

    lstm_model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    # Small learning rate: the pretrained encoder is fine-tuned along with the head.
    lstm_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return lstm_model

# Instantiate the RoBERTa-LSTM model and print its layer summary.
model_l = create_model()
model_l.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_roberta_model_1 (TFRobe  TFBaseModelOutputWithPooli   1246456   ['input_ids[0][0]',           
 rtaModel)                   ngAndCrossAttentions(last_   32         'attention_mask[0][0]']      
                             hidden_state=(None, None,                                      

In [None]:
# Number of full passes over the training set.
epochs = 3

# Fine-tune the LSTM model; validation metrics are computed on val_dataset
# after each epoch and the per-epoch results are kept in `history_l`.
history_l = model_l.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs
)


Epoch 1/3


Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the LSTM model on the held-out test split (loss, then accuracy).
test_loss_l, test_accuracy_l = model_l.evaluate(test_dataset)

print(f"Test Loss: {test_loss_l}")
print(f"Test Accuracy: {test_accuracy_l}")


Test Loss: 0.25692036747932434
Test Accuracy: 0.6323337554931641


## Roberta-gru

In [None]:
def create_model_gr():
    """Build and compile the RoBERTa + GRU emotion classifier.

    The pretrained 'roberta-base' encoder feeds a single GRU, followed by a
    dense layer, dropout and a 7-way softmax.

    Returns:
        A compiled tf.keras Model with inputs 'input_ids' and
        'attention_mask' and a 7-class probability output.
    """
    roberta_model = TFRobertaModel.from_pretrained('roberta-base')

    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

    # Element 0 of the encoder output is the per-token hidden-state sequence.
    roberta_output = roberta_model(input_ids, attention_mask=attention_mask)[0]

    # Pass the attention mask so the GRU skips padded positions. Without it
    # the returned final state is computed at the last padding token instead
    # of the last real token.
    padding_mask = tf.cast(attention_mask, tf.bool)
    x = tf.keras.layers.GRU(128, return_sequences=False)(roberta_output, mask=padding_mask)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    output = tf.keras.layers.Dense(7, activation='softmax')(x)

    gru_model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    # Small learning rate: the pretrained encoder is fine-tuned along with the head.
    gru_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return gru_model

# Instantiate the RoBERTa-GRU model and print its layer summary.
model_gr = create_model_gr()
model_gr.summary()





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_roberta_model (TFRobert  TFBaseModelOutputWithPooli   1246456   ['input_ids[0][0]',           
 aModel)                     ngAndCrossAttentions(last_   32         'attention_mask[0][0]']      
                             hidden_state=(None, None,                                        

In [None]:
# Number of full passes over the training set.
epochs = 3

# Fine-tune the GRU model; validation metrics are computed on val_dataset
# after each epoch and the per-epoch results are kept in `history_gr`.
history_gr = model_gr.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs
)


Epoch 1/3


Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the GRU model on the held-out test split (loss, then accuracy).
test_loss_gr, test_accuracy_gr = model_gr.evaluate(test_dataset)

print(f"Test Loss: {test_loss_gr}")
print(f"Test Accuracy: {test_accuracy_gr}")


Test Loss: 1.1445326805114746
Test Accuracy: 0.6502659320831299
