In [1]:
 !pip install --quiet datasets

In [3]:
import torch
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import os
from accelerate import Accelerator
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-large-emotion-latest", device_map="cuda:0")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-large-emotion-latest", device_map="cuda:0")
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [5]:
# Freezing the base model
for param in model.parameters():
    param.requires_grad = False
# Unfreeze the last classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True

In [6]:
pwd

'/hdd_storage/data/riyansha/NeuralSymbolicRegressionThatScales/emotion_class'

In [7]:
from google.colab import drive
drive.mount('/content/drive')

In [8]:
df = pd.read_csv('/content/drive/MyDrive/eng.csv')


In [9]:
type(df.iloc[0]['Anger'])

numpy.int64

In [10]:
# # List of emotions in the GoEmotions dataset
# emotion_labels = [
#     "admiration", "amusement", "anger", "annoyance", "approval",
#     "caring", "confusion", "curiosity", "desire", "disappointment",
#     "disapproval", "disgust", "embarrassment", "excitement", "fear",
#     "gratitude", "grief", "joy", "love", "nervousness",
#     "optimism", "pride", "realization", "relief", "remorse",
#     "sadness", "surprise", "neutral"
# ]

# # Example: Finding the emotion name for label number 3
# label_number = 3  # Using 0-based index for label number
# emotion_name = emotion_labels[label_number]
# print("The emotion is:", emotion_name)

# Setting the labels of our dataset in the same sequence as above
df['labels'] = df.apply(lambda row:[float(row['Anger']), float(row['Fear']), float(row['Joy']), float(row['Sadness']), float(row['Surprise'])], axis=1)
df = df.drop(columns=['id','Joy','Fear','Anger','Sadness','Surprise'])
df

Unnamed: 0,text,labels
0,But not very happy.,"[0.0, 0.0, 1.0, 1.0, 0.0]"
1,Well she's not gon na last the whole song like...,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,She sat at her Papa's recliner sofa only to mo...,"[0.0, 0.0, 0.0, 0.0, 0.0]"
3,"Yes, the Oklahoma city bombing.","[1.0, 1.0, 0.0, 1.0, 1.0]"
4,They were dancing to Bolero.,"[0.0, 0.0, 1.0, 0.0, 0.0]"
...,...,...
2763,"""Yeah, but did you just find that?","[0.0, 1.0, 0.0, 0.0, 1.0]"
2764,I did as little as possible with my right hand...,"[0.0, 0.0, 0.0, 0.0, 0.0]"
2765,"Okay that sucks, right?","[1.0, 0.0, 0.0, 1.0, 0.0]"
2766,"The spark leaped through his body into mine, a...","[0.0, 1.0, 0.0, 0.0, 1.0]"


In [11]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)  # Use only the desired columns

# Optionally, print the dataset to check
print(dataset)

Dataset({
    features: ['text', 'labels'],
    num_rows: 2768
})


In [12]:
def preprocess_function(examples):
    # Tokenize the text
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

    # Convert labels to torch float to match the expected input type for BCEWithLogitsLoss
    inputs["labels"] = [torch.tensor(label, dtype=torch.float) for label in examples["labels"]]

    return inputs

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset

Map: 100%|██████████| 2768/2768 [00:00<00:00, 8236.76 examples/s]


Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2768
})

In [13]:
print(len(encoded_dataset[0]["labels"]))

5


In [14]:
encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [15]:
print(encoded_dataset[0]["labels"])  # Should output a tensor of floats: e.g., tensor([1., 0., 0., ..., 0., 1.])


tensor([0., 0., 1., 1., 0.])


In [16]:
dataset = encoded_dataset.train_test_split(test_size=0.2)

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2214
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 554
    })
})

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    print(f"LOgits shape:  {logits.shape}")
    # Sigmoid is typically used for multi-label classification to get probabilities
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    # Convert probabilities to binary predictions (0 or 1)
    predictions = (probs > 0.5).astype(int)

    # Calculate metrics
    f1 = f1_score(labels, predictions, average='macro', zero_division=1)
    precision = precision_score(labels, predictions, average='macro', zero_division=1)
    recall = recall_score(labels, predictions, average='macro', zero_division=1)
    accuracy = accuracy_score(labels, predictions)  # Multi-label accuracy

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [19]:
model.config.output_hidden_states = True

class ModifiedModel(torch.nn.Module):
    def __init__(self, original_model):
        super(ModifiedModel, self).__init__()
        self.original_model = original_model
        self.fc = torch.nn.Linear(11, 5)  # New FC layer to map 11 -> 5 labels

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.original_model(input_ids, attention_mask=attention_mask)
        
        last_hidden_state = outputs.hidden_states[-1]  # Shape: [batch_size, seq_length, hidden_size]
        # mean pooling over the sequence dimension (dim=1) to get the fixed-length embedding
        embeddings = last_hidden_state.mean(dim=1)  # [batch_size, hidden_size]
        logits = outputs.logits  # [batch_size, 11]
        logits = self.fc(logits)  # [batch_size, 5]
        if labels is not None:
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels.float())
            return {"loss": loss, "logits": logits, "last_hidden_state": last_hidden_state}

        return {"logits": logits, "last_hidden_state": last_hidden_state, "embeddings": embeddings}

modified_model = ModifiedModel(model)

In [20]:
from transformers import TrainingArguments, Trainer
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="no",
    save_steps=0,
    logging_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"   # Disable wandb logging if not needed
)

# Initialize Trainer
trainer = Trainer(
    model=modified_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [21]:

# modified_model = modified_model.to(device)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3984,0.362667,0.427798,0.692178,0.738925,0.665095
2,0.3436,0.347212,0.440433,0.689555,0.758857,0.640394
3,0.3264,0.337299,0.463899,0.710396,0.758916,0.673904
4,0.3173,0.336092,0.463899,0.721577,0.754139,0.695651
5,0.3101,0.333211,0.454874,0.712287,0.754452,0.679632


LOgits shape:  (554, 5)
LOgits shape:  (554, 5)
LOgits shape:  (554, 5)
LOgits shape:  (554, 5)
LOgits shape:  (554, 5)


TrainOutput(global_step=1385, training_loss=0.3391734925418124, metrics={'train_runtime': 173.1817, 'train_samples_per_second': 63.921, 'train_steps_per_second': 7.997, 'total_flos': 0.0, 'train_loss': 0.3391734925418124, 'epoch': 5.0})

##Using the trained model to get predictions on the train and dev sets

In [22]:
df_val = pd.read_csv('/content/drive/MyDrive/eng_a.csv')

df_val

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,"My mouth fell open `` No, no, no... I..",,,,,
1,eng_dev_track_a_00002,You can barely make out your daughter's pale f...,,,,,
2,eng_dev_track_a_00003,But after blinking my eyes for a few times lep...,,,,,
3,eng_dev_track_a_00004,Slowly rising to my feet I came to the conclus...,,,,,
4,eng_dev_track_a_00005,I noticed this months after moving in and doin...,,,,,
...,...,...,...,...,...,...,...
111,eng_dev_track_a_00112,"""ARcH stop your progression.",,,,,
112,eng_dev_track_a_00113,"This 'star', starts to move across the sky.",,,,,
113,eng_dev_track_a_00114,and my feet hurt.,,,,,
114,eng_dev_track_a_00115,so i cried my eyes out and did the drawing.,,,,,


In [23]:
df_val = df_val.drop(columns=['id','Joy','Fear','Anger','Sadness','Surprise'])
df_val

Unnamed: 0,text
0,"My mouth fell open `` No, no, no... I.."
1,You can barely make out your daughter's pale f...
2,But after blinking my eyes for a few times lep...
3,Slowly rising to my feet I came to the conclus...
4,I noticed this months after moving in and doin...
...,...
111,"""ARcH stop your progression."
112,"This 'star', starts to move across the sky."
113,and my feet hurt.
114,so i cried my eyes out and did the drawing.


In [24]:
val_dataset = Dataset.from_pandas(df_val)  # Use only the desired columns

# Optionally, print the dataset to check
print(val_dataset)

Dataset({
    features: ['text'],
    num_rows: 116
})


In [25]:
def preprocess_function_val(examples):
    # Tokenize the text
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

    # Convert labels to torch float to match the expected input type for BCEWithLogitsLoss
    # inputs["labels"] = [torch.tensor(label, dtype=torch.float) for label in examples["labels"]]

    return inputs

encoded_dataset_val = val_dataset.map(preprocess_function_val, batched=True)
encoded_dataset_val

Map: 100%|██████████| 116/116 [00:00<00:00, 6468.65 examples/s]


Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 116
})

In [26]:
encoded_dataset_val.set_format("torch", columns=["input_ids", "attention_mask"])
encoded_dataset_val

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 116
})

In [27]:
val_training_args = TrainingArguments(
    output_dir="./results",
    # per_device_eval_batch_size=8,
    # num_train_epochs=80,
    # weight_decay=0.01,
    report_to="none"   # Disable wandb logging if not needed
)
val_trainer = Trainer(
    model=modified_model,  # model is your trained model
    args=val_training_args,
    tokenizer=tokenizer
)

# Run predictions
predictions = val_trainer.predict(encoded_dataset_val)
pred_logits = predictions.predictions
if isinstance(pred_logits, (tuple, list)):
        pred_logits = pred_logits[0]
print(f"LOgits shape:  {pred_logits.shape}")

  val_trainer = Trainer(


LOgits shape:  (116, 5)


In [28]:
pred_labels = (torch.sigmoid(torch.tensor(pred_logits)) > 0.5).int()

pred_labels

In [29]:
df_val = pd.DataFrame(pred_labels, columns=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'])
df_val

Unnamed: 0,Anger,Fear,Joy,Sadness,Surprise
0,0,1,0,0,1
1,0,1,0,0,1
2,0,1,0,0,1
3,0,1,0,1,0
4,0,1,0,0,0
...,...,...,...,...,...
111,0,1,0,0,0
112,0,0,1,0,1
113,0,1,0,1,0
114,0,1,0,1,0


In [30]:
df_val.insert(loc=0, column='id', value='text')
df_val['id'] = [f'eng_dev_track_a_{i+1:05}' for i in range(len(df_val))]
df_val

Unnamed: 0,id,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,0,1,0,0,1
1,eng_dev_track_a_00002,0,1,0,0,1
2,eng_dev_track_a_00003,0,1,0,0,1
3,eng_dev_track_a_00004,0,1,0,1,0
4,eng_dev_track_a_00005,0,1,0,0,0
...,...,...,...,...,...,...
111,eng_dev_track_a_00112,0,1,0,0,0
112,eng_dev_track_a_00113,0,0,1,0,1
113,eng_dev_track_a_00114,0,1,0,1,0
114,eng_dev_track_a_00115,0,1,0,1,0


In [31]:
df_val.to_csv('my_pred_after_FCLayer_and_lastlayer_twitterRoberta.csv', index=False)

In [32]:
df_XED = pd.read_csv('/content/drive/MyDrive/en-annotated.tsv', delimiter='\t')
df_XED

Unnamed: 0,", ...",1
0,!,"1, 4, 7"
1,... And I don't think we need to discuss the T...,"8, 1"
2,* So get up out of your bed,1
3,A confession that you hired [PERSON] ... and a...,"1, 6"
4,A dead man has one half - hour to raise his ro...,1
...,...,...
17522,Your opinion might be valuable .,8
17523,Your orders .,8
17524,Your ship's been in lots of battles .,8
17525,"Your wine , your Majesty .",8


In [33]:
label_XED = {0: "neutral", 1: "anger", 2: "anticipation", 3: "disgust", 4: "fear", 5: "joy", 6: "sadness", 7: "surprise", 8: "trust"}

def decode_labels(numeric_labels):
    return [label_XED[label] for label in numeric_labels]

In [34]:

df_XED.columns = ['text', 'labels']
df_XED['labels'] = df_XED['labels'].apply(lambda x: [int(num) for num in x.split(',')])

# df_eval = df_val_test.drop(columns=)
# df_eval


In [35]:
import numpy as np

num_classes = 9  

def generate_one_hot(labels, num_classes):
    one_hot = np.zeros(num_classes, dtype=float)
    for label in labels:
        one_hot[label] = 1
    return one_hot


df_XED['label'] = df_XED['labels'].apply(lambda x: generate_one_hot(x, num_classes))

In [36]:
df_XED = df_XED.drop(columns=['labels'])
df_XED

Unnamed: 0,text,label
0,!,"[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]"
1,... And I don't think we need to discuss the T...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
2,* So get up out of your bed,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,A confession that you hired [PERSON] ... and a...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,A dead man has one half - hour to raise his ro...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...
17522,Your opinion might be valuable .,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
17523,Your orders .,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
17524,Your ship's been in lots of battles .,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
17525,"Your wine , your Majesty .","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"


In [37]:
df_XED = df_XED.rename(columns={'label': 'labels'})

In [38]:
import torch


torch.save(modified_model, '/content/drive/MyDrive/Roberta_modified.pth')
tokenizer.save_pretrained('./model_tokenizer')


('./model_tokenizer/tokenizer_config.json',
 './model_tokenizer/special_tokens_map.json',
 './model_tokenizer/vocab.json',
 './model_tokenizer/merges.txt',
 './model_tokenizer/added_tokens.json',
 './model_tokenizer/tokenizer.json')

In [39]:
from transformers import AutoModel
from sklearn.metrics.pairwise import cosine_similarity


In [40]:
# Load the model
model_trained = torch.load("/content/drive/MyDrive/Roberta_modified.pth")
tokenizer = AutoTokenizer.from_pretrained('./model_tokenizer')
# model_trained.eval()


  model_trained = torch.load("/hdd_storage/data/riyansha/NeuralSymbolicRegressionThatScales/emotion_class/Roberta_modified.pth")


In [41]:
# Convert the 'Text' column to a list of sentences
text_traindata = df["text"].tolist()
text_XED = df_XED["text"].tolist()


In [42]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [43]:
# model_load = ModifiedModel(model)
# model_load = model_load.load_state_dict(torch.load("/hdd_storage/data/riyansha/NeuralSymbolicRegressionThatScales/emotion_class/Roberta_modified.pth"))

# Move the model to the device
model_trained = model_trained.to(device)


In [44]:
import numpy as np

def get_sentence_embedding(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Move the inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model outputs (with no gradients)
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs['embeddings'] 

    return embeddings.squeeze().cpu().numpy()  # Flatten for cosine similarity


In [45]:

embeddings_traindata = np.array([get_sentence_embedding(text, model_trained, tokenizer, device) for text in text_traindata])
embeddings_XED = np.array([get_sentence_embedding(text, model_trained, tokenizer, device ) for text in text_XED])


In [46]:
# Check the shape of your embeddings
print(embeddings_traindata.shape)  
print(embeddings_XED.shape) 


(2768, 1024)
(17527, 1024)


In [47]:
from sklearn.metrics.pairwise import cosine_similarity


# Ensure embeddings are 2D
embeddings_traindata_2d = np.array(embeddings_traindata)  # Shape should be (2768, embedding_size)
embeddings_newdata_2d = np.array(embeddings_XED)      # Shape should be (34792, embedding_size)

# Check the size of a single embedding
print(np.array(embeddings_traindata[0]).shape)

# Calculate pairwise cosine similarity between all samples in both datasets
similarity_matrix = cosine_similarity(embeddings_traindata_2d, embeddings_newdata_2d)

# Get the average similarity
avg_similarity = np.mean(similarity_matrix)
print("Average similarity:", avg_similarity)


(1024,)
Average similarity: 0.25399673
