In [None]:
from transformers import RobertaModel, RobertaTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [None]:
# The MELD dataset is available at https://affective-meld.github.io/
# Replace the three paths below with the location of your processed splits.
df_train_final, df_dev_final, df_test_final = (
    pd.read_csv(split_csv)
    for split_csv in (
        r"C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/v2/train.csv",
        r"C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/v2/dev.csv",
        r"C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/v2/test.csv",
    )
)

In [None]:
print(df_train_final.shape, df_dev_final.shape,df_test_final.shape)

In [None]:
df_train_final

In [None]:
# Encode the sentiment labels as integer categories: negative=0, neutral=1, positive=2.
encode_map = {'negative': 0, 'neutral': 1, 'positive': 2}
# Assign the result back explicitly. Calling `.replace(..., inplace=True)` on
# `df[col]` is chained assignment: it can act on a temporary object and leave the
# DataFrame untouched (always the case with pandas copy-on-write).
# `rename_categories` passes unknown categories through unchanged, so re-running
# this cell on already-encoded data is harmless.
df_train_final['sentiment'] = (
    df_train_final['sentiment']
    .astype('category')
    .cat.rename_categories(encode_map)
)

In [None]:
# Encode the sentiment labels as integer categories: negative=0, neutral=1, positive=2.
encode_map = {'negative': 0, 'neutral': 1, 'positive': 2}
# Assign the result back explicitly. Calling `.replace(..., inplace=True)` on
# `df[col]` is chained assignment: it can act on a temporary object and leave the
# DataFrame untouched (always the case with pandas copy-on-write).
# `rename_categories` passes unknown categories through unchanged, so re-running
# this cell on already-encoded data is harmless.
df_dev_final['sentiment'] = (
    df_dev_final['sentiment']
    .astype('category')
    .cat.rename_categories(encode_map)
)

In [None]:
# Encode the sentiment labels as integer categories: negative=0, neutral=1, positive=2.
encode_map = {'negative': 0, 'neutral': 1, 'positive': 2}
# Assign the result back explicitly. Calling `.replace(..., inplace=True)` on
# `df[col]` is chained assignment: it can act on a temporary object and leave the
# DataFrame untouched (always the case with pandas copy-on-write).
# `rename_categories` passes unknown categories through unchanged, so re-running
# this cell on already-encoded data is harmless.
df_test_final['sentiment'] = (
    df_test_final['sentiment']
    .astype('category')
    .cat.rename_categories(encode_map)
)

In [None]:
# Expose the `name` column as `file_ID` in every split; the Dataset class reads
# the identifier through that attribute.
df_train_final, df_dev_final, df_test_final = (
    split_frame.rename(columns={"name": "file_ID"})
    for split_frame in (df_train_final, df_dev_final, df_test_final)
)

In [None]:
# Stack the three splits (train, dev, test - in that order) into a single frame.
frames = [df_train_final, df_dev_final, df_test_final]
combine = pd.concat(frames)
# concat keeps each split's own row index; replace it with a fresh 0..n-1 index.
all_data = combine.reset_index(drop=True)

In [None]:
len(all_data)

In [None]:
all_data.head(5) #neg=0, neu=1, pos=2

In [None]:
# Name of the label column, looked up by position (third column of `all_data`).
# NOTE(review): presumably this resolves to 'sentiment' - confirm against the CSV
# column order; a positional lookup picks a different column if the CSV layout changes.
LABEL_COLUMNS = all_data.columns.tolist()[2]

In [None]:
MAX_TOKEN_COUNT = 60

In [None]:
# Checkpoint name used for the tokenizer here and for RobertaModel.from_pretrained
# in TextEmbeddingModel, so both load the same pretrained checkpoint.
MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

### Tokenization 

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one utterance per DataFrame row.

    The base class is referenced by its fully qualified name because this class
    reuses the name ``Dataset``: written as ``class Dataset(Dataset)``, every
    re-run of the cell would subclass the previous definition of this class
    instead of the PyTorch base class.

    Args:
        data: frame providing ``transcription`` and ``file_ID`` columns plus the
            column named by the module-level ``LABEL_COLUMNS``.
        tokenizer: tokenizer used to encode each utterance.
        max_token_len: every encoding is padded or truncated to this many tokens.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: RobertaTokenizer,
        max_token_len: int = 60
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize and return the sample at positional ``index``.

        Returns:
            dict with keys ``Utterance`` (raw text), ``input_ids`` and
            ``attention_mask`` (1-D tensors of length ``max_token_len``),
            ``labels`` (value of the label column) and ``FileID``.
        """
        data_row = self.data.iloc[index]

        utterance = data_row.transcription
        labels = data_row[LABEL_COLUMNS]
        file_id = data_row.file_ID

        # Call the tokenizer directly; `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments for a single text.
        encoding = self.tokenizer(
            utterance,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of 1; flatten it
        # away so a DataLoader can stack samples into (batch, seq_len).
        return dict(
            Utterance=utterance,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=labels,
            FileID=file_id,
        )

### Data loaders 

In [None]:
def get_dataloaders(batch_size, max_token_len=60):
    """Build DataLoaders for the train, dev and test splits.

    Args:
        batch_size: number of samples per batch, used for all three loaders.
        max_token_len: padded/truncated token length passed to each Dataset.

    Returns:
        Tuple ``(train_loader, dev_loader, test_loader)``.
    """

    def _make_loader(frame, shuffle):
        # One construction path for all splits so their settings cannot drift apart.
        return DataLoader(
            Dataset(frame, tokenizer, max_token_len),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=0,
        )

    # Only the training split is shuffled. The evaluation splits keep file order
    # so that runs are repeatable and outputs line up with the source rows.
    train_loader = _make_loader(df_train_final, shuffle=True)
    dev_loader = _make_loader(df_dev_final, shuffle=False)
    test_loader = _make_loader(df_test_final, shuffle=False)

    return train_loader, dev_loader, test_loader

### Embedding model 

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

In [None]:
# Explicit CPU device handle.
# NOTE(review): not referenced by any executable line visible in this notebook.
cpu_Device = torch.device("cpu")

In [None]:
class TextEmbeddingModel(nn.Module):
    """Sentence embedder built on a pretrained RoBERTa encoder.

    Each token is represented by the concatenation of the encoder's last four
    hidden layers; the sentence embedding is the mean of those token vectors
    over the real (non-padding) tokens.
    """

    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(MODEL_NAME, return_dict=True, output_hidden_states=True).to(device)

    def forward(self, input_ids, attention_mask, labels=None):
        """Embed a batch of tokenized sequences.

        Args:
            input_ids: token ids; the pooling below treats dim 1 as the token
                axis, i.e. a (batch, seq_len) layout is assumed.
            attention_mask: mask passed to the encoder and used to weight the
                mean (non-zero marks a real token); same layout as ``input_ids``.
                If ``None``, all positions are averaged.
            labels: unused; accepted so callers may pass it.

        Returns:
            Tensor with one embedding row per input sequence; the feature size
            is four times the encoder's hidden size.
        """
        output = self.roberta(input_ids, attention_mask=attention_mask)

        # Access by name rather than output[2]: positional indexing of a model
        # output depends on which optional fields happen to be populated.
        hidden_states = output.hidden_states
        # Last four layers, last layer first, joined along the feature axis.
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        cat_hidden_states = torch.cat(last_four_layers, dim=-1)

        if attention_mask is None:
            return torch.mean(cat_hidden_states, dim=1)

        # Inputs are padded to a fixed length, so a plain mean over the token
        # axis would average in the padding positions. Weight by the mask so
        # only real tokens contribute.
        token_mask = attention_mask.unsqueeze(-1).to(cat_hidden_states.dtype)
        token_sum = (cat_hidden_states * token_mask).sum(dim=1)
        # Guard against division by zero for an all-padding row.
        token_count = token_mask.sum(dim=1).clamp(min=1.0)
        return token_sum / token_count

# The model is only used for feature extraction: make inference mode explicit
# (eval() returns the module itself).
model = TextEmbeddingModel().eval()

### Extracting train embeddings 

In [None]:
train_dataset = Dataset(df_train_final,tokenizer,max_token_len=MAX_TOKEN_COUNT)

In [None]:
counter = 0
embeddings = {"embeddings": [], "labels": [], "fileID": []}

# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for sample in train_dataset:
        counter += 1
        # The dataset yields unbatched tensors; add a batch dimension of 1.
        input_ids = torch.unsqueeze(sample["input_ids"], 0).to(device)
        attention_mask = torch.unsqueeze(sample["attention_mask"], 0).to(device)
        labels = sample['labels']
        fileID = sample['FileID']

        results = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep a CPU copy. Tensor.to()/.cpu() return a new tensor instead of
        # moving in place, so the result has to be captured here; otherwise every
        # embedding stays on the compute device and the saved file can only be
        # loaded where that device exists.
        embeddings["embeddings"].append(results.cpu())
        embeddings["labels"].append(labels)
        embeddings["fileID"].append(fileID)

In [None]:
counter

In [None]:
embeddings

#### Saving and loading tensors 

In [None]:
# Persist the train-split dict (embeddings, labels, fileID) built by the loop above.
# NOTE(review): train.pt and train_lfl.pt (saved further down) are both filled by
# the same `model`, so a fresh top-to-bottom run writes the same embeddings to
# both - confirm whether this file was meant to hold a different representation.
PATH = "C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/embeddings_v2/train.pt"
torch.save(embeddings, PATH)

In [None]:
embgs = torch.load(PATH)

In [None]:
embgs

### Extracting dev embeddings 

In [None]:
dev_dataset = Dataset(df_dev_final,tokenizer,max_token_len=MAX_TOKEN_COUNT)

In [None]:
counter = 0
embeddings = {"embeddings": [], "labels": [], "fileID": []}

# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for sample in dev_dataset:
        counter += 1
        # The dataset yields unbatched tensors; add a batch dimension of 1.
        input_ids = torch.unsqueeze(sample["input_ids"], 0).to(device)
        attention_mask = torch.unsqueeze(sample["attention_mask"], 0).to(device)
        labels = sample['labels']
        fileID = sample['FileID']

        results = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep a CPU copy. Tensor.cpu() returns a new tensor instead of moving in
        # place; without it every embedding stays on the compute device and the
        # saved file can only be loaded where that device exists.
        embeddings["embeddings"].append(results.cpu())
        embeddings["labels"].append(labels)
        embeddings["fileID"].append(fileID)

In [None]:
counter

In [None]:
embeddings

#### Saving and loading tensors

In [None]:
# Persist the dev-split dict (embeddings, labels, fileID) built by the loop above.
# NOTE(review): dev.pt and dev_lfl.pt (saved further down) are both filled by the
# same `model`, so a fresh top-to-bottom run writes the same embeddings to both -
# confirm whether this file was meant to hold a different representation.
PATH = "C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/embeddings_v2/dev.pt"
torch.save(embeddings, PATH)

In [None]:
embgs = torch.load(PATH)

In [None]:
embgs

### Extracting test embeddings 

In [None]:
test_dataset = Dataset(df_test_final,tokenizer,max_token_len=MAX_TOKEN_COUNT)

In [None]:
counter = 0
embeddings = {"embeddings": [], "labels": [], "fileID": []}

# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for sample in test_dataset:
        counter += 1
        # The dataset yields unbatched tensors; add a batch dimension of 1.
        input_ids = torch.unsqueeze(sample["input_ids"], 0).to(device)
        attention_mask = torch.unsqueeze(sample["attention_mask"], 0).to(device)
        labels = sample['labels']
        fileID = sample['FileID']

        results = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep a CPU copy. Tensor.cpu() returns a new tensor instead of moving in
        # place; without it every embedding stays on the compute device and the
        # saved file can only be loaded where that device exists.
        embeddings["embeddings"].append(results.cpu())
        embeddings["labels"].append(labels)
        embeddings["fileID"].append(fileID)

In [None]:
counter

In [None]:
embeddings

#### Saving and loading tensors 

In [None]:
# Persist the test-split dict (embeddings, labels, fileID) built by the loop above.
# NOTE(review): test.pt and test_lfl.pt (saved further down) are both filled by
# the same `model`, so a fresh top-to-bottom run writes the same embeddings to
# both - confirm whether this file was meant to hold a different representation.
PATH = "C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/embeddings_v2/test.pt"
torch.save(embeddings, PATH)

In [None]:
embgs = torch.load(PATH)

In [None]:
embgs

### Example: 

In [None]:
# Despite its name, `encoding` is a Dataset instance over the train split;
# indexing it tokenizes that row and returns the sample dict.
encoding = Dataset(
  df_train_final,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

encoding[0]

In [None]:
# Each `encoding[0]` call re-tokenizes row 0. `input_ids` is flattened by the
# Dataset, so this is a 1-D tensor padded/truncated to max_token_len.
print(encoding[0]["input_ids"].size())
print(type(encoding[0]["input_ids"]))

In [None]:
# Add a leading batch dimension of size 1 and move both tensors to `device`,
# where the encoder was placed.
input_ids = torch.unsqueeze(encoding[0]["input_ids"],0).to(device)
attention_mask = torch.unsqueeze(encoding[0]["attention_mask"],0).to(device)

In [None]:
input_ids.size()

In [None]:
# `model` returns the pooled sentence-embedding tensor (not the raw encoder
# output), so len() below reports the size of its first dimension.
# Note: this rebinds `embeddings`, which the extraction cells use for a dict.
embeddings = model(input_ids=input_ids, attention_mask=attention_mask)
print(embeddings)
print(type(embeddings))
print(len(embeddings))
#print(embeddings.size())

### Last four layers embeddings example:

In [None]:
# `embeddings` from the previous cell is the pooled tensor returned by
# TextEmbeddingModel.forward, so `embeddings[2]` would index its first dimension
# (of size 1 here) and raise IndexError. Query the wrapped encoder directly to
# obtain the tuple of per-layer hidden states.
with torch.no_grad():
    hidden_states = model.roberta(input_ids, attention_mask=attention_mask).hidden_states
print(len(hidden_states))

In [None]:
# Pick the four deepest layers, last layer first.
last_four_layers = [hidden_states[layer_idx] for layer_idx in (-1, -2, -3, -4)]
# Join them along the feature axis so each token gets one long vector.
cat_hidden_states = torch.cat(last_four_layers, dim=-1)
print(cat_hidden_states.size())

# Average across the token axis, then drop the remaining size-1 dimensions.
cat_sentence_embedding = cat_hidden_states.mean(dim=1).squeeze()
print(cat_sentence_embedding)
print(cat_sentence_embedding.size())

### Extracting train embeddings (last four layers)

In [None]:
train_dataset = Dataset(df_train_final,tokenizer,max_token_len=MAX_TOKEN_COUNT)

In [None]:
counter = 0
embeddings = {"embeddings": [], "labels": [], "fileID": []}

# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for sample in train_dataset:
        counter += 1
        # The dataset yields unbatched tensors; add a batch dimension of 1.
        input_ids = torch.unsqueeze(sample["input_ids"], 0).to(device)
        attention_mask = torch.unsqueeze(sample["attention_mask"], 0).to(device)
        labels = sample['labels']
        fileID = sample['FileID']

        results = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep a CPU copy. Tensor.cpu() returns a new tensor instead of moving in
        # place; without it every embedding stays on the compute device and the
        # saved file can only be loaded where that device exists.
        embeddings["embeddings"].append(results.cpu())
        embeddings["labels"].append(labels)
        embeddings["fileID"].append(fileID)

In [None]:
print(embeddings["embeddings"][0].size())

In [None]:
counter

In [None]:
print(embeddings)

#### Saving and loading tensors 

In [None]:
# Persist the train-split dict (embeddings, labels, fileID) built by the loop above.
# PATH is rebound in every save cell, so the following torch.load reads this file.
PATH = "C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/embeddings_v2/train_lfl.pt"
torch.save(embeddings, PATH)

In [None]:
embgs = torch.load(PATH)

### Extracting dev embeddings (last four layers)

In [None]:
dev_dataset = Dataset(df_dev_final,tokenizer,max_token_len=MAX_TOKEN_COUNT)

In [None]:
counter = 0
embeddings = {"embeddings": [], "labels": [], "fileID": []}

# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for sample in dev_dataset:
        counter += 1
        # The dataset yields unbatched tensors; add a batch dimension of 1.
        input_ids = torch.unsqueeze(sample["input_ids"], 0).to(device)
        attention_mask = torch.unsqueeze(sample["attention_mask"], 0).to(device)
        labels = sample['labels']
        fileID = sample['FileID']

        results = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep a CPU copy. Tensor.cpu() returns a new tensor instead of moving in
        # place; without it every embedding stays on the compute device and the
        # saved file can only be loaded where that device exists.
        embeddings["embeddings"].append(results.cpu())
        embeddings["labels"].append(labels)
        embeddings["fileID"].append(fileID)

In [None]:
print(embeddings["embeddings"][0].size())

In [None]:
counter

In [None]:
print(embeddings)

#### Saving and loading tensors 

In [None]:
# Persist the dev-split dict (embeddings, labels, fileID) built by the loop above.
# PATH is rebound in every save cell, so the following torch.load reads this file.
PATH = "C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/embeddings_v2/dev_lfl.pt"
torch.save(embeddings, PATH)

In [None]:
embgs = torch.load(PATH)

### Extracting test embeddings (last four layers)

In [None]:
test_dataset = Dataset(df_test_final,tokenizer,max_token_len=MAX_TOKEN_COUNT)

In [None]:
counter = 0
embeddings = {"embeddings": [], "labels": [], "fileID": []}

# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for sample in test_dataset:
        counter += 1
        # The dataset yields unbatched tensors; add a batch dimension of 1.
        input_ids = torch.unsqueeze(sample["input_ids"], 0).to(device)
        attention_mask = torch.unsqueeze(sample["attention_mask"], 0).to(device)
        labels = sample['labels']
        fileID = sample['FileID']

        results = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep a CPU copy. Tensor.cpu() returns a new tensor instead of moving in
        # place; without it every embedding stays on the compute device and the
        # saved file can only be loaded where that device exists.
        embeddings["embeddings"].append(results.cpu())
        embeddings["labels"].append(labels)
        embeddings["fileID"].append(fileID)

In [None]:
print(embeddings["embeddings"][0].size())

In [None]:
counter

In [None]:
print(embeddings)

#### Saving and loading tensors 

In [None]:
# Persist the test-split dict (embeddings, labels, fileID) built by the loop above.
# PATH is rebound in every save cell, so the following torch.load reads this file.
PATH = "C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/embeddings_v2/test_lfl.pt"
torch.save(embeddings, PATH)

In [None]:
embgs = torch.load(PATH)