# Reading data

In [1]:
import pandas as pd

file_path = "mustard++/dataframe.csv"

df = pd.read_csv(file_path)
df = df.iloc[:, :-5]

df.head(20)

Unnamed: 0,SCENE,KEY,SENTENCE,END_TIME,SPEAKER,SHOW,Sarcasm
0,1_10004,1_10004_c_00,"Well, I'm sure that, uh, you...\nhave a lot of...",0:06,PERSON,BBT,
1,1_10004,1_10004_c_01,Who was he?,0:08,SHELDON,BBT,
2,1_10004,1_10004_c_02,His name is Ron.\nI met him at my prayer group.,0:12,PERSON,BBT,
3,1_10004,1_10004_c_03,How long have you been involved with him?,0:14,SHELDON,BBT,
4,1_10004,1_10004_c_04,A few months.,0:16,PERSON,BBT,
5,1_10004,1_10004_u,"And of those few months, how long have you bee...",0:07,SHELDON,BBT,0.0
6,1_10009,1_10009_c_00,"FYI, we plan on selling out the human race hard.",0:02,AMY,BBT,
7,1_10009,1_10009_c_01,"In 20 years, who knows what'll happen with any...",0:08,PENNY,BBT,
8,1_10009,1_10009_c_02,I think you and Leonard will be together.,0:1,PERSON,BBT,
9,1_10009,1_10009_c_03,You do?,0:11,PENNY,BBT,


# Preprocess data
Creating a dataframe with two columns such that one column is concatenated context and another column is the utterance text.

In [2]:
import pandas as pd

df = df.sort_values(['SCENE', 'KEY'])

df['SPEAKER_SENTENCE'] = df['SPEAKER'] + ': ' + df['SENTENCE']

# Create a new dataframe for the transformed data
df_new = pd.DataFrame(columns = ['SCENE', 'SENTENCE_A', 'SENTENCE_B', 'Sarcasm'])

prev_scene = ''
sentence_a = ''
sentence_b = ''
sarcasm_label = None

for index, row in df.iterrows():
    if row['SCENE'] != prev_scene:
        # new conversation starts
        sentence_a = '[CLS] ' + row['SPEAKER_SENTENCE']
        sentence_b = ''
    elif 'c' in row['KEY']:
        # same conversation, add to context
        sentence_a += ' [SEP] ' + row['SPEAKER_SENTENCE']
    elif 'u' in row['KEY']:
        # it's the utterance sentence
        sentence_b = ' [SEP] ' + row['SPEAKER_SENTENCE'] + ' [SEP]'
        sarcasm_label = row['Sarcasm']

        df_temp = pd.DataFrame([[row['SCENE'], sentence_a, sentence_b, sarcasm_label]], columns=['SCENE', 'SENTENCE_A', 'SENTENCE_B', 'Sarcasm'])
        df_new = pd.concat([df_new, df_temp], ignore_index=True)
        # Resetting the context and utterance for the next scene
        sentence_a = ''
        sentence_b = ''
        
    prev_scene = row['SCENE']

df_new["SENTENCE_A"] = df_new["SENTENCE_A"].str.replace('\n', '')
df_new["SENTENCE_B"] = df_new["SENTENCE_B"].str.replace('\n', '')

# print new dataframe
df_new

Unnamed: 0,SCENE,SENTENCE_A,SENTENCE_B,Sarcasm
0,1_10004,"[CLS] PERSON: Well, I'm sure that, uh, you...h...","[SEP] SHELDON: And of those few months, how l...",0.0
1,1_10009,"[CLS] AMY: FYI, we plan on selling out the hum...","[SEP] PENNY: Let the dead man talk. So, why d...",0.0
2,1_1001,[CLS] SHELDON: Or maybe she just doesn't want ...,"[SEP] RAJ: What else? Sell it on eBay as ""sli...",0.0
3,1_1003,[CLS] HOWARD: It's smashed beyond repair. What...,"[SEP] HOWARD: Good idea, sit with her. Hold h...",1.0
4,1_10190,[CLS] PENNY: it's important to the story that ...,"[SEP] SHELDON: Well, now that I've given up s...",0.0
...,...,...,...,...
1197,3_S06E02_398,[CLS] -: There's a reason Jared tried to ditch...,"[SEP] OTHER: Look, we cannot take blood money...",0.0
1198,3_S06E03_366,"[CLS] RICHARD: You guys, there's really no oth...",[SEP] RICHARD: The-the same way we can buy Am...,1.0
1199,3_S06E05_355,"[CLS] MONICA: -What? -Sorry. I just, um, [SEP]...","[SEP] OTHER: Well, maybe some time when you'r...",1.0
1200,3_S06E06_143,[CLS] GILFOYLE: Based on the amount of work le...,[SEP] GILFOYLE: I thought that was the compan...,1.0


# BERT Embedding

In [3]:
from transformers import BertModel, BertTokenizer
import torch

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-cased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )
model.eval()


def get_embedding(sentence_A, sentence_B, model, tokenizer, device):
    marked_text = sentence_A + sentence_B

    # Tokenize our sentence with the BERT tokenizer.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each token as belonging to sentence "1" or "0".
    segments_ids = [1 if token == "[SEP]" else 0 for token in tokenized_text]

    # Convert inputs to PyTorch tensors and send them to the same device as the model
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # Predict hidden states features for each layer
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # For each token in the sentence...
    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from tqdm import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Create an empty DataFrame for the embeddings
embedding_df = pd.DataFrame()

# Iterate over each row in the DataFrame and add embeddings as rows to the new DataFrame
for i, row in tqdm(df_new.iterrows(), total=df_new.shape[0]):
    sentence_A = row['SENTENCE_A']
    sentence_B = row['SENTENCE_B']
    
    embeddings = get_embedding(sentence_A, sentence_B, model, tokenizer, device)
    
    embedding_df = pd.concat([embedding_df, pd.DataFrame([embeddings.cpu().numpy()])], ignore_index=True)

# Rename the columns of the embedding DataFrame
embedding_df.columns = [f'Embedding_{i}' for i in range(embedding_df.shape[1])]

# Concatenate the embedding DataFrame with the original DataFrame
df_new = pd.concat([df_new, embedding_df], axis=1)

100%|██████████████████████████████████████████████████████████████████████████████| 1202/1202 [00:28<00:00, 42.23it/s]


In [8]:
# drop non necessary features
df_new = df_new.drop(['SENTENCE_A', 'SENTENCE_B', 'Sarcasm'], axis=1)

In [10]:
# save dataframe
df_new.to_csv('text_features_BERT.csv', index=False)

In [11]:
df_new.head()

Unnamed: 0,SCENE,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,Embedding_7,Embedding_8,...,Embedding_758,Embedding_759,Embedding_760,Embedding_761,Embedding_762,Embedding_763,Embedding_764,Embedding_765,Embedding_766,Embedding_767
0,1_10004,0.400577,-0.735008,-0.09615,-0.285756,-0.357918,-0.051856,1.045135,-0.481453,-0.091696,...,0.231023,0.676469,0.125527,-0.045216,-0.050516,-0.035607,-0.009513,-0.084522,0.603833,0.59191
1,1_10009,0.387123,-0.837304,-0.0779,-0.247002,-0.358525,-0.08953,1.096665,-0.495899,-0.07485,...,0.299691,0.699883,0.094027,-0.067608,-0.036613,-0.093318,0.002613,-0.112674,0.624801,0.613933
2,1_1001,0.376343,-0.645923,-0.120487,-0.267022,-0.319736,-0.05079,0.99666,-0.409754,-0.078868,...,0.267513,0.632195,0.140556,-0.138602,-0.057374,-0.059396,-0.009998,-0.095549,0.570483,0.570835
3,1_1003,0.366896,-0.670176,-0.104165,-0.3006,-0.300686,-0.042529,0.994088,-0.427367,-0.069317,...,0.2546,0.61026,0.107198,-0.092323,-0.042515,-0.088271,-0.008831,-0.11185,0.573721,0.568279
4,1_10190,0.40507,-0.688916,-0.153265,-0.233106,-0.311845,-0.056022,0.998371,-0.423859,-0.054147,...,0.254542,0.631909,0.132351,-0.105029,-0.070631,-0.131713,-0.016279,-0.082275,0.584081,0.618788
